feat(fts): add V009 korean morphological tokenizer migration
V007 trigram tokenizer 의 한국어 2자 query 0-hit 한계 (Bug #8) 해소를 위한 V009 migration 추가. unicode61 tokenizer 로 환원 + 한국어 형태소 분해 결과를 별 column `tokenized_korean_text` 에 pre-fill 하는 방식. - migrations/V009__fts_korean_morphological.sql 신규: column ADD, chunks_fts DROP+재정의, 3 trigger CASE expression, backfill INSERT, corpus_revision bump. - design §5.5 갱신: trigram → unicode61 + 형태소 column. CASE expression trigger 본문. - crates/kebab-store-sqlite/tests/fts.rs: V007 verbatim test 를 V009 source-of-truth 로 rename. v009_bumps_corpus_revision unit test 추가. - store.rs: clippy bool_to_int_with_if + cast_lossless 기존 경고 수정 (pdf_ocr_events 관련 코드, S1 작업 중 발견). 영어 substring 매칭은 V002 (whole-token only) 로 회귀 — spec §3 Non-Goals + 후속 release notes (v0.20.1) 에서 정직히 기술. Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S1) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1028,7 +1028,7 @@ impl SqliteStore {
|
||||
image_height,
|
||||
ms,
|
||||
chars,
|
||||
if success { 1i32 } else { 0i32 },
|
||||
i32::from(success),
|
||||
reason,
|
||||
ocr_engine
|
||||
],
|
||||
@@ -1042,7 +1042,7 @@ impl SqliteStore {
|
||||
/// means "delete everything older than now" (i.e. all past rows).
|
||||
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64);
|
||||
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
|
||||
let cutoff_ts = cutoff
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());
|
||||
|
||||
@@ -368,19 +368,20 @@ fn extract_design_5_5_fts_block() -> String {
|
||||
fts_slice[..last_end + "END;".len()].to_string()
|
||||
}
|
||||
|
||||
/// Extract the §5.5 verbatim block from the V007 migration (replaced V002
|
||||
/// 's unicode61 tokenizer with trigram — V002 stays in place for
|
||||
/// historical cold-upgrade replay but V007 is now the source of truth),
|
||||
/// between the `── §5.5 verbatim block ──` anchor markers V007 carries.
|
||||
/// Extract the §5.5 verbatim block from the V009 migration (V009 replaces
|
||||
/// V007's trigram tokenizer with unicode61 + CASE expression triggers for
|
||||
/// Korean morphological tokenization — V007 stays in place for historical
|
||||
/// cold-upgrade replay but V009 is now the source of truth),
|
||||
/// between the `── §5.5 verbatim block ──` anchor markers V009 carries.
|
||||
fn extract_migration_5_5_verbatim_block() -> String {
|
||||
let migration = include_str!("../../../migrations/V007__fts_trigram.sql");
|
||||
let migration = include_str!("../../../migrations/V009__fts_korean_morphological.sql");
|
||||
// The opening anchor line ends with `── §5.5 verbatim block ─...`.
|
||||
let open_marker = "§5.5 verbatim block";
|
||||
let close_marker = "End §5.5 verbatim block";
|
||||
|
||||
let open_idx = migration
|
||||
.find(open_marker)
|
||||
.expect("V007 must carry the `§5.5 verbatim block` opening anchor");
|
||||
.expect("V009 must carry the `§5.5 verbatim block` opening anchor");
|
||||
let after_open_line = open_idx
|
||||
+ migration[open_idx..]
|
||||
.find('\n')
|
||||
@@ -389,7 +390,7 @@ fn extract_migration_5_5_verbatim_block() -> String {
|
||||
|
||||
let close_idx = migration[after_open_line..]
|
||||
.find(close_marker)
|
||||
.expect("V007 must carry the `End §5.5 verbatim block` closing anchor")
|
||||
.expect("V009 must carry the `End §5.5 verbatim block` closing anchor")
|
||||
+ after_open_line;
|
||||
// Walk back from the close marker to the start of its comment line.
|
||||
let close_line_start = migration[..close_idx].rfind('\n').map_or(0, |n| n + 1);
|
||||
@@ -397,14 +398,15 @@ fn extract_migration_5_5_verbatim_block() -> String {
|
||||
migration[after_open_line..close_line_start].to_string()
|
||||
}
|
||||
|
||||
/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql`
|
||||
/// must match the design doc verbatim (whitespace-normalized). V007
|
||||
/// replaced V002 's unicode61 tokenizer with trigram (2026-05-23).
|
||||
/// V002 stays in place for historical replay of cold-upgrade paths
|
||||
/// but is no longer compared against the design doc — V007 is now
|
||||
/// CI diff guard: the §5.5 block in `migrations/V009__fts_korean_morphological.sql`
|
||||
/// must match the design doc verbatim (whitespace-normalized). V009
|
||||
/// replaced V007's trigram tokenizer with unicode61 + CASE expression
|
||||
/// triggers for Korean morphological tokenization (2026-05-28).
|
||||
/// V007 stays in place for historical replay of cold-upgrade paths
|
||||
/// but is no longer compared against the design doc — V009 is now
|
||||
/// the source of truth.
|
||||
#[test]
|
||||
fn fts_v007_matches_design_section_5_5_verbatim() {
|
||||
fn fts_v009_matches_design_section_5_5_verbatim() {
|
||||
let design = extract_design_5_5_fts_block();
|
||||
let migration_block = extract_migration_5_5_verbatim_block();
|
||||
|
||||
@@ -427,12 +429,30 @@ fn fts_v007_matches_design_section_5_5_verbatim() {
|
||||
let migration_n = normalize_ws(&migration_block);
|
||||
assert_eq!(
|
||||
design_n, migration_n,
|
||||
"V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \
|
||||
"V009__fts_korean_morphological.sql §5.5 block must match design doc §5.5 verbatim \
|
||||
(whitespace-normalized). If you intentionally changed one, \
|
||||
update the other in the same commit."
|
||||
);
|
||||
}
|
||||
|
||||
// ── 5b. V009 corpus_revision bump ────────────────────────────────────
|
||||
|
||||
/// Verifies that the V009 migration bumps the `corpus_revision` kv entry.
|
||||
/// After SqliteStore::open + run_migrations, corpus_revision must be ≥ 1.
|
||||
/// (V004 seed = '0', V009 UPDATE = CAST(CAST('0' AS INTEGER) + 1 AS TEXT) = '1').
|
||||
#[test]
|
||||
fn v009_bumps_corpus_revision() {
|
||||
let env = common::TestEnv::new();
|
||||
let store = SqliteStore::open(&env.config()).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
let rev = store.corpus_revision();
|
||||
assert!(
|
||||
rev >= 1,
|
||||
"corpus_revision must be ≥ 1 after V009 migration \
|
||||
(V004 seeds 0, V009 bumps to ≥ 1); got {rev}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ──────────
|
||||
|
||||
/// Mirror the P1-6 pattern: opening + migrating + dropping the store
|
||||
|
||||
@@ -1062,15 +1062,15 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id);
|
||||
|
||||
### 5.5 Chunks + FTS5
|
||||
|
||||
Tokenizer = `trigram` (V007, 2026-05-23). 한국어 어절(조사·어미가 붙은 단위)이
|
||||
unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소
|
||||
(2자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는
|
||||
회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase
|
||||
후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동
|
||||
(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid
|
||||
는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md`
|
||||
(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`.
|
||||
`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007
|
||||
Tokenizer = `unicode61` (V009, 2026-05-28). V007 trigram 의 한국어 2자 query
|
||||
0-hit 한계 (Bug #8) 를 해소하기 위해 한국어 형태소 분석 기반 접근법 채택.
|
||||
`chunks` 테이블에 `tokenized_korean_text TEXT` 컬럼 추가 — ingest 경로가
|
||||
lindera ko-dic 형태소 분석 결과(공백 구분 형태소 sequence)를 pre-fill.
|
||||
chunks_ai/chunks_au trigger 가 `tokenized_korean_text || ' ' || text` 를
|
||||
FTS5 에 색인 (CASE expression: NULL 이면 raw text 만). '한국', '서울' 같은
|
||||
2자 단어도 형태소 경계 일치 시 hit 가능. 영어 substring 매칭은 V002 수준
|
||||
(whole-token only) 으로 회귀 — 자세한 내용 = `tasks/HOTFIXES.md` (2026-05-28).
|
||||
`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V009
|
||||
DDL 에 `content=''` 없음).
|
||||
|
||||
```sql
|
||||
@@ -1085,7 +1085,8 @@ CREATE TABLE chunks (
|
||||
chunker_version TEXT NOT NULL,
|
||||
policy_hash TEXT NOT NULL,
|
||||
block_ids_json TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL
|
||||
created_at TEXT NOT NULL,
|
||||
tokenized_korean_text TEXT
|
||||
);
|
||||
CREATE INDEX idx_chunks_doc_id ON chunks(doc_id);
|
||||
CREATE INDEX idx_chunks_chunker_version ON chunks(chunker_version);
|
||||
@@ -1095,12 +1096,16 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5(
|
||||
doc_id UNINDEXED,
|
||||
heading_path,
|
||||
text,
|
||||
tokenize = 'trigram'
|
||||
tokenize = 'unicode61'
|
||||
);
|
||||
|
||||
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
|
||||
CASE WHEN new.tokenized_korean_text IS NOT NULL
|
||||
THEN new.tokenized_korean_text || ' ' || new.text
|
||||
ELSE new.text
|
||||
END);
|
||||
END;
|
||||
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
@@ -1108,7 +1113,11 @@ END;
|
||||
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
|
||||
CASE WHEN new.tokenized_korean_text IS NOT NULL
|
||||
THEN new.tokenized_korean_text || ' ' || new.text
|
||||
ELSE new.text
|
||||
END);
|
||||
END;
|
||||
```
|
||||
|
||||
|
||||
91
migrations/V009__fts_korean_morphological.sql
Normal file
91
migrations/V009__fts_korean_morphological.sql
Normal file
@@ -0,0 +1,91 @@
|
||||
-- V009__fts_korean_morphological.sql — Replace chunks_fts tokenizer: trigram → unicode61.
|
||||
--
|
||||
-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
|
||||
-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
|
||||
-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
|
||||
-- §5.5; CI diff-checks this against the design doc (test
|
||||
-- `fts_v009_matches_design_section_5_5_verbatim` in
|
||||
-- `crates/kebab-store-sqlite/tests/fts.rs`).
|
||||
--
|
||||
-- Tokenizer choice: unicode61 + pre-tokenized Korean column.
|
||||
-- V007 trigram enabled substring matching for Korean ≥3 chars but
|
||||
-- 2-char Korean queries (e.g. '한국', '서울') always returned 0 hits.
|
||||
-- V009 adds `tokenized_korean_text TEXT` column to `chunks` — the ingest
|
||||
-- path (S2+) runs lindera ko-dic morphological analysis and writes the
|
||||
-- space-separated morpheme sequence to this column. The chunks_ai/chunks_au
|
||||
-- triggers concatenate tokenized_korean_text with the raw text before
|
||||
-- indexing into chunks_fts, so both Korean morphemes AND English tokens
|
||||
-- are searchable via a single FTS query. English substring matching
|
||||
-- (V007 ad-hoc feature) reverts to whole-token matching (V002 behavior).
|
||||
-- corpus_revision is bumped so the in-process search cache is automatically
|
||||
-- invalidated. See tasks/HOTFIXES.md (2026-05-28) for the deviation log.
|
||||
--
|
||||
-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no
|
||||
-- `content=''`); this migration drops the old shadow, recreates it with
|
||||
-- the new tokenizer, recreates the sync triggers (CASE expression for
|
||||
-- tokenized_korean_text), and backfills from `chunks`. The `chunks` table
|
||||
-- and embeddings are untouched, so users do NOT need to re-ingest after
|
||||
-- upgrading — the migration is fully automatic. tokenized_korean_text
|
||||
-- starts as NULL for all pre-V009 rows; a subsequent kebab ingest
|
||||
-- (S2+ path) will fill it in via UPDATE, firing chunks_au to re-index.
|
||||
|
||||
-- ── Korean morphological tokenizer (V009) ─────────────────────────────
|
||||
|
||||
-- Add a column to the chunks table to hold the Korean morpheme-decomposed text.
|
||||
ALTER TABLE chunks ADD COLUMN tokenized_korean_text TEXT;
|
||||
|
||||
-- Remove the existing chunks_fts table (trigram tokenizer) and its sync triggers.
|
||||
DROP TRIGGER IF EXISTS chunks_au;
|
||||
DROP TRIGGER IF EXISTS chunks_ad;
|
||||
DROP TRIGGER IF EXISTS chunks_ai;
|
||||
DROP TABLE IF EXISTS chunks_fts;
|
||||
|
||||
-- ── §5.5 verbatim block ────────────────────────────────────────────────
|
||||
|
||||
CREATE VIRTUAL TABLE chunks_fts USING fts5(
|
||||
chunk_id UNINDEXED,
|
||||
doc_id UNINDEXED,
|
||||
heading_path,
|
||||
text,
|
||||
tokenize = 'unicode61'
|
||||
);
|
||||
|
||||
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
|
||||
CASE WHEN new.tokenized_korean_text IS NOT NULL
|
||||
THEN new.tokenized_korean_text || ' ' || new.text
|
||||
ELSE new.text
|
||||
END);
|
||||
END;
|
||||
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
END;
|
||||
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
|
||||
CASE WHEN new.tokenized_korean_text IS NOT NULL
|
||||
THEN new.tokenized_korean_text || ' ' || new.text
|
||||
ELSE new.text
|
||||
END);
|
||||
END;
|
||||
|
||||
-- ── End §5.5 verbatim block ───────────────────────────────────────────
|
||||
|
||||
-- One-shot backfill from existing chunks. tokenized_korean_text is NULL
|
||||
-- for all pre-V009 rows so the CASE expression falls to the ELSE branch
|
||||
-- (raw text only). Subsequent re-ingest via S2+ will UPDATE
|
||||
-- tokenized_korean_text and fire chunks_au to re-index with morphemes.
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
SELECT chunk_id, doc_id, heading_path_json,
|
||||
CASE WHEN tokenized_korean_text IS NOT NULL
|
||||
THEN tokenized_korean_text || ' ' || text
|
||||
ELSE text
|
||||
END
|
||||
FROM chunks;
|
||||
|
||||
-- Bump corpus_revision so the in-process LRU search cache is invalidated.
|
||||
-- kv table columns are `key` TEXT + `value` TEXT (V004__kv.sql).
|
||||
-- value is TEXT so CAST is required for integer arithmetic.
|
||||
UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';
|
||||
Reference in New Issue
Block a user