feat(fts): add V009 korean morphological tokenizer migration

V007 trigram tokenizer 의 한국어 2자 query 0-hit 한계 (Bug #8) 해소를 위한 V009 migration 추가. unicode61 tokenizer 로 환원 + 한국어 형태소 분해 결과를 별도 column `tokenized_korean_text` 에 pre-fill 하는 방식.

- migrations/V009__fts_korean_morphological.sql 신규: column ADD, chunks_fts DROP+재정의, 3 trigger 재생성 (chunks_ai/chunks_au 는 CASE expression), backfill INSERT, corpus_revision bump.
- design §5.5 갱신: trigram → unicode61 + 형태소 column. CASE expression trigger 본문.
- crates/kebab-store-sqlite/tests/fts.rs: V007 verbatim test 를 V009 source-of-truth 로 rename. v009_bumps_corpus_revision unit test 추가.
- store.rs: clippy bool_to_int_with_if + cast_lossless 기존 경고 수정 (pdf_ocr_events 관련 코드, S1 작업 중 발견).

영어 substring 매칭은 V002 (whole-token only) 로 회귀 — spec §3 Non-Goals + 후속 release notes (v0.20.1) 에서 정직히 기술.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 09:48:46 +00:00
parent 43366b1b15
commit b106120e93
4 changed files with 149 additions and 29 deletions
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -1028,7 +1028,7 @@ impl SqliteStore {
                image_height,
                ms,
                chars,
-                if success { 1i32 } else { 0i32 },
+                i32::from(success),
                reason,
                ocr_engine
            ],
@@ -1042,7 +1042,7 @@ impl SqliteStore {
    /// means "delete everything older than now" (i.e. all past rows).
    pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
        use time::format_description::well_known::Rfc3339;
-        let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64);
+        let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
        let cutoff_ts = cutoff
            .format(&Rfc3339)
            .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());
--- a/crates/kebab-store-sqlite/tests/fts.rs
+++ b/crates/kebab-store-sqlite/tests/fts.rs
@@ -368,19 +368,20 @@ fn extract_design_5_5_fts_block() -> String {
    fts_slice[..last_end + "END;".len()].to_string()
 }

-/// Extract the §5.5 verbatim block from the V007 migration (replaced V002
-/// 's unicode61 tokenizer with trigram — V002 stays in place for
-/// historical cold-upgrade replay but V007 is now the source of truth),
-/// between the `── §5.5 verbatim block ──` anchor markers V007 carries.
+/// Extract the §5.5 verbatim block from the V009 migration (V009 replaces
+/// V007's trigram tokenizer with unicode61 + CASE expression triggers for
+/// Korean morphological tokenization — V007 stays in place for historical
+/// cold-upgrade replay but V009 is now the source of truth),
+/// between the `── §5.5 verbatim block ──` anchor markers V009 carries.
 fn extract_migration_5_5_verbatim_block() -> String {
-    let migration = include_str!("../../../migrations/V007__fts_trigram.sql");
+    let migration = include_str!("../../../migrations/V009__fts_korean_morphological.sql");
    // The opening anchor line ends with `── §5.5 verbatim block ─...`.
    let open_marker = "§5.5 verbatim block";
    let close_marker = "End §5.5 verbatim block";

    let open_idx = migration
        .find(open_marker)
-        .expect("V007 must carry the `§5.5 verbatim block` opening anchor");
+        .expect("V009 must carry the `§5.5 verbatim block` opening anchor");
    let after_open_line = open_idx
        + migration[open_idx..]
            .find('\n')
@@ -389,7 +390,7 @@ fn extract_migration_5_5_verbatim_block() -> String {

    let close_idx = migration[after_open_line..]
        .find(close_marker)
-        .expect("V007 must carry the `End §5.5 verbatim block` closing anchor")
+        .expect("V009 must carry the `End §5.5 verbatim block` closing anchor")
        + after_open_line;
    // Walk back from the close marker to the start of its comment line.
    let close_line_start = migration[..close_idx].rfind('\n').map_or(0, |n| n + 1);
@@ -397,14 +398,15 @@ fn extract_migration_5_5_verbatim_block() -> String {
    migration[after_open_line..close_line_start].to_string()
 }

-/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql`
-/// must match the design doc verbatim (whitespace-normalized). V007
-/// replaced V002 's unicode61 tokenizer with trigram (2026-05-23).
-/// V002 stays in place for historical replay of cold-upgrade paths
-/// but is no longer compared against the design doc — V007 is now
+/// CI diff guard: the §5.5 block in `migrations/V009__fts_korean_morphological.sql`
+/// must match the design doc verbatim (whitespace-normalized). V009
+/// replaced V007's trigram tokenizer with unicode61 + CASE expression
+/// triggers for Korean morphological tokenization (2026-05-28).
+/// V007 stays in place for historical replay of cold-upgrade paths
+/// but is no longer compared against the design doc — V009 is now
 /// the source of truth.
 #[test]
-fn fts_v007_matches_design_section_5_5_verbatim() {
+fn fts_v009_matches_design_section_5_5_verbatim() {
    let design = extract_design_5_5_fts_block();
    let migration_block = extract_migration_5_5_verbatim_block();

@@ -427,12 +429,30 @@ fn fts_v007_matches_design_section_5_5_verbatim() {
    let migration_n = normalize_ws(&migration_block);
    assert_eq!(
        design_n, migration_n,
-        "V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \
+        "V009__fts_korean_morphological.sql §5.5 block must match design doc §5.5 verbatim \
         (whitespace-normalized). If you intentionally changed one, \
         update the other in the same commit."
    );
 }

+// ── 5b. V009 corpus_revision bump ────────────────────────────────────
+
+/// Verifies that the V009 migration bumps the `corpus_revision` kv entry:
+/// after SqliteStore::open + run_migrations, corpus_revision must be ≥ 1
+/// (V004 seed = '0', V009 UPDATE = CAST(CAST('0' AS INTEGER) + 1 AS TEXT) = '1').
+#[test]
+fn v009_bumps_corpus_revision() {
+    let env = common::TestEnv::new();
+    let store = SqliteStore::open(&env.config()).unwrap();
+    store.run_migrations().unwrap();
+    let rev = store.corpus_revision();
+    assert!(
+        rev >= 1,
+        "corpus_revision must be ≥ 1 after V009 migration \
+         (V004 seeds 0, V009 bumps to ≥ 1); got {rev}"
+    );
+}
+
 // ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ──────────

 /// Mirror the P1-6 pattern: opening + migrating + dropping the store
--- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
+++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
@@ -1062,15 +1062,15 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id);

 ### 5.5 Chunks + FTS5

-Tokenizer = `trigram` (V007, 2026-05-23). 한국어 어절(조사·어미가 붙은 단위)이
-unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소
-(2자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는
-회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase
-후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동
-(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid
-는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md`
-(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`.
-`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007
+Tokenizer = `unicode61` (V009, 2026-05-28). V007 trigram 의 한국어 2자 query
+0-hit 한계 (Bug #8) 를 해소하기 위해 한국어 형태소 분석 기반 접근법 채택.
+`chunks` 테이블에 `tokenized_korean_text TEXT` 컬럼 추가 — ingest 경로가
+lindera ko-dic 형태소 분석 결과(공백 구분 형태소 sequence)를 pre-fill.
+chunks_ai/chunks_au trigger 가 `tokenized_korean_text || ' ' || text` 를
+FTS5 에 색인 (CASE expression: NULL 이면 raw text 만). '한국', '서울' 같은
+2자 단어도 형태소 경계 일치 시 hit 가능. 영어 substring 매칭은 V002 수준
+(whole-token only) 으로 회귀 — 자세한 내용 = `tasks/HOTFIXES.md` (2026-05-28).
+`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V009
 DDL 에 `content=''` 없음).

 ```sql
@@ -1085,7 +1085,8 @@ CREATE TABLE chunks (
  chunker_version   TEXT NOT NULL,
  policy_hash       TEXT NOT NULL,
  block_ids_json    TEXT NOT NULL,
-  created_at        TEXT NOT NULL
+  created_at        TEXT NOT NULL,
+  tokenized_korean_text TEXT
 );
 CREATE INDEX idx_chunks_doc_id          ON chunks(doc_id);
 CREATE INDEX idx_chunks_chunker_version ON chunks(chunker_version);
@@ -1095,12 +1096,16 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5(
  doc_id       UNINDEXED,
  heading_path,
  text,
-  tokenize = 'trigram'
+  tokenize = 'unicode61'
 );

 CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
-  VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
+  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
+          CASE WHEN new.tokenized_korean_text IS NOT NULL
+               THEN new.tokenized_korean_text || ' ' || new.text
+               ELSE new.text
+          END);
 END;
 CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
@@ -1108,7 +1113,11 @@ END;
 CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
-  VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
+  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
+          CASE WHEN new.tokenized_korean_text IS NOT NULL
+               THEN new.tokenized_korean_text || ' ' || new.text
+               ELSE new.text
+          END);
 END;
 ```

--- a/migrations/V009__fts_korean_morphological.sql
+++ b/migrations/V009__fts_korean_morphological.sql
@@ -0,0 +1,91 @@
+-- V009__fts_korean_morphological.sql — Replace chunks_fts tokenizer: trigram → unicode61.
+--
+-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
+-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
+-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
+-- §5.5; CI diff-checks this against the design doc (test
+-- `fts_v009_matches_design_section_5_5_verbatim` in
+-- `crates/kebab-store-sqlite/tests/fts.rs`).
+--
+-- Tokenizer choice: unicode61 + pre-tokenized Korean column.
+-- V007 trigram enabled substring matching for Korean ≥3 chars but
+-- 2-char Korean queries (e.g. '한국', '서울') always returned 0 hits.
+-- V009 adds `tokenized_korean_text TEXT` column to `chunks` — the ingest
+-- path (S2+) runs lindera ko-dic morphological analysis and writes the
+-- space-separated morpheme sequence to this column. The chunks_ai/chunks_au
+-- triggers concatenate tokenized_korean_text with the raw text before
+-- indexing into chunks_fts, so both Korean morphemes AND English tokens
+-- are searchable via a single FTS query. English substring matching
+-- (V007 ad-hoc feature) reverts to whole-token matching (V002 behavior).
+-- corpus_revision is bumped so the in-process search cache is automatically
+-- invalidated. See tasks/HOTFIXES.md (2026-05-28) for the deviation log.
+--
+-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no
+-- `content=''`); this migration drops the old shadow, recreates it with
+-- the new tokenizer, recreates the sync triggers (CASE expression for
+-- tokenized_korean_text), and backfills from `chunks`. Existing `chunks`
+-- rows and embeddings are untouched (only a nullable column is added), so
+-- the upgrade itself needs no re-ingest. tokenized_korean_text starts as
+-- NULL for all pre-V009 rows, which are indexed from raw text only until a
+-- subsequent kebab ingest (S2+ path) UPDATEs it, firing chunks_au to re-index.
+
+-- ── Korean morphological tokenizer (V009) ─────────────────────────────
+
+-- Add a column on `chunks` to store the morphologically decomposed Korean text.
+ALTER TABLE chunks ADD COLUMN tokenized_korean_text TEXT;
+
+-- Drop the existing chunks_fts (trigram tokenizer) together with its sync triggers.
+DROP TRIGGER IF EXISTS chunks_au;
+DROP TRIGGER IF EXISTS chunks_ad;
+DROP TRIGGER IF EXISTS chunks_ai;
+DROP TABLE IF EXISTS chunks_fts;
+
+-- ── §5.5 verbatim block ────────────────────────────────────────────────
+
+CREATE VIRTUAL TABLE chunks_fts USING fts5(
+  chunk_id     UNINDEXED,
+  doc_id       UNINDEXED,
+  heading_path,
+  text,
+  tokenize = 'unicode61'
+);
+
+CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
+  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
+  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
+          CASE WHEN new.tokenized_korean_text IS NOT NULL
+               THEN new.tokenized_korean_text || ' ' || new.text
+               ELSE new.text
+          END);
+END;
+CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
+  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
+END;
+CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
+  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
+  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
+  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
+          CASE WHEN new.tokenized_korean_text IS NOT NULL
+               THEN new.tokenized_korean_text || ' ' || new.text
+               ELSE new.text
+          END);
+END;
+
+-- ── End §5.5 verbatim block ───────────────────────────────────────────
+
+-- One-shot backfill from existing chunks. tokenized_korean_text is NULL
+-- for all pre-V009 rows so the CASE expression falls to the ELSE branch
+-- (raw text only). Subsequent re-ingest via S2+ will UPDATE
+-- tokenized_korean_text and fire chunks_au to re-index with morphemes.
+INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
+  SELECT chunk_id, doc_id, heading_path_json,
+         CASE WHEN tokenized_korean_text IS NOT NULL
+              THEN tokenized_korean_text || ' ' || text
+              ELSE text
+         END
+  FROM chunks;
+
+-- Bump corpus_revision so the in-process LRU search cache is invalidated.
+-- kv columns are `key` TEXT + `value` TEXT (V004__kv.sql), so CAST is needed for
+-- integer arithmetic. The UPDATE matches no rows (silent no-op) if the key is absent.
+UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';