From b106120e931ee54e9e89f84dba2dc49f79ac394e Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 28 May 2026 09:48:46 +0000 Subject: [PATCH] feat(fts): add V009 korean morphological tokenizer migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V007 trigram tokenizer 의 한국어 2자 query 0-hit 한계 (Bug #8) 해소를 위한 V009 migration 추가. unicode61 tokenizer 로 환원 + 한국어 형태소 분해 결과를 별 column `tokenized_korean_text` 에 pre-fill 하는 방식. - migrations/V009__fts_korean_morphological.sql 신규: column ADD, chunks_fts DROP+재정의, 3 trigger CASE expression, backfill INSERT, corpus_revision bump. - design §5.5 갱신: trigram → unicode61 + 형태소 column. CASE expression trigger 본문. - crates/kebab-store-sqlite/tests/fts.rs: V007 verbatim test 를 V009 source-of-truth 로 rename. v009_bumps_corpus_revision unit test 추가. - store.rs: clippy bool_to_int_with_if + cast_lossless 기존 경고 수정 (pdf_ocr_events 관련 코드, S1 작업 중 발견). 영어 substring 매칭은 V002 (whole-token only) 로 회귀 — spec §3 Non-Goals + 후속 release notes (v0.20.1) 에서 정직히 기술. 
Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S1) Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-store-sqlite/src/store.rs | 4 +- crates/kebab-store-sqlite/tests/fts.rs | 48 +++++++--- .../2026-04-27-kebab-final-form-design.md | 35 ++++--- migrations/V009__fts_korean_morphological.sql | 91 +++++++++++++++++++ 4 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 migrations/V009__fts_korean_morphological.sql diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 437cc02..722001a 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -1028,7 +1028,7 @@ impl SqliteStore { image_height, ms, chars, - if success { 1i32 } else { 0i32 }, + i32::from(success), reason, ocr_engine ], @@ -1042,7 +1042,7 @@ impl SqliteStore { /// means "delete everything older than now" (i.e. all past rows). 
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result { use time::format_description::well_known::Rfc3339; - let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64); + let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days)); let cutoff_ts = cutoff .format(&Rfc3339) .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()); diff --git a/crates/kebab-store-sqlite/tests/fts.rs b/crates/kebab-store-sqlite/tests/fts.rs index 5d9d978..2aa1c86 100644 --- a/crates/kebab-store-sqlite/tests/fts.rs +++ b/crates/kebab-store-sqlite/tests/fts.rs @@ -368,19 +368,20 @@ fn extract_design_5_5_fts_block() -> String { fts_slice[..last_end + "END;".len()].to_string() } -/// Extract the §5.5 verbatim block from the V007 migration (replaced V002 -/// 's unicode61 tokenizer with trigram — V002 stays in place for -/// historical cold-upgrade replay but V007 is now the source of truth), -/// between the `── §5.5 verbatim block ──` anchor markers V007 carries. +/// Extract the §5.5 verbatim block from the V009 migration (V009 replaces +/// V007 's trigram tokenizer with unicode61 + CASE expression triggers for +/// Korean morphological tokenization — V007 stays in place for historical +/// cold-upgrade replay but V009 is now the source of truth), +/// between the `── §5.5 verbatim block ──` anchor markers V009 carries. fn extract_migration_5_5_verbatim_block() -> String { - let migration = include_str!("../../../migrations/V007__fts_trigram.sql"); + let migration = include_str!("../../../migrations/V009__fts_korean_morphological.sql"); // The opening anchor line ends with `── §5.5 verbatim block ─...`. 
let open_marker = "§5.5 verbatim block"; let close_marker = "End §5.5 verbatim block"; let open_idx = migration .find(open_marker) - .expect("V007 must carry the `§5.5 verbatim block` opening anchor"); + .expect("V009 must carry the `§5.5 verbatim block` opening anchor"); let after_open_line = open_idx + migration[open_idx..] .find('\n') @@ -389,7 +390,7 @@ fn extract_migration_5_5_verbatim_block() -> String { let close_idx = migration[after_open_line..] .find(close_marker) - .expect("V007 must carry the `End §5.5 verbatim block` closing anchor") + .expect("V009 must carry the `End §5.5 verbatim block` closing anchor") + after_open_line; // Walk back from the close marker to the start of its comment line. let close_line_start = migration[..close_idx].rfind('\n').map_or(0, |n| n + 1); @@ -397,14 +398,15 @@ fn extract_migration_5_5_verbatim_block() -> String { migration[after_open_line..close_line_start].to_string() } -/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql` -/// must match the design doc verbatim (whitespace-normalized). V007 -/// replaced V002 's unicode61 tokenizer with trigram (2026-05-23). -/// V002 stays in place for historical replay of cold-upgrade paths -/// but is no longer compared against the design doc — V007 is now +/// CI diff guard: the §5.5 block in `migrations/V009__fts_korean_morphological.sql` +/// must match the design doc verbatim (whitespace-normalized). V009 +/// replaced V007 's trigram tokenizer with unicode61 + CASE expression +/// triggers for Korean morphological tokenization (2026-05-28). +/// V007 stays in place for historical replay of cold-upgrade paths +/// but is no longer compared against the design doc — V009 is now /// the source of truth. 
#[test] -fn fts_v007_matches_design_section_5_5_verbatim() { +fn fts_v009_matches_design_section_5_5_verbatim() { let design = extract_design_5_5_fts_block(); let migration_block = extract_migration_5_5_verbatim_block(); @@ -427,12 +429,30 @@ fn fts_v007_matches_design_section_5_5_verbatim() { let migration_n = normalize_ws(&migration_block); assert_eq!( design_n, migration_n, - "V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \ + "V009__fts_korean_morphological.sql §5.5 block must match design doc §5.5 verbatim \ (whitespace-normalized). If you intentionally changed one, \ update the other in the same commit." ); } +// ── 5b. V009 corpus_revision bump ──────────────────────────────────── + +/// V009 migration 이 corpus_revision kv 를 bump 하는지 검증. +/// SqliteStore::open + run_migrations 후 corpus_revision 이 ≥ 1 이어야 함. +/// (V004 seed = '0', V009 UPDATE = CAST(CAST('0' AS INTEGER) + 1 AS TEXT) = '1'). +#[test] +fn v009_bumps_corpus_revision() { + let env = common::TestEnv::new(); + let store = SqliteStore::open(&env.config()).unwrap(); + store.run_migrations().unwrap(); + let rev = store.corpus_revision(); + assert!( + rev >= 1, + "corpus_revision must be ≥ 1 after V009 migration \ + (V004 seeds 0, V009 bumps to ≥ 1); got {rev}" + ); +} + // ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ────────── /// Mirror the P1-6 pattern: opening + migrating + dropping the store diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index 4302716..d912184 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1062,15 +1062,15 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id); ### 5.5 Chunks + FTS5 -Tokenizer = `trigram` (V007, 2026-05-23). 
한국어 어절(조사·어미가 붙은 단위)이 -unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소 -(2자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는 -회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase -후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동 -(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid -는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md` -(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`. -`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007 +Tokenizer = `unicode61` (V009, 2026-05-28). V007 trigram 의 한국어 2자 query +0-hit 한계 (Bug #8) 를 해소하기 위해 한국어 형태소 분석 기반 접근법 채택. +`chunks` 테이블에 `tokenized_korean_text TEXT` 컬럼 추가 — ingest 경로가 +lindera ko-dic 형태소 분석 결과(공백 구분 형태소 sequence)를 pre-fill. +chunks_ai/chunks_au trigger 가 `tokenized_korean_text || ' ' || text` 를 +FTS5 에 색인 (CASE expression: NULL 이면 raw text 만). '한국', '서울' 같은 +2자 단어도 형태소 경계 일치 시 hit 가능. 영어 substring 매칭은 V002 수준 +(whole-token only) 으로 회귀 — 자세한 내용 = `tasks/HOTFIXES.md` (2026-05-28). +`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V009 DDL 에 `content=''` 없음). 
```sql @@ -1085,7 +1085,8 @@ CREATE TABLE chunks ( chunker_version TEXT NOT NULL, policy_hash TEXT NOT NULL, block_ids_json TEXT NOT NULL, - created_at TEXT NOT NULL + created_at TEXT NOT NULL, + tokenized_korean_text TEXT ); CREATE INDEX idx_chunks_doc_id ON chunks(doc_id); CREATE INDEX idx_chunks_chunker_version ON chunks(chunker_version); @@ -1095,12 +1096,16 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5( doc_id UNINDEXED, heading_path, text, - tokenize = 'trigram' + tokenize = 'unicode61' ); CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) - VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text); + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, + CASE WHEN new.tokenized_korean_text IS NOT NULL + THEN new.tokenized_korean_text || ' ' || new.text + ELSE new.text + END); END; CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; @@ -1108,7 +1113,11 @@ END; CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) - VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text); + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, + CASE WHEN new.tokenized_korean_text IS NOT NULL + THEN new.tokenized_korean_text || ' ' || new.text + ELSE new.text + END); END; ``` diff --git a/migrations/V009__fts_korean_morphological.sql b/migrations/V009__fts_korean_morphological.sql new file mode 100644 index 0000000..4a71811 --- /dev/null +++ b/migrations/V009__fts_korean_morphological.sql @@ -0,0 +1,91 @@ +-- V009__fts_korean_morphological.sql — Replace chunks_fts tokenizer: trigram → unicode61. +-- +-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers). 
+-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced +-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` +-- §5.5; CI diff-checks this against the design doc (test +-- `fts_v009_matches_design_section_5_5_verbatim` in +-- `crates/kebab-store-sqlite/tests/fts.rs`). +-- +-- Tokenizer choice: unicode61 + pre-tokenized Korean column. +-- V007 trigram enabled substring matching for Korean ≥3 chars but +-- 2-char Korean queries (e.g. '한국', '서울') always returned 0 hits. +-- V009 adds `tokenized_korean_text TEXT` column to `chunks` — the ingest +-- path (S2+) runs lindera ko-dic morphological analysis and writes the +-- space-separated morpheme sequence to this column. The chunks_ai/chunks_au +-- triggers concatenate tokenized_korean_text with the raw text before +-- indexing into chunks_fts, so both Korean morphemes AND English tokens +-- are searchable via a single FTS query. English substring matching +-- (V007 ad-hoc feature) reverts to whole-token matching (V002 behavior). +-- corpus_revision is bumped so the in-process search cache is automatically +-- invalidated. See tasks/HOTFIXES.md (2026-05-28) for the deviation log. +-- +-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no +-- `content=''`); this migration drops the old shadow, recreates it with +-- the new tokenizer, recreates the sync triggers (CASE expression for +-- tokenized_korean_text), and backfills from `chunks`. The `chunks` table +-- and embeddings are untouched, so users do NOT need to re-ingest after +-- upgrading — the migration is fully automatic. tokenized_korean_text +-- starts as NULL for all pre-V009 rows; a subsequent kebab ingest +-- (S2+ path) will fill it in via UPDATE, firing chunks_au to re-index. + +-- ── Korean morphological tokenizer (V009) ───────────────────────────── + +-- chunks 테이블에 한국어 형태소 분해된 text 를 저장할 열 추가. +ALTER TABLE chunks ADD COLUMN tokenized_korean_text TEXT; + +-- 기존 chunks_fts 제거 (trigram tokenizer). 
+DROP TRIGGER IF EXISTS chunks_au; +DROP TRIGGER IF EXISTS chunks_ad; +DROP TRIGGER IF EXISTS chunks_ai; +DROP TABLE IF EXISTS chunks_fts; + +-- ── §5.5 verbatim block ──────────────────────────────────────────────── + +CREATE VIRTUAL TABLE chunks_fts USING fts5( + chunk_id UNINDEXED, + doc_id UNINDEXED, + heading_path, + text, + tokenize = 'unicode61' +); + +CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, + CASE WHEN new.tokenized_korean_text IS NOT NULL + THEN new.tokenized_korean_text || ' ' || new.text + ELSE new.text + END); +END; +CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN + DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; +END; +CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN + DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; + INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, + CASE WHEN new.tokenized_korean_text IS NOT NULL + THEN new.tokenized_korean_text || ' ' || new.text + ELSE new.text + END); +END; + +-- ── End §5.5 verbatim block ─────────────────────────────────────────── + +-- One-shot backfill from existing chunks. tokenized_korean_text is NULL +-- for all pre-V009 rows so the CASE expression falls to the ELSE branch +-- (raw text only). Subsequent re-ingest via S2+ will UPDATE +-- tokenized_korean_text and fire chunks_au to re-index with morphemes. +INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + SELECT chunk_id, doc_id, heading_path_json, + CASE WHEN tokenized_korean_text IS NOT NULL + THEN tokenized_korean_text || ' ' || text + ELSE text + END + FROM chunks; + +-- Bump corpus_revision so the in-process LRU search cache is invalidated. +-- kv table columns are `key` TEXT + `value` TEXT (V004__kv.sql). +-- value is TEXT so CAST is required for integer arithmetic. 
+UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision'; -- adds 1 to the TEXT counter, matches only the corpus_revision row, no-op when that row is absent