feat(fts): add V009 korean morphological tokenizer migration

V007 trigram tokenizer 의 한국어 2자 query 0-hit 한계 (Bug #8) 해소를
위한 V009 migration 추가. unicode61 tokenizer 로 환원 + 한국어 형태소
분해 결과를 별 column `tokenized_korean_text` 에 pre-fill 하는 방식.

- migrations/V009__fts_korean_morphological.sql 신규: column ADD,
  chunks_fts DROP+재정의, trigger 3개 재생성 (chunks_ai/chunks_au 는
  CASE expression 사용, chunks_ad 는 기존과 동일), backfill INSERT,
  corpus_revision bump.
- design §5.5 갱신: trigram → unicode61 + 형태소 column. CASE
  expression trigger 본문.
- crates/kebab-store-sqlite/tests/fts.rs: V007 verbatim test 를
  V009 source-of-truth 로 rename. v009_bumps_corpus_revision unit
  test 추가.
- store.rs: clippy bool_to_int_with_if + cast_lossless 기존 경고 수정
  (pdf_ocr_events 관련 코드, S1 작업 중 발견).

영어 substring 매칭은 V002 (whole-token only) 로 회귀 — spec §3
Non-Goals + 후속 release notes (v0.20.1) 에서 정직히 기술.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 09:48:46 +00:00
parent 43366b1b15
commit b106120e93
4 changed files with 149 additions and 29 deletions

View File

@@ -1028,7 +1028,7 @@ impl SqliteStore {
image_height,
ms,
chars,
if success { 1i32 } else { 0i32 },
i32::from(success),
reason,
ocr_engine
],
@@ -1042,7 +1042,7 @@ impl SqliteStore {
/// means "delete everything older than now" (i.e. all past rows).
pub fn prune_pdf_ocr_events(&self, retention_days: u32) -> anyhow::Result<u64> {
use time::format_description::well_known::Rfc3339;
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(retention_days as i64);
let cutoff = time::OffsetDateTime::now_utc() - time::Duration::days(i64::from(retention_days));
let cutoff_ts = cutoff
.format(&Rfc3339)
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string());

View File

@@ -368,19 +368,20 @@ fn extract_design_5_5_fts_block() -> String {
fts_slice[..last_end + "END;".len()].to_string()
}
/// Extract the §5.5 verbatim block from the V007 migration (replaced V002
/// 's unicode61 tokenizer with trigram — V002 stays in place for
/// historical cold-upgrade replay but V007 is now the source of truth),
/// between the `── §5.5 verbatim block ──` anchor markers V007 carries.
/// Extract the §5.5 verbatim block from the V009 migration (V009 replaces
/// V007 's trigram tokenizer with unicode61 + CASE expression triggers for
/// Korean morphological tokenization — V007 stays in place for historical
/// cold-upgrade replay but V009 is now the source of truth),
/// between the `── §5.5 verbatim block ──` anchor markers V009 carries.
fn extract_migration_5_5_verbatim_block() -> String {
let migration = include_str!("../../../migrations/V007__fts_trigram.sql");
let migration = include_str!("../../../migrations/V009__fts_korean_morphological.sql");
// The opening anchor line ends with `── §5.5 verbatim block ─...`.
let open_marker = "§5.5 verbatim block";
let close_marker = "End §5.5 verbatim block";
let open_idx = migration
.find(open_marker)
.expect("V007 must carry the `§5.5 verbatim block` opening anchor");
.expect("V009 must carry the `§5.5 verbatim block` opening anchor");
let after_open_line = open_idx
+ migration[open_idx..]
.find('\n')
@@ -389,7 +390,7 @@ fn extract_migration_5_5_verbatim_block() -> String {
let close_idx = migration[after_open_line..]
.find(close_marker)
.expect("V007 must carry the `End §5.5 verbatim block` closing anchor")
.expect("V009 must carry the `End §5.5 verbatim block` closing anchor")
+ after_open_line;
// Walk back from the close marker to the start of its comment line.
let close_line_start = migration[..close_idx].rfind('\n').map_or(0, |n| n + 1);
@@ -397,14 +398,15 @@ fn extract_migration_5_5_verbatim_block() -> String {
migration[after_open_line..close_line_start].to_string()
}
/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql`
/// must match the design doc verbatim (whitespace-normalized). V007
/// replaced V002 's unicode61 tokenizer with trigram (2026-05-23).
/// V002 stays in place for historical replay of cold-upgrade paths
/// but is no longer compared against the design doc — V007 is now
/// CI diff guard: the §5.5 block in `migrations/V009__fts_korean_morphological.sql`
/// must match the design doc verbatim (whitespace-normalized). V009
/// replaced V007 's trigram tokenizer with unicode61 + CASE expression
/// triggers for Korean morphological tokenization (2026-05-28).
/// V007 stays in place for historical replay of cold-upgrade paths
/// but is no longer compared against the design doc — V009 is now
/// the source of truth.
#[test]
fn fts_v007_matches_design_section_5_5_verbatim() {
fn fts_v009_matches_design_section_5_5_verbatim() {
let design = extract_design_5_5_fts_block();
let migration_block = extract_migration_5_5_verbatim_block();
@@ -427,12 +429,30 @@ fn fts_v007_matches_design_section_5_5_verbatim() {
let migration_n = normalize_ws(&migration_block);
assert_eq!(
design_n, migration_n,
"V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \
"V009__fts_korean_morphological.sql §5.5 block must match design doc §5.5 verbatim \
(whitespace-normalized). If you intentionally changed one, \
update the other in the same commit."
);
}
// ── 5b. V009 corpus_revision bump ────────────────────────────────────
/// Verifies that the V009 migration bumps the `corpus_revision` kv entry.
///
/// After `SqliteStore::open` + `run_migrations`, `corpus_revision` must be ≥ 1
/// (V004 seeds '0'; the V009 UPDATE evaluates
/// `CAST(CAST('0' AS INTEGER) + 1 AS TEXT)` = '1').
#[test]
fn v009_bumps_corpus_revision() {
    // Fresh, isolated store with every migration applied.
    let test_env = common::TestEnv::new();
    let store = SqliteStore::open(&test_env.config()).unwrap();
    store.run_migrations().unwrap();

    // Lower bound only: the check is "at least one bump happened",
    // not an exact revision number.
    let rev = store.corpus_revision();
    assert!(
        rev >= 1,
        "corpus_revision must be ≥ 1 after V009 migration \
         (V004 seeds 0, V009 bumps to ≥ 1); got {rev}"
    );
}
// ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ──────────
/// Mirror the P1-6 pattern: opening + migrating + dropping the store

View File

@@ -1062,15 +1062,15 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id);
### 5.5 Chunks + FTS5
Tokenizer = `trigram` (V007, 2026-05-23). 한국어 어절(조사·어미가 붙은 단위)이
unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소
(2자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는
회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase
후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동
(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid
는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md`
(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`.
`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007
Tokenizer = `unicode61` (V009, 2026-05-28). V007 trigram 의 한국어 2자 query
0-hit 한계 (Bug #8) 를 해소하기 위해 한국어 형태소 분석 기반 접근법 채택.
`chunks` 테이블에 `tokenized_korean_text TEXT` 컬럼 추가 — ingest 경로가
lindera ko-dic 형태소 분석 결과(공백 구분 형태소 sequence)를 pre-fill.
chunks_ai/chunks_au trigger 가 `tokenized_korean_text || ' ' || text` 를
FTS5 에 색인 (CASE expression: NULL 이면 raw text 만). '한국', '서울' 같은
2자 단어도 형태소 경계 일치 시 hit 가능. 영어 substring 매칭은 V002 수준
(whole-token only) 으로 회귀 — 자세한 내용 = `tasks/HOTFIXES.md` (2026-05-28).
`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V009
DDL 에 `content=''` 없음).
```sql
@@ -1085,7 +1085,8 @@ CREATE TABLE chunks (
chunker_version TEXT NOT NULL,
policy_hash TEXT NOT NULL,
block_ids_json TEXT NOT NULL,
created_at TEXT NOT NULL
created_at TEXT NOT NULL,
tokenized_korean_text TEXT
);
CREATE INDEX idx_chunks_doc_id ON chunks(doc_id);
CREATE INDEX idx_chunks_chunker_version ON chunks(chunker_version);
@@ -1095,12 +1096,16 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5(
doc_id UNINDEXED,
heading_path,
text,
tokenize = 'trigram'
tokenize = 'unicode61'
);
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
CASE WHEN new.tokenized_korean_text IS NOT NULL
THEN new.tokenized_korean_text || ' ' || new.text
ELSE new.text
END);
END;
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
@@ -1108,7 +1113,11 @@ END;
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
CASE WHEN new.tokenized_korean_text IS NOT NULL
THEN new.tokenized_korean_text || ' ' || new.text
ELSE new.text
END);
END;
```

View File

@@ -0,0 +1,91 @@
-- V009__fts_korean_morphological.sql — Replace chunks_fts tokenizer: trigram → unicode61.
--
-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
-- §5.5; CI diff-checks this against the design doc (test
-- `fts_v009_matches_design_section_5_5_verbatim` in
-- `crates/kebab-store-sqlite/tests/fts.rs`).
--
-- Tokenizer choice: unicode61 + pre-tokenized Korean column.
-- V007 trigram enabled substring matching for Korean ≥3 chars but
-- 2-char Korean queries (e.g. '한국', '서울') always returned 0 hits.
-- V009 adds `tokenized_korean_text TEXT` column to `chunks` — the ingest
-- path (S2+) runs lindera ko-dic morphological analysis and writes the
-- space-separated morpheme sequence to this column. The chunks_ai/chunks_au
-- triggers concatenate tokenized_korean_text with the raw text before
-- indexing into chunks_fts, so both Korean morphemes AND English tokens
-- are searchable via a single FTS query. English substring matching
-- (V007 ad-hoc feature) reverts to whole-token matching (V002 behavior).
-- corpus_revision is bumped so the in-process search cache is automatically
-- invalidated. See tasks/HOTFIXES.md (2026-05-28) for the deviation log.
--
-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no
-- `content=''`); this migration drops the old shadow, recreates it with
-- the new tokenizer, recreates the sync triggers (CASE expression for
-- tokenized_korean_text), and backfills from `chunks`. Existing `chunks`
-- rows (apart from the new, initially-NULL column) and embeddings are
-- untouched, so users do NOT need to re-ingest after upgrading — the
-- migration is fully automatic. tokenized_korean_text starts as NULL for
-- all pre-V009 rows, so those rows are indexed from raw text only until a
-- subsequent kebab ingest (S2+ path) fills the column via UPDATE, firing
-- chunks_au to re-index them with morphemes.
-- ── Korean morphological tokenizer (V009) ─────────────────────────────
-- Add a nullable TEXT column to `chunks` that stores the Korean
-- morpheme-decomposed form of the chunk text. No default is given, so
-- every existing row reads NULL for this column after the ALTER.
ALTER TABLE chunks ADD COLUMN tokenized_korean_text TEXT;
-- Remove the existing chunks_fts (trigram tokenizer) together with its
-- three sync triggers; the triggers are dropped first, then the table
-- they write to. IF EXISTS keeps each statement from failing when the
-- object is already absent.
DROP TRIGGER IF EXISTS chunks_au;
DROP TRIGGER IF EXISTS chunks_ad;
DROP TRIGGER IF EXISTS chunks_ai;
DROP TABLE IF EXISTS chunks_fts;
-- NOTE: everything between the two anchor lines below is diff-checked
-- (whitespace-normalized) against the design doc by the CI test named in
-- the file header. Do not add comments or edit statements inside the
-- anchored region without updating the design doc in the same commit.
-- ── §5.5 verbatim block ────────────────────────────────────────────────
CREATE VIRTUAL TABLE chunks_fts USING fts5(
chunk_id UNINDEXED,
doc_id UNINDEXED,
heading_path,
text,
tokenize = 'unicode61'
);
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
CASE WHEN new.tokenized_korean_text IS NOT NULL
THEN new.tokenized_korean_text || ' ' || new.text
ELSE new.text
END);
END;
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
END;
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
CASE WHEN new.tokenized_korean_text IS NOT NULL
THEN new.tokenized_korean_text || ' ' || new.text
ELSE new.text
END);
END;
-- ── End §5.5 verbatim block ───────────────────────────────────────────
-- One-shot backfill of the freshly created chunks_fts from existing chunks.
-- tokenized_korean_text is NULL for all pre-V009 rows (the column was only
-- just added by this migration), so the CASE expression falls to the ELSE
-- branch (raw text only). The CASE is kept identical to the one used in the
-- chunks_ai/chunks_au triggers so that backfilled rows are indexed the same
-- way as trigger-written rows. Subsequent re-ingest via S2+ will UPDATE
-- tokenized_korean_text and fire chunks_au to re-index with morphemes.
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
SELECT chunk_id, doc_id, heading_path_json,
CASE WHEN tokenized_korean_text IS NOT NULL
THEN tokenized_korean_text || ' ' || text
ELSE text
END
FROM chunks;
-- Bump corpus_revision so the in-process LRU search cache is invalidated.
-- kv table columns are `key` TEXT + `value` TEXT (V004__kv.sql).
-- value is TEXT, so it is CAST to INTEGER for the increment and the result
-- is CAST back to TEXT for storage.
-- NOTE(review): if no `corpus_revision` row exists, the WHERE clause matches
-- nothing and the bump is silently skipped — presumably V004 always seeds
-- the row; confirm.
UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';