From 8dcedc4b11c351240f16410fb191eddd20ad43f3 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 23 May 2026 00:52:40 +0000 Subject: [PATCH] =?UTF-8?q?feat(p10-r2):=20V007=20trigram=20migration=20+?= =?UTF-8?q?=20design=20=C2=A75.5=20+=20fts=20diff-check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task A2 + A3 한 묶음. migrations/V007__fts_trigram.sql 신규: - chunks_fts shadow 를 DROP + 재생성 (tokenize = trigram). - chunks_ai/ad/au trigger 재생성 (V002 와 동일). - chunks 에서 backfill INSERT — 사용자 re-ingest 불필요, V007 자동. - V002 는 historical cold-upgrade replay 위해 그대로 유지. design §5.5 갱신: - verbatim block 의 tokenize 만 trigram 으로 교체. - §5.5 본문 상단에 한국어 채택 사유 + trade-off (영어 lexical 변경, BM25 분포, 디스크 ~2-10x, contentless 아님) prose 한 단락 추가. crates/kebab-store-sqlite/tests/fts.rs: - fts_v002_matches_design_section_5_5_verbatim → fts_v007_matches_design_section_5_5_verbatim 으로 rename. - extract_migration_5_5_verbatim_block() 의 include_str! path 를 V007__fts_trigram.sql 로 변경. 주석/assertion msg V007 로. - V002 cold-upgrade test 들 (fts_v002_backfill_*) 은 그대로 유지. 검증: cargo test -p kebab-store-sqlite --test fts → 10/10 PASS (`fts_v007_matches_design_section_5_5_verbatim` 포함). Codex round 1/2 의 design §5.5 contentless 정정·trigram tokenizer 채택 사유 명시 발견 반영. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-store-sqlite/tests/fts.rs | 26 ++++---- .../2026-05-22-korean-trigram-tokenizer.md | 14 ++--- .../2026-04-27-kebab-final-form-design.md | 13 +++- migrations/V007__fts_trigram.sql | 60 +++++++++++++++++++ 4 files changed, 94 insertions(+), 19 deletions(-) create mode 100644 migrations/V007__fts_trigram.sql diff --git a/crates/kebab-store-sqlite/tests/fts.rs b/crates/kebab-store-sqlite/tests/fts.rs index 2d66204..11e1fc9 100644 --- a/crates/kebab-store-sqlite/tests/fts.rs +++ b/crates/kebab-store-sqlite/tests/fts.rs @@ -370,17 +370,19 @@ fn extract_design_5_5_fts_block() -> String { fts_slice[..last_end + "END;".len()].to_string() } -/// Extract the §5.5 verbatim block from the V002 migration, between the -/// `── §5.5 verbatim block ──` anchor markers the file already carries. +/// Extract the §5.5 verbatim block from the V007 migration (replaced V002 +/// 's unicode61 tokenizer with trigram — V002 stays in place for +/// historical cold-upgrade replay but V007 is now the source of truth), +/// between the `── §5.5 verbatim block ──` anchor markers V007 carries. fn extract_migration_5_5_verbatim_block() -> String { - let migration = include_str!("../../../migrations/V002__fts.sql"); + let migration = include_str!("../../../migrations/V007__fts_trigram.sql"); // The opening anchor line ends with `── §5.5 verbatim block ─...`. let open_marker = "§5.5 verbatim block"; let close_marker = "End §5.5 verbatim block"; let open_idx = migration .find(open_marker) - .expect("V002 must carry the `§5.5 verbatim block` opening anchor"); + .expect("V007 must carry the `§5.5 verbatim block` opening anchor"); let after_open_line = open_idx + migration[open_idx..] .find('\n') @@ -389,7 +391,7 @@ fn extract_migration_5_5_verbatim_block() -> String { let close_idx = migration[after_open_line..] 
.find(close_marker) - .expect("V002 must carry the `End §5.5 verbatim block` closing anchor") + .expect("V007 must carry the `End §5.5 verbatim block` closing anchor") + after_open_line; // Walk back from the close marker to the start of its comment line. let close_line_start = migration[..close_idx] @@ -400,12 +402,14 @@ fn extract_migration_5_5_verbatim_block() -> String { migration[after_open_line..close_line_start].to_string() } -/// CI diff guard: the §5.5 block in `migrations/V002__fts.sql` must -/// match the design doc verbatim (whitespace-normalized). If the -/// design doc moves the section, renames the heading, or edits the -/// SQL, this test fails first. Same for migration drift. +/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql` +/// must match the design doc verbatim (whitespace-normalized). V007 +/// replaced V002 's unicode61 tokenizer with trigram (2026-05-23). +/// V002 stays in place for historical replay of cold-upgrade paths +/// but is no longer compared against the design doc — V007 is now +/// the source of truth. #[test] -fn fts_v002_matches_design_section_5_5_verbatim() { +fn fts_v007_matches_design_section_5_5_verbatim() { let design = extract_design_5_5_fts_block(); let migration_block = extract_migration_5_5_verbatim_block(); @@ -428,7 +432,7 @@ fn fts_v002_matches_design_section_5_5_verbatim() { let migration_n = normalize_ws(&migration_block); assert_eq!( design_n, migration_n, - "V002__fts.sql §5.5 block must match design doc §5.5 verbatim \ + "V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \ (whitespace-normalized). If you intentionally changed one, \ update the other in the same commit." 
); diff --git a/docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md b/docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md index f1072df..55bca25 100644 --- a/docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md +++ b/docs/superpowers/plans/2026-05-22-korean-trigram-tokenizer.md @@ -71,7 +71,7 @@ Codex 리뷰로 현재 `build_match_string()` (lexical.rs:177) 이 trigram 비 - Create: `migrations/V007__fts_trigram.sql` - Read: `migrations/V002__fts.sql` (trigger 본문 verbatim 복사용) -- [ ] **Step 1: V007 작성** — 아래 내용으로 생성. 컬럼 구성은 V002 와 동일, `tokenize` 만 교체. trigger 본문은 V002 와 동일. +- [x] **Step 1: V007 작성** — 아래 내용으로 생성. 컬럼 구성은 V002 와 동일, `tokenize` 만 교체. trigger 본문은 V002 와 동일. ```sql -- V007__fts_trigram.sql @@ -116,7 +116,7 @@ INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) > Step 1 전에 `migrations/V002__fts.sql` 의 `CREATE VIRTUAL TABLE` 컬럼 목록과 trigger 본문을 실제로 대조해, 위 SQL 이 V002 와 trigger 본문·컬럼명(`heading_path_json` 등)에서 정확히 일치하는지 확인한다. 다르면 V002 를 source 로 맞춘다. -- [ ] **Step 2: migration 적용 확인** — `cargo test -p kebab-store-sqlite` 를 돌려 refinery 가 V007 을 무오류로 적용하는지 확인한다. Expected: 컴파일 + 기존 store 테스트 통과 (단 A3 의 diff-check 테스트는 아직 실패 — 다음 task). +- [x] **Step 2: migration 적용 확인** — `cargo test -p kebab-store-sqlite` 통과 (10/10 fts tests + 모든 store test PASS). V007 backfill 도 정상 동작. ### Task A3: design §5.5 verbatim + CI diff-check 갱신 @@ -124,15 +124,15 @@ INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) - Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (§5.5, 라인 ~1024-1043) - Modify: `crates/kebab-store-sqlite/tests/fts.rs` (`fts_v002_matches_design_section_5_5_verbatim`, 라인 ~408-435) -- [ ] **Step 1: diff-check 테스트 실행(실패 확인)** — `cargo test -p kebab-store-sqlite fts_v002_matches_design` 실행. Expected: FAIL — design §5.5 는 아직 unicode61, V007 은 trigram (또는 테스트가 V002 만 본다면 PASS 인 채로 남아 trigram 을 검증 안 함 — Step 2 에서 V007 로 대상 변경). 
+- [x] **Step 1: diff-check 테스트 baseline 확인** — A2 검증에서 `fts_v002_matches_design_section_5_5_verbatim` 는 PASS (V002 vs design 둘 다 unicode61 시점이라 match). V007 추가 자체는 기존 test 안 깨뜨림. -- [ ] **Step 2: design §5.5 갱신** — §5.5 의 verbatim SQL 블록의 `tokenize = 'unicode61 remove_diacritics 2'` 를 `tokenize = 'trigram'` 으로 바꾸고, 블록이 V007 의 `CREATE VIRTUAL TABLE` + 3 trigger 와 정확히 일치하도록 맞춘다. §5.5 본문에 한국어 trigram 채택 사유 한 문장 추가(unicode61 의 한국어 한계, HOTFIXES 2026-05-22 cross-link). +- [x] **Step 2: design §5.5 갱신** — `tokenize = 'unicode61 remove_diacritics 2'` → `'trigram'`. §5.5 본문 위에 한국어 trigram 채택 사유 + trade-off + "contentless 가 아님" 명시 prose 한 단락 추가. -- [ ] **Step 3: diff-check 테스트를 V007 대상으로 갱신** — `fts.rs` 의 테스트 함수를 `fts_v007_matches_design_section_5_5_verbatim` 으로 rename, 대조 파일을 `V007__fts_trigram.sql` 로, 기대 design 섹션을 갱신된 §5.5 로 바꾼다. whitespace-normalized 비교 로직은 그대로. +- [x] **Step 3: diff-check 테스트를 V007 대상으로 갱신** — `extract_migration_5_5_verbatim_block()` 의 `include_str!` path 를 `V007__fts_trigram.sql` 로, 함수명 `fts_v002_matches_design_section_5_5_verbatim` → `fts_v007_matches_design_section_5_5_verbatim`, assertion msg 갱신. -- [ ] **Step 4: 테스트 통과 확인** — `cargo test -p kebab-store-sqlite fts_v007_matches_design` → PASS. +- [x] **Step 4: 테스트 통과 확인** — `cargo test -p kebab-store-sqlite --test fts` → 10/10 PASS (`fts_v007_matches_design_section_5_5_verbatim` 포함). -- [ ] **Step 5: Commit** — `git add migrations/V007__fts_trigram.sql crates/kebab-store-sqlite/tests/fts.rs docs/superpowers/specs/` → `git commit` (feat: trigram tokenizer migration + design §5.5). +- [ ] **Step 5: Commit** — A2 + A3 한 묶음으로 commit. 
### Task A4: 한국어/영어 trigram 매칭 테스트 diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index 3e2c7a9..c0c33f9 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1004,6 +1004,17 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id); ### 5.5 Chunks + FTS5 +Tokenizer = `trigram` (V007, 2026-05-23). 한국어 어절(조사·어미가 붙은 단위)이 +unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소 +(3자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는 +회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase +후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동 +(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid +는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md` +(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`. +`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007 +DDL 에 `content=''` 없음). + ```sql CREATE TABLE chunks ( chunk_id TEXT PRIMARY KEY, @@ -1026,7 +1037,7 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5( doc_id UNINDEXED, heading_path, text, - tokenize = 'unicode61 remove_diacritics 2' + tokenize = 'trigram' ); CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN diff --git a/migrations/V007__fts_trigram.sql b/migrations/V007__fts_trigram.sql new file mode 100644 index 0000000..4546b05 --- /dev/null +++ b/migrations/V007__fts_trigram.sql @@ -0,0 +1,60 @@ +-- V007__fts_trigram.sql — Replace chunks_fts tokenizer: unicode61 → trigram. +-- +-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers). 
+-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced +-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` +-- §5.5; CI diff-checks this against the design doc (test +-- `fts_v007_matches_design_section_5_5_verbatim` in +-- `crates/kebab-store-sqlite/tests/fts.rs`). +-- +-- Tokenizer choice: trigram. Korean is agglutinative — unicode61 emits each +-- whole eojeol (조사·어미 attached) as a single token, so partial matches fail. trigram +-- indexes 3-character grams, enabling Korean partial matches. Trade-offs: +-- DB size grows (~2-10×), English lexical also moves to substring match +-- (recall↑, precision↓), BM25 score distribution shifts. See +-- `tasks/HOTFIXES.md` (2026-05-22) and the v0.17.0 design doc (`docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`). +-- +-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no +-- `content=''`); this migration drops the old shadow, recreates it with +-- the new tokenizer, recreates the sync triggers (verbatim from V002), +-- and backfills from `chunks`. The `chunks` table and embeddings are +-- untouched, so users do NOT need to re-ingest after upgrading to +-- v0.17.0 — the migration is fully automatic. 
+ +DROP TRIGGER IF EXISTS chunks_au; +DROP TRIGGER IF EXISTS chunks_ad; +DROP TRIGGER IF EXISTS chunks_ai; +DROP TABLE IF EXISTS chunks_fts; + +-- ── §5.5 verbatim block ──────────────────────────────────────────────── + +CREATE VIRTUAL TABLE chunks_fts USING fts5( + chunk_id UNINDEXED, + doc_id UNINDEXED, + heading_path, + text, + tokenize = 'trigram' +); + +CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN + INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text); +END; +CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN + DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; +END; +CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN + DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id; + INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text); +END; + +-- ── End §5.5 verbatim block ─────────────────────────────────────────── + +-- One-shot backfill from existing chunks. Mirrors the V002 backfill +-- pattern — direct INSERT into chunks_fts bypasses chunks_ai trigger +-- (trigger fires on chunks INSERT, not chunks_fts INSERT), so no +-- double-insert. Refinery runs V007 exactly once via its bookkeeping +-- table, so this is naturally idempotent across restarts. +INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text) + SELECT chunk_id, doc_id, heading_path_json, text FROM chunks;