feat(p10-r2): V007 trigram migration + design §5.5 + fts diff-check
Task A2 + A3 한 묶음. migrations/V007__fts_trigram.sql 신규: - chunks_fts shadow 를 DROP + 재생성 (tokenize = trigram). - chunks_ai/ad/au trigger 재생성 (V002 와 동일). - chunks 에서 backfill INSERT — 사용자 re-ingest 불필요, V007 자동. - V002 는 historical cold-upgrade replay 위해 그대로 유지. design §5.5 갱신: - verbatim block 의 tokenize 만 trigram 으로 교체. - §5.5 본문 상단에 한국어 채택 사유 + trade-off (영어 lexical 변경, BM25 분포, 디스크 ~2-10x, contentless 아님) prose 한 단락 추가. crates/kebab-store-sqlite/tests/fts.rs: - fts_v002_matches_design_section_5_5_verbatim → fts_v007_matches_design_section_5_5_verbatim 으로 rename. - extract_migration_5_5_verbatim_block() 의 include_str! path 를 V007__fts_trigram.sql 로 변경. 주석/assertion msg V007 로. - V002 cold-upgrade test 들 (fts_v002_backfill_*) 은 그대로 유지. 검증: cargo test -p kebab-store-sqlite --test fts → 10/10 PASS (`fts_v007_matches_design_section_5_5_verbatim` 포함). Codex round 1/2 의 design §5.5 contentless 정정·trigram tokenizer 채택 사유 명시 발견 반영. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -370,17 +370,19 @@ fn extract_design_5_5_fts_block() -> String {
|
||||
fts_slice[..last_end + "END;".len()].to_string()
|
||||
}
|
||||
|
||||
/// Extract the §5.5 verbatim block from the V002 migration, between the
|
||||
/// `── §5.5 verbatim block ──` anchor markers the file already carries.
|
||||
/// Extract the §5.5 verbatim block from the V007 migration (which replaced
|
||||
/// V002's unicode61 tokenizer with trigram — V002 stays in place for
|
||||
/// historical cold-upgrade replay but V007 is now the source of truth),
|
||||
/// between the `── §5.5 verbatim block ──` anchor markers V007 carries.
|
||||
fn extract_migration_5_5_verbatim_block() -> String {
|
||||
let migration = include_str!("../../../migrations/V002__fts.sql");
|
||||
let migration = include_str!("../../../migrations/V007__fts_trigram.sql");
|
||||
// The opening anchor line ends with `── §5.5 verbatim block ─...`.
|
||||
let open_marker = "§5.5 verbatim block";
|
||||
let close_marker = "End §5.5 verbatim block";
|
||||
|
||||
let open_idx = migration
|
||||
.find(open_marker)
|
||||
.expect("V002 must carry the `§5.5 verbatim block` opening anchor");
|
||||
.expect("V007 must carry the `§5.5 verbatim block` opening anchor");
|
||||
let after_open_line = open_idx
|
||||
+ migration[open_idx..]
|
||||
.find('\n')
|
||||
@@ -389,7 +391,7 @@ fn extract_migration_5_5_verbatim_block() -> String {
|
||||
|
||||
let close_idx = migration[after_open_line..]
|
||||
.find(close_marker)
|
||||
.expect("V002 must carry the `End §5.5 verbatim block` closing anchor")
|
||||
.expect("V007 must carry the `End §5.5 verbatim block` closing anchor")
|
||||
+ after_open_line;
|
||||
// Walk back from the close marker to the start of its comment line.
|
||||
let close_line_start = migration[..close_idx]
|
||||
@@ -400,12 +402,14 @@ fn extract_migration_5_5_verbatim_block() -> String {
|
||||
migration[after_open_line..close_line_start].to_string()
|
||||
}
|
||||
|
||||
/// CI diff guard: the §5.5 block in `migrations/V002__fts.sql` must
|
||||
/// match the design doc verbatim (whitespace-normalized). If the
|
||||
/// design doc moves the section, renames the heading, or edits the
|
||||
/// SQL, this test fails first. Same for migration drift.
|
||||
/// CI diff guard: the §5.5 block in `migrations/V007__fts_trigram.sql`
|
||||
/// must match the design doc verbatim (whitespace-normalized). V007
|
||||
/// replaced V002's unicode61 tokenizer with trigram (2026-05-23).
|
||||
/// V002 stays in place for historical replay of cold-upgrade paths
|
||||
/// but is no longer compared against the design doc — V007 is now
|
||||
/// the source of truth.
|
||||
#[test]
|
||||
fn fts_v002_matches_design_section_5_5_verbatim() {
|
||||
fn fts_v007_matches_design_section_5_5_verbatim() {
|
||||
let design = extract_design_5_5_fts_block();
|
||||
let migration_block = extract_migration_5_5_verbatim_block();
|
||||
|
||||
@@ -428,7 +432,7 @@ fn fts_v002_matches_design_section_5_5_verbatim() {
|
||||
let migration_n = normalize_ws(&migration_block);
|
||||
assert_eq!(
|
||||
design_n, migration_n,
|
||||
"V002__fts.sql §5.5 block must match design doc §5.5 verbatim \
|
||||
"V007__fts_trigram.sql §5.5 block must match design doc §5.5 verbatim \
|
||||
(whitespace-normalized). If you intentionally changed one, \
|
||||
update the other in the same commit."
|
||||
);
|
||||
|
||||
@@ -71,7 +71,7 @@ Codex 리뷰로 현재 `build_match_string()` (lexical.rs:177) 이 trigram 비
|
||||
- Create: `migrations/V007__fts_trigram.sql`
|
||||
- Read: `migrations/V002__fts.sql` (trigger 본문 verbatim 복사용)
|
||||
|
||||
- [ ] **Step 1: V007 작성** — 아래 내용으로 생성. 컬럼 구성은 V002 와 동일, `tokenize` 만 교체. trigger 본문은 V002 와 동일.
|
||||
- [x] **Step 1: V007 작성** — 아래 내용으로 생성. 컬럼 구성은 V002 와 동일, `tokenize` 만 교체. trigger 본문은 V002 와 동일.
|
||||
|
||||
```sql
|
||||
-- V007__fts_trigram.sql
|
||||
@@ -116,7 +116,7 @@ INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
|
||||
> Step 1 전에 `migrations/V002__fts.sql` 의 `CREATE VIRTUAL TABLE` 컬럼 목록과 trigger 본문을 실제로 대조해, 위 SQL 이 V002 와 trigger 본문·컬럼명(`heading_path_json` 등)에서 정확히 일치하는지 확인한다. 다르면 V002 를 source 로 맞춘다.
|
||||
|
||||
- [ ] **Step 2: migration 적용 확인** — `cargo test -p kebab-store-sqlite` 를 돌려 refinery 가 V007 을 무오류로 적용하는지 확인한다. Expected: 컴파일 + 기존 store 테스트 통과 (단 A3 의 diff-check 테스트는 아직 실패 — 다음 task).
|
||||
- [x] **Step 2: migration 적용 확인** — `cargo test -p kebab-store-sqlite` 통과 (10/10 fts tests + 모든 store test PASS). V007 backfill 도 정상 동작.
|
||||
|
||||
### Task A3: design §5.5 verbatim + CI diff-check 갱신
|
||||
|
||||
@@ -124,15 +124,15 @@ INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (§5.5, 라인 ~1024-1043)
|
||||
- Modify: `crates/kebab-store-sqlite/tests/fts.rs` (`fts_v002_matches_design_section_5_5_verbatim`, 라인 ~408-435)
|
||||
|
||||
- [ ] **Step 1: diff-check 테스트 실행(실패 확인)** — `cargo test -p kebab-store-sqlite fts_v002_matches_design` 실행. Expected: FAIL — design §5.5 는 아직 unicode61, V007 은 trigram (또는 테스트가 V002 만 본다면 PASS 인 채로 남아 trigram 을 검증 안 함 — Step 2 에서 V007 로 대상 변경).
|
||||
- [x] **Step 1: diff-check 테스트 baseline 확인** — A2 검증에서 `fts_v002_matches_design_section_5_5_verbatim` 는 PASS (V002 vs design 둘 다 unicode61 시점이라 match). V007 추가 자체는 기존 test 안 깨뜨림.
|
||||
|
||||
- [ ] **Step 2: design §5.5 갱신** — §5.5 의 verbatim SQL 블록의 `tokenize = 'unicode61 remove_diacritics 2'` 를 `tokenize = 'trigram'` 으로 바꾸고, 블록이 V007 의 `CREATE VIRTUAL TABLE` + 3 trigger 와 정확히 일치하도록 맞춘다. §5.5 본문에 한국어 trigram 채택 사유 한 문장 추가(unicode61 의 한국어 한계, HOTFIXES 2026-05-22 cross-link).
|
||||
- [x] **Step 2: design §5.5 갱신** — `tokenize = 'unicode61 remove_diacritics 2'` → `'trigram'`. §5.5 본문 위에 한국어 trigram 채택 사유 + trade-off + "contentless 가 아님" 명시 prose 한 단락 추가.
|
||||
|
||||
- [ ] **Step 3: diff-check 테스트를 V007 대상으로 갱신** — `fts.rs` 의 테스트 함수를 `fts_v007_matches_design_section_5_5_verbatim` 으로 rename, 대조 파일을 `V007__fts_trigram.sql` 로, 기대 design 섹션을 갱신된 §5.5 로 바꾼다. whitespace-normalized 비교 로직은 그대로.
|
||||
- [x] **Step 3: diff-check 테스트를 V007 대상으로 갱신** — `extract_migration_5_5_verbatim_block()` 의 `include_str!` path 를 `V007__fts_trigram.sql` 로, 함수명 `fts_v002_matches_design_section_5_5_verbatim` → `fts_v007_matches_design_section_5_5_verbatim`, assertion msg 갱신.
|
||||
|
||||
- [ ] **Step 4: 테스트 통과 확인** — `cargo test -p kebab-store-sqlite fts_v007_matches_design` → PASS.
|
||||
- [x] **Step 4: 테스트 통과 확인** — `cargo test -p kebab-store-sqlite --test fts` → 10/10 PASS (`fts_v007_matches_design_section_5_5_verbatim` 포함).
|
||||
|
||||
- [ ] **Step 5: Commit** — `git add migrations/V007__fts_trigram.sql crates/kebab-store-sqlite/tests/fts.rs docs/superpowers/specs/` → `git commit` (feat: trigram tokenizer migration + design §5.5).
|
||||
- [ ] **Step 5: Commit** — A2 + A3 한 묶음으로 commit.
|
||||
|
||||
### Task A4: 한국어/영어 trigram 매칭 테스트
|
||||
|
||||
|
||||
@@ -1004,6 +1004,17 @@ CREATE INDEX idx_blocks_doc_id ON blocks(doc_id);
|
||||
|
||||
### 5.5 Chunks + FTS5
|
||||
|
||||
Tokenizer = `trigram` (V007, 2026-05-23). 한국어 어절(조사·어미가 붙은 단위)이
|
||||
unicode61 에서 단일 토큰화돼 lexical 부분 매칭이 불가능했던 문제를 해소
|
||||
(3자 미만 한국어 query 는 trigram 구조상 여전히 0-hit — 단일 토큰 측면에서는
|
||||
회귀 아님, multi-token query 는 `lexical.rs::build_match_string()` 가 whole-phrase
|
||||
후보 OR 결합으로 매칭). trade-off: 영어 lexical 도 substring 매칭으로 이동
|
||||
(recall↑, 단어 경계 정밀도↓), BM25 raw score 분포 변경 (RRF rank 기반 hybrid
|
||||
는 영향 미미), SQLite 파일 크기 ~2-10× 증가. 자세한 내용 = `tasks/HOTFIXES.md`
|
||||
(2026-05-22) + `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`.
|
||||
`chunks_fts` 는 일반 FTS5 shadow table 이며 contentless 가 아님 (V002 / V007
|
||||
DDL 에 `content=''` 없음).
|
||||
|
||||
```sql
|
||||
CREATE TABLE chunks (
|
||||
chunk_id TEXT PRIMARY KEY,
|
||||
@@ -1026,7 +1037,7 @@ CREATE VIRTUAL TABLE chunks_fts USING fts5(
|
||||
doc_id UNINDEXED,
|
||||
heading_path,
|
||||
text,
|
||||
tokenize = 'unicode61 remove_diacritics 2'
|
||||
tokenize = 'trigram'
|
||||
);
|
||||
|
||||
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
|
||||
|
||||
60
migrations/V007__fts_trigram.sql
Normal file
60
migrations/V007__fts_trigram.sql
Normal file
@@ -0,0 +1,60 @@
|
||||
-- V007__fts_trigram.sql — Replace chunks_fts tokenizer: unicode61 → trigram.
|
||||
--
|
||||
-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
|
||||
-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
|
||||
-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
|
||||
-- §5.5; CI diff-checks this against the design doc (test
|
||||
-- `fts_v007_matches_design_section_5_5_verbatim` in
|
||||
-- `crates/kebab-store-sqlite/tests/fts.rs`).
|
||||
--
|
||||
-- Tokenizer choice: trigram. Korean is agglutinative — unicode61 tokenizes
|
||||
-- whole eojeol (조사·어미 attached) so substring matching fails. trigram
|
||||
-- indexes 3-character grams, enabling Korean partial matches. Trade-offs:
|
||||
-- DB size grows (~2-10×), English lexical also moves to substring match
|
||||
-- (recall↑, precision↓), BM25 score distribution shifts. See
|
||||
-- `tasks/HOTFIXES.md` (2026-05-22) and the v0.17.0 design doc
-- (`docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`).
|
||||
--
|
||||
-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no
|
||||
-- `content=''`); this migration drops the old shadow, recreates it with
|
||||
-- the new tokenizer, recreates the sync triggers (verbatim from V002),
|
||||
-- and backfills from `chunks`. The `chunks` table and embeddings are
|
||||
-- untouched, so users do NOT need to re-ingest after upgrading to
|
||||
-- v0.17.0 — the migration is fully automatic.
|
||||
|
||||
DROP TRIGGER IF EXISTS chunks_au;
|
||||
DROP TRIGGER IF EXISTS chunks_ad;
|
||||
DROP TRIGGER IF EXISTS chunks_ai;
|
||||
DROP TABLE IF EXISTS chunks_fts;
|
||||
|
||||
-- ── §5.5 verbatim block ────────────────────────────────────────────────
|
||||
|
||||
CREATE VIRTUAL TABLE chunks_fts USING fts5(
|
||||
chunk_id UNINDEXED,
|
||||
doc_id UNINDEXED,
|
||||
heading_path,
|
||||
text,
|
||||
tokenize = 'trigram'
|
||||
);
|
||||
|
||||
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
END;
|
||||
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
END;
|
||||
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
END;
|
||||
|
||||
-- ── End §5.5 verbatim block ───────────────────────────────────────────
|
||||
|
||||
-- One-shot backfill from existing chunks. Mirrors the V002 backfill
|
||||
-- pattern — direct INSERT into chunks_fts bypasses chunks_ai trigger
|
||||
-- (trigger fires on chunks INSERT, not chunks_fts INSERT), so no
|
||||
-- double-insert. Refinery runs V007 exactly once via its bookkeeping
|
||||
-- table, so this is naturally idempotent across restarts.
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
SELECT chunk_id, doc_id, heading_path_json, text FROM chunks;
|
||||
Reference in New Issue
Block a user