Files
kebab/crates/kebab-store-sqlite/tests/fts.rs
altair823 b106120e93 feat(fts): add V009 korean morphological tokenizer migration
V007 trigram tokenizer 의 한국어 2자 query 0-hit 한계 (Bug #8) 해소를
위한 V009 migration 추가. unicode61 tokenizer 로 환원 + 한국어 형태소
분해 결과를 별 column `tokenized_korean_text` 에 pre-fill 하는 방식.

- migrations/V009__fts_korean_morphological.sql 신규: column ADD,
  chunks_fts DROP+재정의, 3 trigger CASE expression, backfill INSERT,
  corpus_revision bump.
- design §5.5 갱신: trigram → unicode61 + 형태소 column. CASE
  expression trigger 본문.
- crates/kebab-store-sqlite/tests/fts.rs: V007 verbatim test 를
  V009 source-of-truth 로 rename. v009_bumps_corpus_revision unit
  test 추가.
- store.rs: clippy bool_to_int_with_if + cast_lossless 기존 경고 수정
  (pdf_ocr_events 관련 코드, S1 작업 중 발견).

영어 substring 매칭은 V002 (whole-token only) 로 회귀 — spec §3
Non-Goals + 후속 release notes (v0.20.1) 에서 정직히 기술.

Spec: docs/superpowers/specs/2026-05-28-v0.20.x-korean-morphological-tokenizer-spec.md
Plan: docs/superpowers/plans/2026-05-28-v0.20.x-korean-morphological-tokenizer-plan.md (S1)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 09:48:46 +00:00

606 lines
23 KiB
Rust

//! P2-1 FTS5 schema + trigger + rebuild tests.
//!
//! Strategy: `chunks_fts` triggers fire off raw SQL on `chunks`, so we
//! seed and mutate via direct INSERT/UPDATE/DELETE rather than the full
//! `kb-parse-md → kb-normalize → kb-chunk → put_chunks` pipeline. That
//! keeps the assertions about trigger behavior independent of any
//! upstream crate. The `chunks` rows we produce satisfy NOT NULL on the
//! columns required by V001 §5.5; we elide FK pressure on `documents`
//! by disabling foreign keys for the test connection (the trigger logic
//! we exercise has no `documents` dependency).
//!
//! Test connections open a fresh side-channel `rusqlite::Connection`
//! that bypasses the `SqliteStore` mutex; that's fine because each test
//! gets its own tempdir and no concurrent mutator is in flight.
use kebab_store_sqlite::{SqliteStore, rebuild_chunks_fts};
use rusqlite::Connection;
mod common;
/// Seed one row into `chunks` with raw SQL.
///
/// Only the identifiers, heading path and text are bound from the
/// arguments; every other column receives a fixed placeholder literal.
/// The `chunks` triggers mirror the row into `chunks_fts` as part of the
/// same statement, so no separate FTS write is needed.
///
/// Panics if the INSERT fails.
fn insert_chunk(
    conn: &Connection,
    chunk_id: &str,
    doc_id: &str,
    heading_path_json: &str,
    text: &str,
) {
    // Placeholder order: chunk_id, doc_id, text, heading_path_json.
    const INSERT_SQL: &str = "INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, ?, ?, NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')";

    conn.execute(
        INSERT_SQL,
        rusqlite::params![chunk_id, doc_id, text, heading_path_json],
    )
    .expect("insert chunk row");
}
/// Return the number of rows in `table`.
///
/// The table name is interpolated into the SQL text as-is, so callers
/// must pass only hard-coded, trusted identifiers. Panics if the query
/// fails.
fn count(conn: &Connection, table: &str) -> i64 {
    let sql = format!("SELECT COUNT(*) FROM {table}");
    let total = conn.query_row(&sql, [], |row| row.get(0));
    total.expect("count")
}
/// Open a second connection to the test database with foreign-key
/// enforcement switched off.
///
/// `chunks` carries a FK to `documents(doc_id)`; disabling enforcement
/// lets the tests seed `chunks` rows without first building a matching
/// documents/assets graph. The FTS triggers under test do not touch
/// `documents`.
///
/// Panics if the database cannot be opened or the pragma cannot be set.
fn raw_conn_no_fk(env: &common::TestEnv) -> Connection {
    let side = Connection::open(env.db_path()).expect("open side conn");
    side.pragma_update(None, "foreign_keys", "OFF").unwrap();
    side
}
// ── 1. Migration apply: backfill ──────────────────────────────────────
/// Cold-upgrade path: apply V001 only, seed rows into `chunks` (no FTS
/// shadow exists yet), then apply V002's SQL verbatim.
///
/// Asserts that the V002 backfill INSERT produces exactly one
/// `chunks_fts` row per pre-existing `chunks` row, and that a MATCH on
/// each seeded term returns the `chunk_id` of the row that carries it.
/// The trigger-based mirror (`chunks_ai`) is covered by the §2 tests.
#[test]
fn fts_v002_backfills_existing_chunks() {
    const N: i64 = 4;

    let env = common::TestEnv::new();
    let conn = Connection::open(env.db_path()).expect("open db");
    conn.pragma_update(None, "foreign_keys", "OFF").unwrap();

    // 1) Apply V001 only — chunks table exists, chunks_fts does not.
    let v001_sql = include_str!("../../../migrations/V001__init.sql");
    conn.execute_batch(v001_sql).expect("apply V001");

    // Count matching catalog rows instead of treating *any* query error
    // as "table absent": a broken catalog query must fail the test
    // rather than satisfy the precondition by accident.
    let fts_tables: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='chunks_fts'",
            [],
            |r| r.get(0),
        )
        .expect("sqlite_master must be queryable after V001");
    assert_eq!(fts_tables, 0, "chunks_fts must not exist under V001 only");

    // 2) Seed pre-existing chunks rows (the V001-shipped state expected
    //    on a DB upgrading from P1 to P2-1).
    let doc_id = "d".repeat(32);
    for i in 0..N {
        let cid = format!("{i:0>32}");
        insert_chunk(
            &conn,
            &cid,
            &doc_id,
            "[\"Section\"]",
            &format!("seedrow{i} payload"),
        );
    }
    assert_eq!(count(&conn, "chunks"), N);

    // 3) Apply V002 verbatim — CREATE VIRTUAL TABLE + triggers + the
    //    final backfill INSERT. The triggers only fire on later
    //    INSERT/UPDATE/DELETE against chunks, so the rows seeded above
    //    can only reach chunks_fts through the backfill INSERT.
    let v002_sql = include_str!("../../../migrations/V002__fts.sql");
    conn.execute_batch(v002_sql).expect("apply V002");

    // 4) Count parity, then per-row chunk_id mapping via MATCH.
    assert_eq!(
        count(&conn, "chunks_fts"),
        N,
        "V002 backfill INSERT must seed one chunks_fts row per chunks row"
    );
    for i in 0..N {
        let cid = format!("{i:0>32}");
        let term = format!("seedrow{i}");
        let hit: String = conn
            .query_row(
                "SELECT chunk_id FROM chunks_fts WHERE chunks_fts MATCH ?",
                [&term],
                |r| r.get(0),
            )
            .unwrap_or_else(|_| panic!("MATCH {term} must hit backfilled row"));
        assert_eq!(hit, cid, "backfill must preserve chunk_id mapping");
    }
}
/// Exercise the V002 backfill INSERT statement in isolation.
///
/// All migrations are applied and a few `chunks` rows are seeded; the
/// FTS rows the triggers produced are then deleted and the literal
/// backfill INSERT shipped in V002 is re-run. Afterwards `chunks_fts`
/// must hold as many rows as `chunks`.
#[test]
fn fts_v002_backfill_select_matches_chunks_count() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let doc_id = "d".repeat(32);
    (0..5).for_each(|i| {
        let chunk_id = format!("{i:0>32}");
        let body = format!("row {i}");
        insert_chunk(&conn, &chunk_id, &doc_id, "[]", &body);
    });

    // Empty the FTS table so only the backfill statement can refill it.
    conn.execute("DELETE FROM chunks_fts", []).unwrap();
    assert_eq!(count(&conn, "chunks_fts"), 0);

    let backfill = conn.execute(
        "INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
SELECT chunk_id, doc_id, heading_path_json, text FROM chunks",
        [],
    );
    backfill.unwrap();

    assert_eq!(count(&conn, "chunks_fts"), count(&conn, "chunks"));
}
// ── 2. Trigger sync: INSERT / DELETE / UPDATE ────────────────────────
/// An INSERT into `chunks` must surface in `chunks_fts`: exactly one
/// FTS row exists afterwards and a MATCH on a word from the text
/// returns the inserted `chunk_id`.
#[test]
fn fts_chunks_ai_trigger_propagates_insert() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(
        &conn,
        &chunk_id,
        &doc_id,
        "[\"Heading\"]",
        "needle in haystack",
    );

    // One mirrored row, and it is reachable through MATCH.
    assert_eq!(count(&conn, "chunks_fts"), 1);
    let found = conn.query_row(
        "SELECT chunk_id FROM chunks_fts WHERE chunks_fts MATCH 'needle'",
        [],
        |row| row.get::<_, String>(0),
    );
    let found = found.expect("MATCH 'needle' must hit");
    assert_eq!(found, chunk_id);
}
/// A DELETE on `chunks` must remove the mirrored `chunks_fts` row.
#[test]
fn fts_chunks_ad_trigger_propagates_delete() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "ephemeral");
    assert_eq!(count(&conn, "chunks_fts"), 1);

    let deleted = conn.execute("DELETE FROM chunks WHERE chunk_id = ?", [&chunk_id]);
    deleted.expect("delete chunk");

    let remaining = count(&conn, "chunks_fts");
    assert_eq!(remaining, 0, "chunks_ad must remove the FTS row");
}
/// An UPDATE of `chunks.text` must re-index the row: the old term stops
/// matching, the new term matches, and the FTS row count stays at one.
#[test]
fn fts_chunks_au_trigger_propagates_update() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "before");

    // Baseline: only the original text is indexed.
    assert_eq!(count_match(&conn, "before"), 1);
    assert_eq!(count_match(&conn, "after"), 0);

    let updated = conn.execute(
        "UPDATE chunks SET text = ? WHERE chunk_id = ?",
        rusqlite::params!["after rewrite", chunk_id],
    );
    updated.expect("update chunk text");

    // Same single row, now indexed under the new text only.
    assert_eq!(count(&conn, "chunks_fts"), 1);
    let stale_hits = count_match(&conn, "before");
    assert_eq!(stale_hits, 0, "old text must not survive UPDATE");
    let fresh_hits = count_match(&conn, "after");
    assert_eq!(fresh_hits, 1, "new text must be indexed");
}
/// Return how many `chunks_fts` rows match `term`, which is bound as the
/// right-hand side of `MATCH` and therefore interpreted as an FTS5 query
/// expression. Panics if the query fails.
fn count_match(conn: &Connection, term: &str) -> i64 {
    const MATCH_COUNT_SQL: &str = "SELECT COUNT(*) FROM chunks_fts WHERE chunks_fts MATCH ?";
    let hits = conn.query_row(MATCH_COUNT_SQL, [term], |row| row.get(0));
    hits.expect("count_match")
}
// ── 3. rebuild_chunks_fts ────────────────────────────────────────────
/// Running `rebuild_chunks_fts` on an already consistent index, twice in
/// a row, must leave the row count unchanged and keep every seeded term
/// searchable.
#[test]
fn fts_rebuild_chunks_fts_is_idempotent() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let doc_id = "d".repeat(32);
    for i in 0..3 {
        let chunk_id = format!("{i:0>32}");
        let body = format!("token{i}");
        insert_chunk(&conn, &chunk_id, &doc_id, "[]", &body);
    }
    let before = count(&conn, "chunks_fts");
    assert_eq!(before, 3);

    // Two consecutive rebuilds; each must round-trip to the same count.
    for label in ["rebuild 1", "rebuild 2"] {
        rebuild_chunks_fts(&conn).expect(label);
        assert_eq!(count(&conn, "chunks_fts"), before);
    }

    // The rebuilt index still answers MATCH for every seeded token.
    (0..3).for_each(|i| {
        let term = format!("token{i}");
        assert_eq!(count_match(&conn, &term), 1);
    });
}
/// `rebuild_chunks_fts` must repopulate `chunks_fts` from `chunks` after
/// the FTS rows have been lost.
#[test]
fn fts_rebuild_chunks_fts_recovers_from_drift() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "recovered");

    // Simulate drift by emptying the FTS table by hand — the failure
    // mode `kb index --rebuild-fts` exists to recover from.
    conn.execute("DELETE FROM chunks_fts", []).unwrap();
    let fts_rows = count(&conn, "chunks_fts");
    let chunk_rows = count(&conn, "chunks");
    assert_eq!(fts_rows, 0);
    assert_eq!(chunk_rows, 1);

    rebuild_chunks_fts(&conn).expect("rebuild");

    assert_eq!(count(&conn, "chunks_fts"), 1);
    assert_eq!(count_match(&conn, "recovered"), 1);
}
// ── 4. Migration double-apply no-op ──────────────────────────────────
/// Calling `run_migrations` a second time must succeed, and the
/// `chunks_fts` virtual table must still be present, queryable and
/// empty afterwards.
#[test]
fn fts_double_run_migrations_is_noop() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();

    // The second pass is expected to be a no-op: the migration
    // bookkeeping table already records every applied version.
    for attempt in ["run 1", "run 2"] {
        store.run_migrations().expect(attempt);
    }

    let conn = raw_conn_no_fk(&env);
    let rows = conn.query_row("SELECT COUNT(*) FROM chunks_fts", [], |row| {
        row.get::<_, i64>(0)
    });
    let rows = rows.expect("chunks_fts queryable after double-run");
    assert_eq!(rows, 0);
}
// ── 5. CI diff guard: V009 SQL matches design §5.5 verbatim ──────────
/// Collapse all whitespace in `s`: leading and trailing whitespace is
/// dropped and every interior run (newlines included) becomes a single
/// space. Used so the design-doc ↔ migration comparison ignores cosmetic
/// drift such as blank-line counts while still catching token changes.
fn normalize_ws(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so a non-empty buffer means at least
        // one word has already been written and a separator is due.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Pull the FTS portion of §5.5 out of the design document.
///
/// Steps: find the `### 5.5 Chunks + FTS5` heading, take the first
/// ```sql fenced block after it, and within that block return the text
/// from `CREATE VIRTUAL TABLE chunks_fts` through the last `END;`
/// (inclusive). The fenced block also holds the `chunks` CREATE TABLE,
/// which is deliberately left out.
///
/// Panics — loudly, by design — when the document layout has drifted:
/// - the heading is missing,
/// - no ```sql block follows it,
/// - the block has no `CREATE VIRTUAL TABLE chunks_fts`,
/// - no `END;` follows the virtual-table line.
fn extract_design_5_5_fts_block() -> String {
    let doc = include_str!("../../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md");

    // Everything from the §5.5 heading onwards.
    let section = doc
        .find("### 5.5 Chunks + FTS5")
        .map(|at| &doc[at..])
        .expect("design doc must contain `### 5.5 Chunks + FTS5` heading");

    // Everything from the opening ```sql fence onwards.
    let from_fence = section
        .find("```sql")
        .map(|at| &section[at..])
        .expect("§5.5 must be followed by a ```sql fenced block");

    // Drop the remainder of the fence line itself.
    let (_, sql_onwards) = from_fence
        .split_once('\n')
        .expect("```sql fence must end with a newline");

    // Cut at the closing fence.
    let sql_block = sql_onwards
        .find("\n```")
        .map(|at| &sql_onwards[..at])
        .expect("§5.5 ```sql block must close with ``` on its own line");

    // Keep only CREATE VIRTUAL TABLE chunks_fts … last `END;`.
    let fts_onwards = sql_block
        .find("CREATE VIRTUAL TABLE chunks_fts")
        .map(|at| &sql_block[at..])
        .expect("§5.5 fenced block must contain `CREATE VIRTUAL TABLE chunks_fts`");
    let end_exclusive = fts_onwards
        .rfind("END;")
        .map(|at| at + "END;".len())
        .expect("§5.5 FTS slice must terminate with `END;`");

    fts_onwards[..end_exclusive].to_owned()
}
/// Return the text the V009 migration carries between its
/// `§5.5 verbatim block` and `End §5.5 verbatim block` anchor comments.
///
/// The result starts on the line after the opening anchor and stops
/// just before the line that holds the closing anchor. V009 is the
/// source of truth here: it replaces V007's trigram tokenizer with
/// unicode61 plus CASE-expression triggers for Korean morphological
/// tokenization, while V007 stays in the tree for historical
/// cold-upgrade replay.
///
/// Panics if either anchor is missing or the opening anchor line has no
/// trailing newline.
fn extract_migration_5_5_verbatim_block() -> String {
    const OPEN_ANCHOR: &str = "§5.5 verbatim block";
    const CLOSE_ANCHOR: &str = "End §5.5 verbatim block";

    let migration = include_str!("../../../migrations/V009__fts_korean_morphological.sql");

    // The opening anchor line ends with `── §5.5 verbatim block ─...`;
    // skip to the first character of the following line.
    let anchor_at = migration
        .find(OPEN_ANCHOR)
        .expect("V009 must carry the `§5.5 verbatim block` opening anchor");
    let (_, after_open_line) = migration[anchor_at..]
        .split_once('\n')
        .expect("opening anchor line must end with a newline");

    let close_at = after_open_line
        .find(CLOSE_ANCHOR)
        .expect("V009 must carry the `End §5.5 verbatim block` closing anchor");

    // Back up from the closing anchor to the start of its comment line.
    let block_end = after_open_line[..close_at]
        .rfind('\n')
        .map_or(0, |newline| newline + 1);

    after_open_line[..block_end].to_owned()
}
/// CI diff guard: the §5.5 block inside
/// `migrations/V009__fts_korean_morphological.sql` must equal the design
/// document's §5.5 FTS block once whitespace is normalized.
///
/// V009 (2026-05-28) replaced V007's trigram tokenizer with unicode61
/// plus CASE-expression triggers for Korean morphological tokenization.
/// V007 remains for historical replay of cold-upgrade paths but is no
/// longer compared against the design doc; V009 is the source of truth.
#[test]
fn fts_v009_matches_design_section_5_5_verbatim() {
    const VIRTUAL_TABLE_DDL: &str = "CREATE VIRTUAL TABLE chunks_fts";

    let design = extract_design_5_5_fts_block();
    let migration_block = extract_migration_5_5_verbatim_block();

    // Sanity checks: both slices must really be the §5.5 FTS block and
    // not an unrelated snippet that happened to contain a marker.
    assert!(
        design.contains(VIRTUAL_TABLE_DDL),
        "design slice must include CREATE VIRTUAL TABLE chunks_fts"
    );
    assert!(
        migration_block.contains(VIRTUAL_TABLE_DDL),
        "migration slice must include CREATE VIRTUAL TABLE chunks_fts"
    );
    assert!(
        design.trim_end().ends_with("END;"),
        "design slice must terminate with END;"
    );

    assert_eq!(
        normalize_ws(&design),
        normalize_ws(&migration_block),
        "V009__fts_korean_morphological.sql §5.5 block must match design doc §5.5 verbatim \
         (whitespace-normalized). If you intentionally changed one, \
         update the other in the same commit."
    );
}
// ── 5b. V009 corpus_revision bump ────────────────────────────────────
/// Check that the migrations leave `corpus_revision` bumped.
///
/// After `SqliteStore::open` + `run_migrations`, `corpus_revision` must
/// be at least 1: V004 seeds `'0'` and V009 updates it to
/// `CAST(CAST('0' AS INTEGER) + 1 AS TEXT)`, i.e. `'1'`. The assertion
/// is a lower bound over the whole migration chain, so it does not
/// single out V009 as the migration that performed the bump.
#[test]
fn v009_bumps_corpus_revision() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let rev = store.corpus_revision();
    assert!(
        rev >= 1,
        "corpus_revision must be ≥ 1 after V009 migration \
         (V004 seeds 0, V009 bumps to ≥ 1); got {rev}"
    );
}
// ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ──────────
/// Mirror the P1-6 pattern: opening, migrating and dropping the store
/// must not strand the database's `-wal` / `-shm` sibling files in a way
/// that blocks tempdir cleanup.
///
/// After the store and the side-channel connection are dropped, each
/// sibling must either not exist or be removable, and so must the main
/// database file. On Linux an unlink succeeds even with open handles,
/// so this is mostly a portability canary for platforms with mandatory
/// file locking (e.g. Windows).
#[test]
fn fts_store_drop_releases_wal_files() {
    let env = common::TestEnv::new();
    let db_path = env.db_path();
    {
        let store = SqliteStore::open(&env.config()).unwrap();
        store.run_migrations().unwrap();
        // Force at least one trigger fire so WAL has content to flush.
        let conn = raw_conn_no_fk(&env);
        insert_chunk(&conn, &"a".repeat(32), &"d".repeat(32), "[]", "x");
        drop(conn);
        drop(store);
    }

    // SQLite names the siblings by appending the suffix to the full
    // database file name. Build the path the same way instead of
    // swapping in a hard-coded `sqlite` extension: with a differently
    // named database file the old derivation would point at paths that
    // never exist and the check below would silently be skipped.
    for suffix in ["-wal", "-shm"] {
        let mut sibling = db_path.as_os_str().to_os_string();
        sibling.push(suffix);
        let p = std::path::PathBuf::from(sibling);
        if p.exists() {
            std::fs::remove_file(&p).unwrap_or_else(|e| {
                panic!(
                    "WAL/SHM sibling {} should be removable after store drop: {e}",
                    p.display()
                )
            });
        }
    }

    // The main DB file should likewise be removable.
    if db_path.exists() {
        std::fs::remove_file(&db_path).expect("main DB file should be removable after store drop");
    }
}
// ── 7. Trigram tokenizer behavior (V007) — Korean + English ──────────
/// Pins how the V007 trigram tokenizer matches Korean substrings of
/// three or more consecutive characters (behaviour checked against
/// sqlite 3.45.1 during Codex review rounds 1/2):
/// - a raw query of 3+ characters without whitespace hits as a substring;
/// - a raw query containing whitespace is split by FTS5 at token
///   boundaries, so it yields 0 hits when both pieces are shorter than
///   3 characters;
/// - a quoted phrase (`"..."`, whitespace included) is matched as one
///   substring.
///
/// NOTE(review): this test runs against the fully migrated schema, and
/// other comments in this file state that V009 replaces the trigram
/// tokenizer with unicode61. Under a whole-token tokenizer several of
/// these substring expectations would presumably stop holding — confirm
/// against V009 whether this test is still valid or must be re-pinned.
#[test]
fn fts_trigram_korean_3char_substring_hits() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    insert_chunk(
        &conn,
        &"k".repeat(32),
        &"d".repeat(32),
        "[]",
        "해시 충돌은 키와 값을 매핑할 때 발생한다",
    );

    // (query, expected hit count, failure message), checked in order.
    let cases: [(&str, i64, &str); 5] = [
        // Raw 3+ character runs without whitespace → hit.
        ("충돌은", 1, "raw 3-char 공백 없는 substring '충돌은' must hit"),
        ("발생한", 1, "raw 3-char 공백 없는 substring '발생한' must hit"),
        // Quoted phrases (whitespace included) → substring hit.
        (
            "\"해시 충돌\"",
            1,
            "quoted whole phrase '해시 충돌' (5 chars including space)",
        ),
        ("\"시 충\"", 1, "quoted phrase '시 충' across the space boundary"),
        // No whitespace, but the run does not occur in the source → 0 hits.
        ("해시충", 0, "원문에 공백 없는 '해시충' trigram 이 없으므로 0-hit"),
    ];
    for (query, expected, why) in cases {
        assert_eq!(count_match(&conn, query), expected, "{why}");
    }
}
/// Pins the core V007 trigram constraint: a query shorter than three
/// Unicode characters has no index unit and therefore always returns
/// zero hits. Per design §3.4 and the product decision, the lexical
/// core reports a plain 0-hit and the CLI/TUI wrapper prints the
/// guidance message. Intended as a regression canary for tokenizer
/// changes.
///
/// NOTE(review): other comments in this file state that V009 replaces
/// the trigram tokenizer with unicode61; the zero-hit expectations below
/// presumably still hold for raw-inserted text under a whole-token
/// tokenizer, but confirm against V009.
#[test]
fn fts_trigram_korean_short_query_zero_hit_pinned() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);
    insert_chunk(
        &conn,
        &"k".repeat(32),
        &"d".repeat(32),
        "[]",
        "해시 충돌은 키와 값을 매핑할 때 발생한다",
    );

    // 2-character Korean query — the key case reported from dogfooding
    // ('충돌' / '값').
    assert_eq!(count_match(&conn, "충돌"), 0, "2-char Korean query");

    // 1-character Korean query. This must be an actual character: an
    // empty MATCH string is rejected by FTS5 as a syntax error, which
    // would make `count_match` panic instead of reporting zero hits.
    assert_eq!(count_match(&conn, "값"), 0, "1-char Korean query");
}
/// Pins that the V007 trigram tokenizer also matches English by
/// substring — higher recall, lower word-boundary precision — making
/// the behaviour change described in design §3.4 explicit.
///
/// NOTE(review): this test runs against the fully migrated schema, and
/// other comments in this file state that V009 replaces the trigram
/// tokenizer with unicode61. Under a whole-token tokenizer the substring
/// hits asserted here would presumably become zero — confirm against
/// V009 whether this test is still valid or must be re-pinned.
#[test]
fn fts_trigram_english_substring_hits() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();
    let conn = raw_conn_no_fk(&env);

    insert_chunk(
        &conn,
        &"e".repeat(32),
        &"d".repeat(32),
        "[]",
        "the tokenizer normalizes whitespace before matching",
    );

    // (query, expected hit count, failure message), checked in order.
    let cases: [(&str, i64, &str); 3] = [
        // Substrings of 'tokenizer' are found.
        ("token", 1, "substring of 'tokenizer' — trigram recall"),
        ("izer", 1, "substring of 'tokenizer'"),
        // The 3-character minimum applies to English as well.
        ("to", 0, "2-char English query"),
    ];
    for (query, expected, why) in cases {
        assert_eq!(count_match(&conn, query), expected, "{why}");
    }
}