feat(p2-1): chunks_fts virtual table + sync triggers (V002 migration)
Adds FTS5 lexical index for chunks per design §5.5: chunks_fts virtual table (unicode61 remove_diacritics 2 tokenizer, contentless w/ UNINDEXED chunk_id+doc_id) plus chunks_ai/chunks_ad/chunks_au triggers that mirror every chunks mutation into chunks_fts inside the host transaction. V002 ships the verbatim §5.5 SQL block plus a one-shot backfill INSERT so existing P1 databases gain searchability without re-ingest. Refinery bookkeeping makes double-apply naturally idempotent. Adds rebuild_chunks_fts(&Connection) escape hatch for kb index --rebuild-fts (CLI wiring deferred to a later task). Uses SAVEPOINT instead of Transaction so callers can invoke from inside an outer transaction; WAL serializes writers so no DELETE/INSERT race vs. concurrent chunks mutators is possible. Tests (10): V001-only → V002 cold-upgrade backfill (literal path), chunks_ai/ad/au trigger sync, MATCH-token verification, rebuild idempotency, drift recovery, double-run no-op, V002 ↔ design §5.5 verbatim diff guard (anchored extraction from both files), WAL/SHM release on store drop. All 185 workspace tests pass. Allowed deps respected (kb-core, kb-config, rusqlite, refinery — no new deps). FTS query implementation deferred to p2-2 (lexical-retriever). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
73
crates/kb-store-sqlite/src/fts.rs
Normal file
73
crates/kb-store-sqlite/src/fts.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
//! FTS5 maintenance helpers (P2-1).
|
||||
//!
|
||||
//! `chunks_fts` is a regular (content-storing, not contentless) FTS5 virtual table created by
|
||||
//! `migrations/V002__fts.sql` and kept in sync with the `chunks` table by
|
||||
//! the `chunks_ai` / `chunks_ad` / `chunks_au` triggers (design §5.5).
|
||||
//!
|
||||
//! Normal operation needs nothing from this module — every mutation on
|
||||
//! `chunks` propagates automatically inside the host transaction. The
|
||||
//! only entry point exposed here is [`rebuild_chunks_fts`], used as the
|
||||
//! escape hatch for `kb index --rebuild-fts` (wired by `kb-cli` later;
|
||||
//! out of scope for P2-1).
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use rusqlite::Connection;
|
||||
|
||||
/// Wipe `chunks_fts` and repopulate it from `chunks`.
|
||||
///
|
||||
/// Useful when:
|
||||
/// - the FTS index is suspected to have drifted (manual SQL,
|
||||
/// crash-during-migration on a future schema bump, etc.);
|
||||
/// - a tokenizer / schema change ships in a later migration and an
|
||||
/// already-running deployment needs to re-tokenize without re-ingest.
|
||||
///
|
||||
/// The two statements run inside a single transaction so a failure
|
||||
/// between DELETE and INSERT cannot leave `chunks_fts` empty.
|
||||
///
|
||||
/// # Concurrency
|
||||
///
|
||||
/// Caller is expected to hold the `SqliteStore` mutex (or otherwise own
|
||||
/// a private `Connection`); two concurrent rebuilds on the same DB file
|
||||
/// would race the DELETE / INSERT pair. The SAVEPOINT acquires SQLite's
|
||||
/// reserved-write lock at the DELETE; in WAL mode SQLite serializes
|
||||
/// writers, so concurrent INSERTs into `chunks` from another connection
|
||||
/// block until RELEASE — there is no duplicate-FTS-row race. Calling
|
||||
/// from inside a caller-owned transaction is safe; SAVEPOINT nests
|
||||
/// correctly. A panic inside the DELETE/INSERT closure leaks the
|
||||
/// savepoint name on the connection until the connection is dropped;
|
||||
/// that's acceptable because the next caller's `SAVEPOINT
|
||||
/// rebuild_chunks_fts` legally shadows the leaked one.
|
||||
pub fn rebuild_chunks_fts(conn: &Connection) -> Result<()> {
|
||||
// SAVEPOINT (instead of `transaction()`) keeps this function callable
|
||||
// from inside a caller-owned transaction. `&Connection` does not
|
||||
// permit `conn.transaction()` anyway (that needs `&mut Connection`),
|
||||
// so SAVEPOINT is the right primitive here.
|
||||
conn.execute("SAVEPOINT rebuild_chunks_fts", [])
|
||||
.context("open savepoint rebuild_chunks_fts")?;
|
||||
|
||||
let result: Result<()> = (|| {
|
||||
conn.execute("DELETE FROM chunks_fts", [])
|
||||
.context("DELETE FROM chunks_fts")?;
|
||||
conn.execute(
|
||||
"INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
SELECT chunk_id, doc_id, heading_path_json, text FROM chunks",
|
||||
[],
|
||||
)
|
||||
.context("repopulate chunks_fts from chunks")?;
|
||||
Ok(())
|
||||
})();
|
||||
|
||||
match result {
|
||||
Ok(()) => {
|
||||
conn.execute("RELEASE rebuild_chunks_fts", [])
|
||||
.context("release savepoint rebuild_chunks_fts")?;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
// Best-effort rollback; bubble the original error.
|
||||
let _ = conn.execute("ROLLBACK TO rebuild_chunks_fts", []);
|
||||
let _ = conn.execute("RELEASE rebuild_chunks_fts", []);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -13,11 +13,13 @@
|
||||
//! `kb-chunk` may appear as **dev-deps** — see `Cargo.toml` — to drive
|
||||
//! the contract round-trip test off a real Markdown fixture.)
|
||||
|
||||
mod documents;
|
||||
mod error;
|
||||
mod fts;
|
||||
mod jobs;
|
||||
mod schema;
|
||||
mod store;
|
||||
mod documents;
|
||||
mod jobs;
|
||||
|
||||
pub use error::StoreError;
|
||||
pub use fts::rebuild_chunks_fts;
|
||||
pub use store::SqliteStore;
|
||||
|
||||
479
crates/kb-store-sqlite/tests/fts.rs
Normal file
479
crates/kb-store-sqlite/tests/fts.rs
Normal file
@@ -0,0 +1,479 @@
|
||||
//! P2-1 FTS5 schema + trigger + rebuild tests.
|
||||
//!
|
||||
//! Strategy: `chunks_fts` triggers fire off raw SQL on `chunks`, so we
|
||||
//! seed and mutate via direct INSERT/UPDATE/DELETE rather than the full
|
||||
//! `kb-parse-md → kb-normalize → kb-chunk → put_chunks` pipeline. That
|
||||
//! keeps the assertions about trigger behavior independent of any
|
||||
//! upstream crate. The `chunks` rows we produce satisfy NOT NULL on the
|
||||
//! columns required by V001 §5.5; we elide FK pressure on `documents`
|
||||
//! by disabling foreign keys for the test connection (the trigger logic
|
||||
//! we exercise has no `documents` dependency).
|
||||
//!
|
||||
//! Test connections open a fresh side-channel `rusqlite::Connection`
|
||||
//! that bypasses the `SqliteStore` mutex; that's fine because each test
|
||||
//! gets its own tempdir and no concurrent mutator is in flight.
|
||||
|
||||
use kb_store_sqlite::{SqliteStore, rebuild_chunks_fts};
|
||||
use rusqlite::Connection;
|
||||
|
||||
mod common;
|
||||
|
||||
/// Insert a chunks row directly. The triggers will mirror it into
|
||||
/// `chunks_fts` as part of the same statement.
|
||||
fn insert_chunk(
|
||||
conn: &Connection,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
heading_path_json: &str,
|
||||
text: &str,
|
||||
) {
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, NULL, '[]', 0, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![chunk_id, doc_id, text, heading_path_json],
|
||||
)
|
||||
.expect("insert chunk row");
|
||||
}
|
||||
|
||||
fn count(conn: &Connection, table: &str) -> i64 {
|
||||
conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |r| r.get(0))
|
||||
.expect("count")
|
||||
}
|
||||
|
||||
/// Open a fresh side-channel connection with FK enforcement OFF. The
|
||||
/// FTS triggers we test do not touch `documents`, but `chunks` has a
|
||||
/// FK to `documents(doc_id)`; turning FK enforcement off lets us seed
|
||||
/// chunks without first synthesizing a full documents/assets row graph.
|
||||
fn raw_conn_no_fk(env: &common::TestEnv) -> Connection {
|
||||
let conn = Connection::open(env.db_path()).expect("open side conn");
|
||||
conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
|
||||
conn
|
||||
}
|
||||
|
||||
// ── 1. Migration apply: backfill ──────────────────────────────────────
|
||||
|
||||
/// Cold-upgrade path: apply V001 only, seed rows into `chunks` (which has
/// no FTS shadow yet — V001 doesn't create `chunks_fts`), then apply
/// V002's SQL verbatim.
///
/// The V002 backfill INSERT must produce one `chunks_fts` row per
/// pre-existing `chunks` row, and each backfilled row must carry the same
/// `chunk_id`, `doc_id`, heading path and text as its source row.
///
/// The trigger-based mirror (`chunks_ai`) is covered by the §2 tests.
#[test]
fn fts_v002_backfills_existing_chunks() {
    const SEEDED: i64 = 4;
    const HEADING_JSON: &str = "[\"Section\"]";

    let env = common::TestEnv::new();
    let conn = Connection::open(env.db_path()).expect("open db");
    conn.pragma_update(None, "foreign_keys", "OFF").unwrap();

    // 1) Apply V001 only — chunks table exists, chunks_fts does not.
    let v001_sql = include_str!("../../../migrations/V001__init.sql");
    conn.execute_batch(v001_sql).expect("apply V001");

    // Count matching catalog rows instead of treating "query failed" as
    // "table absent": an unrelated SQL error must fail the test, not pass it.
    let fts_tables: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='chunks_fts'",
            [],
            |r| r.get(0),
        )
        .expect("query sqlite_master for chunks_fts");
    assert_eq!(fts_tables, 0, "chunks_fts must not exist under V001 only");

    // 2) Seed pre-existing chunks rows (the V001-shipped state we expect
    //    on a customer DB upgrading from P1 to P2-1).
    let doc_id = "d".repeat(32);
    for i in 0..SEEDED {
        insert_chunk(
            &conn,
            &format!("{:0>32}", i),
            &doc_id,
            HEADING_JSON,
            &format!("seedrow{i} payload"),
        );
    }
    assert_eq!(count(&conn, "chunks"), SEEDED);

    // 3) Apply V002 verbatim — its CREATE VIRTUAL TABLE + triggers + the
    //    final backfill INSERT. The triggers don't fire on this path
    //    (they only fire on chunks INSERT/UPDATE/DELETE); the backfill
    //    INSERT does the work.
    let v002_sql = include_str!("../../../migrations/V002__fts.sql");
    conn.execute_batch(v002_sql).expect("apply V002");

    // 4) Assert: count parity, and every backfilled row mirrors its
    //    chunks row on all four FTS columns.
    assert_eq!(
        count(&conn, "chunks_fts"),
        SEEDED,
        "V002 backfill INSERT must seed one chunks_fts row per chunks row"
    );
    for i in 0..SEEDED {
        let term = format!("seedrow{i}");
        let (chunk_id, fts_doc_id, heading_path, text): (String, String, String, String) = conn
            .query_row(
                "SELECT chunk_id, doc_id, heading_path, text
                 FROM chunks_fts WHERE chunks_fts MATCH ?",
                [&term],
                |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
            )
            .unwrap_or_else(|e| panic!("MATCH {term} must hit backfilled row: {e}"));

        assert_eq!(
            chunk_id,
            format!("{:0>32}", i),
            "backfill must preserve chunk_id mapping"
        );
        assert_eq!(fts_doc_id, doc_id, "backfill must preserve doc_id");
        assert_eq!(
            heading_path, HEADING_JSON,
            "backfill must copy heading_path_json into heading_path"
        );
        assert_eq!(
            text,
            format!("seedrow{i} payload"),
            "backfill must preserve text"
        );
    }
}
|
||||
|
||||
/// Replays the exact backfill statement V002 ships against a fully
/// migrated database.
///
/// Rows are seeded into `chunks` (the triggers mirror them), the FTS rows
/// are then deleted by hand to imitate a not-yet-backfilled index, and
/// the `INSERT … SELECT` is executed again. Afterwards `chunks_fts` must
/// hold exactly as many rows as `chunks`.
#[test]
fn fts_v002_backfill_select_matches_chunks_count() {
    const BACKFILL_SQL: &str = "INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
        SELECT chunk_id, doc_id, heading_path_json, text FROM chunks";

    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let doc_id = "d".repeat(32);
    (0..5).for_each(|i| {
        insert_chunk(
            &conn,
            &format!("{:0>32}", i),
            &doc_id,
            "[]",
            &format!("row {i}"),
        );
    });

    // Empty the index, then run the literal V002 backfill INSERT.
    conn.execute("DELETE FROM chunks_fts", []).unwrap();
    assert_eq!(count(&conn, "chunks_fts"), 0);

    conn.execute(BACKFILL_SQL, []).unwrap();

    let indexed = count(&conn, "chunks_fts");
    let stored = count(&conn, "chunks");
    assert_eq!(indexed, stored);
}
|
||||
|
||||
// ── 2. Trigger sync: INSERT / DELETE / UPDATE ────────────────────────
|
||||
|
||||
/// An INSERT on `chunks` must yield exactly one `chunks_fts` row that a
/// MATCH on a token of the inserted text finds under the same `chunk_id`.
#[test]
fn fts_chunks_ai_trigger_propagates_insert() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(
        &conn,
        &chunk_id,
        &doc_id,
        "[\"Heading\"]",
        "needle in haystack",
    );

    // One mirrored row, and the token search resolves to it.
    assert_eq!(count(&conn, "chunks_fts"), 1);
    let matched = conn
        .query_row(
            "SELECT chunk_id FROM chunks_fts WHERE chunks_fts MATCH 'needle'",
            [],
            |row| row.get::<_, String>(0),
        )
        .expect("MATCH 'needle' must hit");
    assert_eq!(matched, chunk_id);
}
|
||||
|
||||
/// A DELETE on `chunks` must remove the mirrored `chunks_fts` row.
#[test]
fn fts_chunks_ad_trigger_propagates_delete() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "ephemeral");
    assert_eq!(count(&conn, "chunks_fts"), 1);

    conn.execute("DELETE FROM chunks WHERE chunk_id = ?", [&chunk_id])
        .expect("delete chunk");

    let remaining = count(&conn, "chunks_fts");
    assert_eq!(remaining, 0, "chunks_ad must remove the FTS row");
}
|
||||
|
||||
/// An UPDATE of `chunks.text` must re-index the row: the old token stops
/// matching, the new token matches, and the FTS row count stays at one.
#[test]
fn fts_chunks_au_trigger_propagates_update() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "before");

    // Before the update only the original token is indexed.
    assert_eq!(count_match(&conn, "before"), 1);
    assert_eq!(count_match(&conn, "after"), 0);

    conn.execute(
        "UPDATE chunks SET text = ? WHERE chunk_id = ?",
        rusqlite::params!["after rewrite", chunk_id],
    )
    .expect("update chunk text");

    // After the update: still one row, old token gone, new token present.
    assert_eq!(count(&conn, "chunks_fts"), 1);
    let stale_hits = count_match(&conn, "before");
    assert_eq!(stale_hits, 0, "old text must not survive UPDATE");
    let fresh_hits = count_match(&conn, "after");
    assert_eq!(fresh_hits, 1, "new text must be indexed");
}
|
||||
|
||||
fn count_match(conn: &Connection, term: &str) -> i64 {
|
||||
conn.query_row(
|
||||
"SELECT COUNT(*) FROM chunks_fts WHERE chunks_fts MATCH ?",
|
||||
[term],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.expect("count_match")
|
||||
}
|
||||
|
||||
// ── 3. rebuild_chunks_fts ────────────────────────────────────────────
|
||||
|
||||
/// Rebuilding an already-consistent index — once, then a second time —
/// must leave the row count unchanged and every seeded token searchable.
#[test]
fn fts_rebuild_chunks_fts_is_idempotent() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let doc_id = "d".repeat(32);
    for i in 0..3 {
        insert_chunk(
            &conn,
            &format!("{:0>32}", i),
            &doc_id,
            "[]",
            &format!("token{i}"),
        );
    }
    let baseline = count(&conn, "chunks_fts");
    assert_eq!(baseline, 3);

    // Round 1: a rebuild over an in-sync index is a plain round-trip.
    rebuild_chunks_fts(&conn).expect("rebuild 1");
    assert_eq!(count(&conn, "chunks_fts"), baseline);

    // Round 2: repeating it changes nothing either.
    rebuild_chunks_fts(&conn).expect("rebuild 2");
    assert_eq!(count(&conn, "chunks_fts"), baseline);

    // Each seeded token still resolves to exactly one row.
    (0..3).for_each(|i| {
        assert_eq!(count_match(&conn, &format!("token{i}")), 1);
    });
}
|
||||
|
||||
/// With `chunks` populated but `chunks_fts` emptied behind the triggers'
/// back, `rebuild_chunks_fts` must restore the missing FTS row and make
/// its text searchable again.
#[test]
fn fts_rebuild_chunks_fts_recovers_from_drift() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();
    store.run_migrations().unwrap();

    let conn = raw_conn_no_fk(&env);
    let chunk_id = "a".repeat(32);
    let doc_id = "d".repeat(32);
    insert_chunk(&conn, &chunk_id, &doc_id, "[]", "recovered");

    // Simulate drift by wiping the index directly — the failure mode
    // `kb index --rebuild-fts` exists to recover from.
    conn.execute("DELETE FROM chunks_fts", []).unwrap();
    let indexed_after_wipe = count(&conn, "chunks_fts");
    let stored_after_wipe = count(&conn, "chunks");
    assert_eq!(indexed_after_wipe, 0);
    assert_eq!(stored_after_wipe, 1);

    rebuild_chunks_fts(&conn).expect("rebuild");

    assert_eq!(count(&conn, "chunks_fts"), 1);
    assert_eq!(count_match(&conn, "recovered"), 1);
}
|
||||
|
||||
// ── 4. Migration double-apply no-op ──────────────────────────────────
|
||||
|
||||
/// Running the migrations a second time must succeed and leave
/// `chunks_fts` present, queryable and empty.
#[test]
fn fts_double_run_migrations_is_noop() {
    let env = common::TestEnv::new();
    let store = SqliteStore::open(&env.config()).unwrap();

    store.run_migrations().expect("run 1");
    // The repeat call is expected to do nothing: refinery records applied
    // versions in its bookkeeping table.
    store.run_migrations().expect("run 2");

    // The virtual table still answers queries and holds no rows.
    let conn = raw_conn_no_fk(&env);
    let rows = conn
        .query_row("SELECT COUNT(*) FROM chunks_fts", [], |row| {
            row.get::<_, i64>(0)
        })
        .expect("chunks_fts queryable after double-run");
    assert_eq!(rows, 0);
}
|
||||
|
||||
// ── 5. CI diff guard: V002 SQL matches design §5.5 verbatim ──────────
|
||||
|
||||
/// Canonicalize whitespace in a SQL block.
///
/// Leading and trailing whitespace is dropped and every interior run of
/// whitespace (newlines included) becomes one space. The design-doc ↔
/// migration comparison therefore ignores cosmetic differences such as
/// blank-line counts while still catching token-level changes.
fn normalize_ws(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
|
||||
|
||||
/// Extract the §5.5 FTS slice from the design doc: locate the
|
||||
/// `### 5.5 Chunks + FTS5` heading, walk to the next ```sql fenced
|
||||
/// block, then within that block slice from `CREATE VIRTUAL TABLE
|
||||
/// chunks_fts` through the last `END;`. The §5.5 fenced block also
|
||||
/// contains the `chunks` CREATE TABLE — we only want the FTS portion.
|
||||
///
|
||||
/// Failure modes (any of these means the design doc layout drifted —
|
||||
/// the test should fail loud, which is the point):
|
||||
/// - heading missing
|
||||
/// - no ```sql block follows
|
||||
/// - no `CREATE VIRTUAL TABLE chunks_fts` inside that block
|
||||
/// - no `END;` after the virtual-table line
|
||||
fn extract_design_5_5_fts_block() -> String {
|
||||
let doc = include_str!(
|
||||
"../../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md"
|
||||
);
|
||||
let heading_idx = doc
|
||||
.find("### 5.5 Chunks + FTS5")
|
||||
.expect("design doc must contain `### 5.5 Chunks + FTS5` heading");
|
||||
let after_heading = &doc[heading_idx..];
|
||||
|
||||
// Find the opening fence ```sql after the heading.
|
||||
let fence_open_rel = after_heading
|
||||
.find("```sql")
|
||||
.expect("§5.5 must be followed by a ```sql fenced block");
|
||||
// Move past the fence line.
|
||||
let body_start_rel = fence_open_rel
|
||||
+ after_heading[fence_open_rel..]
|
||||
.find('\n')
|
||||
.expect("```sql fence must end with a newline")
|
||||
+ 1;
|
||||
let body = &after_heading[body_start_rel..];
|
||||
let fence_close_rel = body
|
||||
.find("\n```")
|
||||
.expect("§5.5 ```sql block must close with ``` on its own line");
|
||||
let fenced = &body[..fence_close_rel];
|
||||
|
||||
// Within the fenced block, slice from CREATE VIRTUAL TABLE chunks_fts
|
||||
// through the last `END;`.
|
||||
let virt_idx = fenced
|
||||
.find("CREATE VIRTUAL TABLE chunks_fts")
|
||||
.expect("§5.5 fenced block must contain `CREATE VIRTUAL TABLE chunks_fts`");
|
||||
let fts_slice = &fenced[virt_idx..];
|
||||
let last_end = fts_slice
|
||||
.rfind("END;")
|
||||
.expect("§5.5 FTS slice must terminate with `END;`");
|
||||
fts_slice[..last_end + "END;".len()].to_string()
|
||||
}
|
||||
|
||||
/// Extract the §5.5 verbatim block from the V002 migration, between the
|
||||
/// `── §5.5 verbatim block ──` anchor markers the file already carries.
|
||||
fn extract_migration_5_5_verbatim_block() -> String {
|
||||
let migration = include_str!("../../../migrations/V002__fts.sql");
|
||||
// The opening anchor line ends with `── §5.5 verbatim block ─...`.
|
||||
let open_marker = "§5.5 verbatim block";
|
||||
let close_marker = "End §5.5 verbatim block";
|
||||
|
||||
let open_idx = migration
|
||||
.find(open_marker)
|
||||
.expect("V002 must carry the `§5.5 verbatim block` opening anchor");
|
||||
let after_open_line = open_idx
|
||||
+ migration[open_idx..]
|
||||
.find('\n')
|
||||
.expect("opening anchor line must end with a newline")
|
||||
+ 1;
|
||||
|
||||
let close_idx = migration[after_open_line..]
|
||||
.find(close_marker)
|
||||
.expect("V002 must carry the `End §5.5 verbatim block` closing anchor")
|
||||
+ after_open_line;
|
||||
// Walk back from the close marker to the start of its comment line.
|
||||
let close_line_start = migration[..close_idx]
|
||||
.rfind('\n')
|
||||
.map(|n| n + 1)
|
||||
.unwrap_or(0);
|
||||
|
||||
migration[after_open_line..close_line_start].to_string()
|
||||
}
|
||||
|
||||
/// CI diff guard: after whitespace normalization, the anchored §5.5
/// block in `migrations/V002__fts.sql` must equal the FTS slice of the
/// design doc's §5.5. A moved section, renamed heading or edited SQL on
/// either side makes this test fail.
#[test]
fn fts_v002_matches_design_section_5_5_verbatim() {
    const FTS_DDL: &str = "CREATE VIRTUAL TABLE chunks_fts";

    let design = extract_design_5_5_fts_block();
    let migration_block = extract_migration_5_5_verbatim_block();

    // Guard against comparing two unrelated snippets that merely happened
    // to sit next to a marker.
    assert!(
        design.contains(FTS_DDL),
        "design slice must include CREATE VIRTUAL TABLE chunks_fts"
    );
    assert!(
        migration_block.contains(FTS_DDL),
        "migration slice must include CREATE VIRTUAL TABLE chunks_fts"
    );
    assert!(
        design.trim_end().ends_with("END;"),
        "design slice must terminate with END;"
    );

    assert_eq!(
        normalize_ws(&design),
        normalize_ws(&migration_block),
        "V002__fts.sql §5.5 block must match design doc §5.5 verbatim \
         (whitespace-normalized). If you intentionally changed one, \
         update the other in the same commit."
    );
}
|
||||
|
||||
// ── 6. WAL cleanup: drop store before tempdir reaps WAL/SHM ──────────
|
||||
|
||||
/// Mirror the P1-6 pattern: opening + migrating + dropping the store
/// must not strand the database's `-wal` / `-shm` sidecar files such that
/// the tempdir can't be cleaned up.
///
/// After the store and the side-channel connection are dropped, each
/// sidecar must either not exist or be removable, and so must the main
/// database file. On Linux unlink succeeds even with open handles, so
/// this is mostly a portability canary for platforms where an open handle
/// blocks removal (e.g. Windows).
#[test]
fn fts_store_drop_releases_wal_files() {
    let env = common::TestEnv::new();
    let db_path = env.db_path();
    {
        let store = SqliteStore::open(&env.config()).unwrap();
        store.run_migrations().unwrap();
        // Force at least one trigger fire so WAL has content to flush.
        let conn = raw_conn_no_fk(&env);
        insert_chunk(&conn, &"a".repeat(32), &"d".repeat(32), "[]", "x");
        drop(conn);
        drop(store);
    }

    // After the store drops, any remaining WAL/SHM siblings must be
    // removable. If a connection is still open this would fail on
    // platforms with mandatory file locking.
    for suffix in ["-wal", "-shm"] {
        // SQLite names the sidecars by appending the suffix to the full
        // database filename. Build the path the same way instead of
        // assuming a `.sqlite` extension, so the check cannot silently
        // look at a file that never exists.
        let mut sidecar_name = db_path.as_os_str().to_os_string();
        sidecar_name.push(suffix);
        let p = std::path::PathBuf::from(sidecar_name);

        if p.exists() {
            std::fs::remove_file(&p).unwrap_or_else(|e| {
                panic!(
                    "WAL/SHM sibling {} should be removable after store drop: {e}",
                    p.display()
                )
            });
        }
    }

    // The main DB file should likewise be removable.
    if db_path.exists() {
        std::fs::remove_file(&db_path)
            .expect("main DB file should be removable after store drop");
    }
}
|
||||
47
migrations/V002__fts.sql
Normal file
47
migrations/V002__fts.sql
Normal file
@@ -0,0 +1,47 @@
|
||||
-- V002__fts.sql — FTS5 virtual table + sync triggers.
|
||||
--
|
||||
-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
|
||||
-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
|
||||
-- VERBATIM from `docs/superpowers/specs/2026-04-27-kb-final-form-design.md`
|
||||
-- §5.5 lines 866–885; CI diff-checks this against the design doc.
|
||||
--
|
||||
-- Tokenizer choice: `unicode61 remove_diacritics 2` follows the design
|
||||
-- default for P2-1 (Korean morphological tokenizer is a P+ note).
|
||||
--
|
||||
-- Backfill: V001 already shipped the `chunks` table without an FTS
|
||||
-- shadow; on V002 apply we seed `chunks_fts` from the existing rows so
|
||||
-- already-ingested workspaces become searchable without re-ingesting.
|
||||
-- Per design §9 (versioning), V002 is additive: no destructive change
|
||||
-- to V001 tables.
|
||||
|
||||
-- ── §5.5 verbatim block ────────────────────────────────────────────────
|
||||
|
||||
CREATE VIRTUAL TABLE chunks_fts USING fts5(
|
||||
chunk_id UNINDEXED,
|
||||
doc_id UNINDEXED,
|
||||
heading_path,
|
||||
text,
|
||||
tokenize = 'unicode61 remove_diacritics 2'
|
||||
);
|
||||
|
||||
CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
END;
|
||||
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
END;
|
||||
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
|
||||
DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
VALUES (new.chunk_id, new.doc_id, new.heading_path_json, new.text);
|
||||
END;
|
||||
|
||||
-- ── End §5.5 verbatim block ───────────────────────────────────────────
|
||||
|
||||
-- One-shot backfill for existing chunks. The triggers above only fire
|
||||
-- on future mutations; V001 may have left `chunks` populated. Refinery
|
||||
-- runs V002 exactly once via its bookkeeping table, so this INSERT is
-- never replayed across restarts (replaying it by hand would duplicate rows).
|
||||
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
|
||||
SELECT chunk_id, doc_id, heading_path_json, text FROM chunks;
|
||||
Reference in New Issue
Block a user