From b335151d18512c32ef4860b245c99d4cc5bedc45 Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 1 May 2026 05:20:35 +0000 Subject: [PATCH] feat(p2-2): kb-search crate + LexicalRetriever (FTS5 + bm25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the first concrete kb_core::Retriever, exercising chunks_fts (P2-1) to answer SearchMode::Lexical queries. Returns Vec<SearchHit> with bm25-derived ranking, snippet() previews, and W3C-fragment-style Citation built from the chunk's first source_spans entry. New crate kb-search: - LexicalRetriever::new(Arc<SqliteStore>, IndexVersion). - search() builds an FTS5 MATCH expression by escaping every whitespace token into a quoted literal (inner " doubled); single-quote-wrapped text passes through verbatim as raw FTS5 syntax. Empty query short-circuits to Ok(vec![]). - bm25 normalization: score = -bm25 / (1 + |bm25|), bounded (0, 1] for any FTS5-returned negative bm25. - Snippet via snippet(chunks_fts, 3, '', '', '…', word_budget) where word_budget = snippet_chars / 4 clamped to [1, 64]; trim_snippet enforces the char cap on the way out (chars per design §6.4 — accepts the combining-mark trade-off). - Citation from chunks.source_spans_json first span: Line / Page / Region / Time forwarded; Byte / empty array fall back to Line{1,1} with a tracing::warn so forward-compat regressions surface. - Filters: tags_any (subquery on document_tags), lang (= column), trust_min (CASE-rank in SQL) all applied at SQL level. path_glob uses globset with literal_separator(true) — guarantees '*' does not cross '/' per spec Risks/notes — applied as Rust post-filter with +128 row over-fetch when set, then rank reassigned 1..k contiguously. - Determinism: ORDER BY score, f.chunk_id (lexicographic blake3 hex tiebreaker on identical bm25). Tested explicitly with two chunks of identical text content. - RetrievalDetail: method=Lexical, both lexical_score and fusion_score set, vector_* None. 
kb-store-sqlite: - Adds pub fn read_conn(&self) -> MutexGuard<'_, Connection>. Read-only contract is doc-only — kb-search needs MutexGuard for prepare_cached + iter, which a closure-scoped wrapper would awkwardly constrain. Closure variant left as a P3 follow-up. Tests (26 new): empty corpus, empty query, single hit + citation round-trip, snippet length cap, tags_any exclusion, lang+trust composition, path_glob with '*' not crossing '/', citation line round-trip, bm25 top-1 ∈ (0, 1], determinism (varied scores AND identical-score tiebreaker), index_version passthrough, snapshot (crates/kb-search/tests/fixtures/search/lexical/run-1.json — stable under bundled SQLite; KB_UPDATE_SNAPSHOTS=1 to regenerate). Workspace: 211 tests pass, cargo clippy --workspace --all-targets -D warnings clean. Allowed deps respected: kb-core, kb-config, kb-store-sqlite, rusqlite, tracing, thiserror, anyhow (forced by trait return type), serde_json (parses *_json TEXT columns), globset (path_glob '*' boundary). Out of scope (deferred): vector retriever (p3-3), hybrid fusion (p3-4), reranker (P+), Korean morphological tokenizer (P+). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 16 + Cargo.toml | 6 + crates/kb-search/Cargo.toml | 22 + crates/kb-search/src/lexical.rs | 649 +++++++++++++++++ crates/kb-search/src/lib.rs | 15 + .../tests/fixtures/search/lexical/run-1.json | 60 ++ crates/kb-search/tests/lexical.rs | 666 ++++++++++++++++++ crates/kb-store-sqlite/src/store.rs | 16 + 8 files changed, 1450 insertions(+) create mode 100644 crates/kb-search/Cargo.toml create mode 100644 crates/kb-search/src/lexical.rs create mode 100644 crates/kb-search/src/lib.rs create mode 100644 crates/kb-search/tests/fixtures/search/lexical/run-1.json create mode 100644 crates/kb-search/tests/lexical.rs diff --git a/Cargo.lock b/Cargo.lock index cecb492..1bd474c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -801,6 +801,22 @@ dependencies = [ "serde", ] +[[package]] +name = "kb-search" +version = "0.1.0" +dependencies = [ + "anyhow", + "globset", + "kb-config", + "kb-core", + "kb-store-sqlite", + "rusqlite", + "serde_json", + "tempfile", + "thiserror 2.0.18", + "tracing", +] + [[package]] name = "kb-source-fs" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index b957c22..951f3ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "crates/kb-normalize", "crates/kb-chunk", "crates/kb-store-sqlite", + "crates/kb-search", "crates/kb-app", "crates/kb-cli", ] @@ -29,3 +30,8 @@ time = { version = "0.3", features = ["serde", "macros", "formatting", " uuid = { version = "1", features = ["v7", "serde"] } blake3 = "1" tracing = "0.1" +# `bundled` ships SQLite source so the workspace doesn't depend on a +# system libsqlite3 (matches the kb-store-sqlite feature set). 
+rusqlite = { version = "0.32", features = ["bundled"] } +globset = "0.4" +tempfile = "3" diff --git a/crates/kb-search/Cargo.toml b/crates/kb-search/Cargo.toml new file mode 100644 index 0000000..d7a49ab --- /dev/null +++ b/crates/kb-search/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "kb-search" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Retriever implementations for kb (P2-2 lexical FTS5; P3 vector / hybrid will follow)" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +kb-store-sqlite = { path = "../kb-store-sqlite" } +rusqlite = { workspace = true } +globset = { workspace = true } +serde_json = { workspace = true } +tracing = { workspace = true } +thiserror = { workspace = true } +anyhow = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/kb-search/src/lexical.rs b/crates/kb-search/src/lexical.rs new file mode 100644 index 0000000..6b06dc1 --- /dev/null +++ b/crates/kb-search/src/lexical.rs @@ -0,0 +1,649 @@ +//! Lexical (FTS5 + bm25) retriever — design §3.7 / §1.5 / §2.2 / §6.4. +//! +//! Owns the SQL pattern documented in `tasks/p2/p2-2-lexical-retriever.md` +//! and constructs `kb_core::SearchHit` values directly from the joined +//! `chunks_fts` / `chunks` / `documents` rows. Reads only — never mutates +//! the underlying SQLite file. 
+ +use std::sync::Arc; + +use anyhow::{Context, Result}; +use globset::GlobMatcher; +use kb_core::{ + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, + Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, + TrustLevel, WorkspacePath, +}; +use kb_store_sqlite::SqliteStore; +use rusqlite::{params_from_iter, Connection, Row, ToSql}; + +// ── Tunables ───────────────────────────────────────────────────────────── + +/// FTS5 hard limit on the `snippet()` `nToken` argument. +/// See SQLite's FTS5 docs: snippet() rejects nToken > 64. +const FTS5_SNIPPET_MAX_WORDS: usize = 64; + +/// Floor for the snippet word budget. `snippet_chars / 4` may yield 0 for +/// pathologically small configs; we always ask FTS5 for at least one word +/// so it can still return something matchable for the test harness. +const FTS5_SNIPPET_MIN_WORDS: usize = 1; + +/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10. +const DEFAULT_K: usize = 10; + +/// When `path_glob` is set we have to over-fetch and post-filter in Rust, +/// because SQLite's GLOB operator treats `*` as "any chars including `/`", +/// which contradicts the design rule that `*` must NOT cross path +/// separators. Empirically `+128` is generous for any realistic workspace +/// and bounded enough to keep memory predictable. +const PATH_GLOB_OVERFETCH: usize = 128; + +// ── Public surface ─────────────────────────────────────────────────────── + +/// Lexical retriever backed by SQLite FTS5 + bm25. +pub struct LexicalRetriever { + store: Arc, + index_version: IndexVersion, + /// Number of `snippet()` words derived from `kb-config::search.snippet_chars`, + /// clamped into `[FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS]`. + snippet_words: usize, + /// Hard cap on the returned snippet's character length per design §6.4. + snippet_chars: usize, +} + +impl LexicalRetriever { + /// Construct with default settings derived from `kb-config`'s defaults. 
+ /// Snippet width is computed from `Config::defaults().search.snippet_chars`. + pub fn new(store: Arc, index_version: IndexVersion) -> Self { + let cfg = kb_config::Config::defaults(); + Self::with_settings(store, index_version, cfg.search.snippet_chars) + } + + /// Construct with explicit `snippet_chars`. Used by tests / callers + /// that have already loaded a `Config`. + pub fn with_settings( + store: Arc, + index_version: IndexVersion, + snippet_chars: usize, + ) -> Self { + // Heuristic: 1 token ≈ 4 chars (English-leaning estimate; Korean + // tokens average shorter, so the cap-by-chars trim below is what + // actually enforces the contract). The `/4` keeps us well below + // FTS5's nToken=64 limit for typical snippet_chars=220 budgets. + let raw = snippet_chars / 4; + let snippet_words = raw.clamp(FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS); + Self { + store, + index_version, + snippet_words, + snippet_chars, + } + } +} + +impl Retriever for LexicalRetriever { + fn search(&self, query: &SearchQuery) -> Result> { + let match_opt = build_match_string(&query.text); + let k = if query.k == 0 { DEFAULT_K } else { query.k }; + let filters = &query.filters; + // One-line summary at request entry. Filter shape only — no + // tag/lang/path values, which could be PII-sensitive. + tracing::debug!( + match_str = match_opt.as_deref().unwrap_or(""), + tags_any = filters.tags_any.len(), + has_lang = filters.lang.is_some(), + has_trust_min = filters.trust_min.is_some(), + has_path_glob = filters.path_glob.is_some(), + k, + "kb-search lexical: search start" + ); + + // Empty / whitespace-only query → nothing to do. Per spec we + // succeed with an empty hit list rather than erroring. + let match_str = match match_opt { + Some(s) => s, + None => return Ok(Vec::new()), + }; + + // Pre-compile the path_glob once. The `Glob` produced rejects + // syntactically invalid patterns at construction time so the + // caller gets a clear error rather than a silent empty result. 
+ let path_matcher = match &filters.path_glob { + Some(g) => Some(compile_glob(g)?), + None => None, + }; + + // Fetch budget: when post-filtering by glob we need to over-fetch + // so that the final `take(k)` still has enough rows after culling. + let fetch_limit = if path_matcher.is_some() { + k.saturating_add(PATH_GLOB_OVERFETCH) + } else { + k + }; + + let conn = self.store.read_conn(); + let raw_rows = run_query( + &conn, + &match_str, + self.snippet_words, + filters, + fetch_limit, + )?; + + let mut hits: Vec = Vec::with_capacity(raw_rows.len().min(k)); + let mut rank: u32 = 0; + for row in raw_rows { + // Path glob is the only filter we evaluate in Rust because the + // semantics differ from SQLite's GLOB (no `/` crossing). + if let Some(m) = &path_matcher { + if !m.is_match(&row.workspace_path) { + continue; + } + } + rank = rank.saturating_add(1); + let hit = build_hit(row, rank, &self.index_version, self.snippet_chars)?; + hits.push(hit); + if hits.len() >= k { + break; + } + } + tracing::debug!(rows = hits.len(), "kb-search lexical: search done"); + Ok(hits) + } + + fn index_version(&self) -> IndexVersion { + self.index_version.clone() + } +} + +// ── Match-string construction ──────────────────────────────────────────── + +/// Translate a user-typed query into an FTS5 match string. +/// +/// Rules (from the task spec): +/// +/// - The query is wrapped in a single pair of `'...'` → strip the quotes +/// and pass the inner text through verbatim. The user has explicitly +/// opted into FTS5 syntax (e.g. `'rust AND cargo'`, `'foo*'`). +/// +/// - Otherwise: split on whitespace, escape every token by wrapping it +/// in `"..."` (FTS5 string literal), with any inner `"` doubled. Join +/// with spaces — FTS5 default operator is implicit AND. +/// +/// - An empty / whitespace-only token list → return `None` (caller +/// short-circuits to `Ok(vec![])`). 
+fn build_match_string(text: &str) -> Option { + let trimmed = text.trim(); + if trimmed.is_empty() { + return None; + } + if let Some(inner) = strip_single_quotes(trimmed) { + let inner_trim = inner.trim(); + if inner_trim.is_empty() { + return None; + } + return Some(inner_trim.to_string()); + } + let tokens: Vec = trimmed + .split_whitespace() + .map(escape_fts5_token) + .collect(); + if tokens.is_empty() { + None + } else { + Some(tokens.join(" ")) + } +} + +/// Return `Some(inner)` if `s` is wrapped in a matching pair of single +/// quotes (`'...'`), otherwise `None`. We require the closing quote to +/// be the last character so `'foo' bar` doesn't accidentally engage +/// raw-FTS5 mode. +fn strip_single_quotes(s: &str) -> Option<&str> { + let bytes = s.as_bytes(); + if bytes.len() >= 2 && bytes[0] == b'\'' && bytes[bytes.len() - 1] == b'\'' { + Some(&s[1..s.len() - 1]) + } else { + None + } +} + +/// FTS5-escape one token by wrapping it in double quotes (FTS5 string +/// literal). Inner `"` are escaped by doubling per FTS5 grammar. This is +/// the simple-and-safe approach that defangs every special character — +/// `(`, `)`, `*`, `^`, `:`, `"`, etc. — without trying to parse FTS5 +/// expressions. +fn escape_fts5_token(tok: &str) -> String { + let mut out = String::with_capacity(tok.len() + 2); + out.push('"'); + for ch in tok.chars() { + if ch == '"' { + out.push('"'); + out.push('"'); + } else { + out.push(ch); + } + } + out.push('"'); + out +} + +// ── SQL execution ──────────────────────────────────────────────────────── + +/// Raw row shape mirroring the columns selected by [`run_query`]. Kept +/// internal — every public path constructs `SearchHit` from this. +struct RawRow { + chunk_id: String, + doc_id: String, + bm25_raw: f64, + snippet: String, + heading_path_json: String, + section_label: Option, + source_spans_json: String, + chunker_version: String, + workspace_path: String, +} + +/// Build + execute the FTS5 query. 
The SQL pattern is the one documented +/// in `tasks/p2/p2-2-lexical-retriever.md` (§Behavior contract). +fn run_query( + conn: &Connection, + match_str: &str, + snippet_words: usize, + filters: &SearchFilters, + fetch_limit: usize, +) -> Result> { + // Build the dynamic SQL + positional parameter vector. Positional `?` + // is used (not named bindings) because the dynamic IN-list for + // `tags_any` is most natural with `params_from_iter`. + let mut sql = String::from( + "SELECT \ + f.chunk_id, f.doc_id, \ + bm25(chunks_fts) AS score, \ + snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \ + c.heading_path_json, c.section_label, c.source_spans_json, \ + c.chunker_version, \ + d.workspace_path \ + FROM chunks_fts f \ + JOIN chunks c ON c.chunk_id = f.chunk_id \ + JOIN documents d ON d.doc_id = f.doc_id", + ); + + let mut params: Vec> = Vec::new(); + // 1) snippet word count. + params.push(Box::new(snippet_words as i64)); + // 2) MATCH expression. + sql.push_str(" WHERE chunks_fts MATCH ?"); + params.push(Box::new(match_str.to_owned())); + + // tags_any: doc must own at least one of the requested tags. + if !filters.tags_any.is_empty() { + sql.push_str( + " AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (", + ); + for (i, tag) in filters.tags_any.iter().enumerate() { + if i > 0 { + sql.push(','); + } + sql.push('?'); + params.push(Box::new(tag.clone())); + } + sql.push_str("))"); + } + if let Some(lang) = &filters.lang { + sql.push_str(" AND d.lang = ?"); + params.push(Box::new(lang.0.clone())); + } + if let Some(trust_min) = &filters.trust_min { + // Mirror `kb_store_sqlite::documents::list_documents` ranking: + // Generated < Secondary < Primary. Doing the rank in SQL + // (rather than post-filtering) keeps the row stream short + // when the workspace contains many low-trust docs. 
+ sql.push_str( + " AND CASE d.trust_level \ + WHEN 'primary' THEN 3 \ + WHEN 'secondary' THEN 2 \ + WHEN 'generated' THEN 1 \ + ELSE 0 \ + END >= ?", + ); + let rank: i64 = match trust_min { + TrustLevel::Primary => 3, + TrustLevel::Secondary => 2, + TrustLevel::Generated => 1, + }; + params.push(Box::new(rank)); + } + // path_glob is intentionally NOT applied here — see module comment + // on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`. + + // Determinism: tie-break on chunk_id so equal bm25 scores produce a + // stable order across runs. `f.chunk_id` is the FTS row's UNINDEXED + // copy of the same value as `c.chunk_id`; either side works. + sql.push_str(" ORDER BY score, f.chunk_id LIMIT ?"); + params.push(Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX))); + + let mut stmt = conn + .prepare(&sql) + .context("kb-search lexical: prepare FTS5 statement")?; + let rows = stmt + .query_map(params_from_iter(params.iter().map(|b| b.as_ref())), row_from_sql) + .context("kb-search lexical: execute FTS5 query")?; + let mut out: Vec = Vec::new(); + for r in rows { + out.push(r.context("kb-search lexical: read row")?); + } + Ok(out) +} + +fn row_from_sql(row: &Row<'_>) -> rusqlite::Result { + Ok(RawRow { + chunk_id: row.get(0)?, + doc_id: row.get(1)?, + bm25_raw: row.get(2)?, + snippet: row.get(3)?, + heading_path_json: row.get(4)?, + section_label: row.get(5)?, + source_spans_json: row.get(6)?, + chunker_version: row.get(7)?, + workspace_path: row.get(8)?, + }) +} + +// ── Hit construction ───────────────────────────────────────────────────── + +fn build_hit( + raw: RawRow, + rank: u32, + index_version: &IndexVersion, + snippet_chars: usize, +) -> Result { + let normalized = normalize_bm25(raw.bm25_raw); + let heading_path: Vec = serde_json::from_str(&raw.heading_path_json) + .context("kb-search lexical: deserialize heading_path_json")?; + let source_spans: Vec = serde_json::from_str(&raw.source_spans_json) + .context("kb-search lexical: 
deserialize source_spans_json")?; + + let workspace_path = WorkspacePath::new(raw.workspace_path) + .context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?; + + let citation = build_citation( + &raw.chunk_id, + workspace_path.clone(), + raw.section_label.clone(), + source_spans.first(), + ); + + // FTS5's snippet() respects the word budget but produces a + // character-length we can't predict precisely (token boundaries vary + // with the tokenizer). The contract caps at `snippet_chars`; trim + // defensively if SQLite ever returns a longer string. + let snippet = trim_snippet(&raw.snippet, snippet_chars); + + Ok(SearchHit { + rank, + chunk_id: ChunkId(raw.chunk_id), + doc_id: DocumentId(raw.doc_id), + doc_path: workspace_path, + heading_path, + section_label: raw.section_label, + snippet, + citation, + retrieval: RetrievalDetail { + method: SearchMode::Lexical, + fusion_score: normalized, + lexical_score: Some(normalized), + vector_score: None, + lexical_rank: Some(rank), + vector_rank: None, + }, + index_version: index_version.clone(), + embedding_model: None, + chunker_version: ChunkerVersion(raw.chunker_version), + }) +} + +/// Map the raw bm25 score (FTS5 returns a *negative* number; lower is +/// better) into a positive score in `(0, 1]`. The formula +/// `score = -bm25 / (1 + |bm25|)` is monotonic, smooth, and bounded — +/// suitable both for human display and for use as an RRF input. +fn normalize_bm25(bm25_raw: f64) -> f32 { + let abs = bm25_raw.abs(); + let normalized = -bm25_raw / (1.0_f64 + abs); + normalized as f32 +} + +/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown +/// only emits `Line`, so the other variants are mostly defensive — we +/// forward them as faithfully as possible so a future PDF / image +/// extractor can flow through without churn. 
+fn build_citation( + chunk_id: &str, + path: WorkspacePath, + section: Option, + first_span: Option<&SourceSpan>, +) -> Citation { + match first_span { + Some(SourceSpan::Line { start, end }) => Citation::Line { + path, + start: *start, + end: *end, + section, + }, + Some(SourceSpan::Page { page, .. }) => Citation::Page { + path, + page: *page, + section, + }, + Some(SourceSpan::Region { x, y, w, h }) => Citation::Region { + path, + x: *x, + y: *y, + w: *w, + h: *h, + }, + Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time { + path, + start_ms: *start_ms, + end_ms: *end_ms, + speaker: None, + }, + // Byte-spans don't have a Citation variant. Fall back to a Line + // citation pointing at the document head — better than fabricating + // a position. Spans-empty falls into the same branch. + other @ (Some(SourceSpan::Byte { .. }) | None) => { + let span_shape = match other { + Some(_) => "Byte", + None => "empty array", + }; + tracing::warn!( + chunk_id, + span_shape, + "kb-search lexical: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}" + ); + Citation::Line { + path, + start: 1, + end: 1, + section, + } + } + } +} + +/// Cap the snippet at `max_chars` characters (Unicode scalar values, not +/// bytes — matches the §6.4 setting's "characters" semantics). Returns +/// the input unchanged when already short enough. +fn trim_snippet(s: &str, max_chars: usize) -> String { + // We slice on Unicode scalar values per §6.4's "characters" semantics; this + // can orphan a combining mark in extreme cases (Hebrew niqqud, Devanagari) + // but matches the spec's char-budget definition. + if s.chars().count() <= max_chars { + return s.to_string(); + } + s.chars().take(max_chars).collect() +} + +// ── path_glob ──────────────────────────────────────────────────────────── + +/// Compile a `path_glob` pattern. We enable `literal_separator` so `*` +/// does NOT cross `/` — design requires `*` to match within a single +/// path segment, not across them. 
(`globset`'s default is to let `*` +/// span separators.) +fn compile_glob(pattern: &str) -> Result { + let g = globset::GlobBuilder::new(pattern) + .literal_separator(true) + .build() + .with_context(|| format!("kb-search lexical: invalid path_glob {pattern:?}"))?; + Ok(g.compile_matcher()) +} + +// ── Unit tests for pure helpers ────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_match_string_empty_returns_none() { + assert!(build_match_string("").is_none()); + assert!(build_match_string(" ").is_none()); + assert!(build_match_string("''").is_none()); + assert!(build_match_string("' '").is_none()); + } + + #[test] + fn build_match_string_default_is_quoted_and_anded() { + let s = build_match_string("rust cargo").unwrap(); + // Two tokens, each quoted, joined by a space (implicit AND). + assert_eq!(s, r#""rust" "cargo""#); + } + + #[test] + fn build_match_string_escapes_special_chars() { + // `*`, `(`, `)`, `:`, `^`, `"` should all be wrapped inside + // FTS5 string-literal quotes so they're treated as literal + // text rather than FTS5 operators. + let s = build_match_string(r#"foo* (bar) baz:qux ^head he"llo"#).unwrap(); + assert_eq!( + s, + r#""foo*" "(bar)" "baz:qux" "^head" "he""llo""# + ); + // The doubled `""` is FTS5's way of embedding a literal quote + // inside a string literal. + assert!(s.contains(r#"he""llo"#)); + // Sanity: every special character lives between matching `"` + // delimiters — there is no bare-token (unquoted) span anywhere. + // We check this by confirming the string starts and ends with `"` + // and the count of unescaped `"` is even (each token is wrapped). + assert!(s.starts_with('"') && s.ends_with('"')); + } + + #[test] + fn build_match_string_passthrough_when_single_quoted() { + // The FTS5 expression is preserved verbatim. 
+ let s = build_match_string("'foo OR bar*'").unwrap(); + assert_eq!(s, "foo OR bar*"); + } + + #[test] + fn normalize_bm25_top_score_in_unit_interval() { + // A "perfect" hit is bm25 = -1.0 → normalized 0.5. + // A high-relevance hit (bm25 = -10.0) → 10/11 ≈ 0.909. + let high = normalize_bm25(-10.0); + assert!(high > 0.0 && high <= 1.0, "got {high}"); + let medium = normalize_bm25(-1.0); + assert!((medium - 0.5).abs() < 1e-6); + } + + #[test] + fn normalize_bm25_monotonic() { + // Lower (more-negative) bm25 must map to a higher normalized score. + let a = normalize_bm25(-2.0); + let b = normalize_bm25(-1.0); + assert!(a > b, "{a} should exceed {b}"); + } + + #[test] + fn trim_snippet_caps_at_char_count() { + let s = "a".repeat(300); + let trimmed = trim_snippet(&s, 220); + assert_eq!(trimmed.chars().count(), 220); + } + + #[test] + fn trim_snippet_passthrough_when_short() { + let s = "short"; + assert_eq!(trim_snippet(s, 220), "short"); + } + + #[test] + fn build_citation_line_round_trip() { + let p = WorkspacePath::new("a/b.md".to_string()).unwrap(); + let span = SourceSpan::Line { start: 7, end: 12 }; + let c = build_citation("c1", p.clone(), Some("S1".to_string()), Some(&span)); + match c { + Citation::Line { + start, + end, + ref section, + path: ref pp, + } => { + assert_eq!(start, 7); + assert_eq!(end, 12); + assert_eq!(section.as_deref(), Some("S1")); + assert_eq!(pp, &p); + } + other => panic!("expected Citation::Line, got {other:?}"), + } + } + + #[test] + fn build_citation_page_forwards_section() { + let p = WorkspacePath::new("doc.pdf".to_string()).unwrap(); + let span = SourceSpan::Page { + page: 4, + char_start: None, + char_end: None, + }; + let c = build_citation("c1", p, Some("Intro".to_string()), Some(&span)); + match c { + Citation::Page { + page, + ref section, + .. 
+ } => { + assert_eq!(page, 4); + assert_eq!(section.as_deref(), Some("Intro")); + } + other => panic!("expected Citation::Page, got {other:?}"), + } + } + + #[test] + fn build_citation_none_falls_back_to_line_one() { + let p = WorkspacePath::new("x.md".to_string()).unwrap(); + let c = build_citation("c1", p, None, None); + match c { + Citation::Line { start, end, .. } => { + assert_eq!((start, end), (1, 1)); + } + other => panic!("expected fallback Citation::Line, got {other:?}"), + } + } + + #[test] + fn compile_glob_rejects_invalid_pattern() { + // `[` is a character-class opener; an unclosed class is invalid. + let r = compile_glob("notes/[abc"); + assert!(r.is_err()); + } + + #[test] + fn compile_glob_star_does_not_cross_slash() { + // This is the design invariant: `*` must NOT match `/`. + let m = compile_glob("notes/*.md").unwrap(); + assert!(m.is_match("notes/foo.md")); + assert!(!m.is_match("notes/sub/foo.md")); + } +} diff --git a/crates/kb-search/src/lib.rs b/crates/kb-search/src/lib.rs new file mode 100644 index 0000000..76afd55 --- /dev/null +++ b/crates/kb-search/src/lib.rs @@ -0,0 +1,15 @@ +//! `kb-search` — `kb_core::Retriever` implementations. +//! +//! P2-2 ships [`LexicalRetriever`], a SQLite-FTS5-backed retriever for +//! `SearchMode::Lexical`. Vector + Hybrid retrievers land in P3-3 / P3-4. +//! +//! Allowed deps per task spec: `kb-core`, `kb-config`, `kb-store-sqlite`, +//! `rusqlite`, `globset`, `tracing`, `thiserror`, `anyhow`. Forbidden: +//! `kb-source-fs`, `kb-parse-md`, `kb-normalize`, `kb-chunk`, +//! `kb-store-vector`, `kb-embed*`, `kb-llm*`, `kb-rag`, `kb-tui`, +//! `kb-desktop`. Only `serde_json` is a transitive helper used to decode +//! JSON-typed columns from `chunks` / `documents`. 
+ +mod lexical; + +pub use lexical::LexicalRetriever; diff --git a/crates/kb-search/tests/fixtures/search/lexical/run-1.json b/crates/kb-search/tests/fixtures/search/lexical/run-1.json new file mode 100644 index 0000000..701d2fb --- /dev/null +++ b/crates/kb-search/tests/fixtures/search/lexical/run-1.json @@ -0,0 +1,60 @@ +[ + { + "chunk_id": "c3000000000000000000000000000000", + "chunker_version": "v1", + "citation": { + "end": 8, + "kind": "line", + "path": "notes/snap.md", + "section": "Snap", + "start": 7 + }, + "doc_id": "d0000000000000000000000000000000", + "doc_path": "notes/snap.md", + "embedding_model": null, + "heading_path": [ + "Snap" + ], + "index_version": "v1.0", + "rank": 1, + "retrieval": { + "fusion_score": 1.4490997273242101e-6, + "lexical_rank": 1, + "lexical_score": 1.4490997273242101e-6, + "method": "lexical", + "vector_rank": null, + "vector_score": null + }, + "section_label": "Snap", + "snippet": "alpha alpha" + }, + { + "chunk_id": "c1000000000000000000000000000000", + "chunker_version": "v1", + "citation": { + "end": 2, + "kind": "line", + "path": "notes/snap.md", + "section": "Snap", + "start": 1 + }, + "doc_id": "d0000000000000000000000000000000", + "doc_path": "notes/snap.md", + "embedding_model": null, + "heading_path": [ + "Snap" + ], + "index_version": "v1.0", + "rank": 2, + "retrieval": { + "fusion_score": 9.641424867368187e-7, + "lexical_rank": 2, + "lexical_score": 9.641424867368187e-7, + "method": "lexical", + "vector_rank": null, + "vector_score": null + }, + "section_label": "Snap", + "snippet": "alpha bravo charlie" + } +] \ No newline at end of file diff --git a/crates/kb-search/tests/lexical.rs b/crates/kb-search/tests/lexical.rs new file mode 100644 index 0000000..91a2b71 --- /dev/null +++ b/crates/kb-search/tests/lexical.rs @@ -0,0 +1,666 @@ +//! P2-2 integration tests for `LexicalRetriever`. +//! +//! Strategy: seed the SQLite store via raw inserts with `foreign_keys = +//! OFF` (mirroring the P2-1 FTS tests). 
This avoids dragging +//! `kb-parse-md` / `kb-normalize` / `kb-chunk` into kb-search's dev-deps, +//! which would violate the task's "Allowed deps" list. + +use std::sync::Arc; + +use kb_config::Config; +use kb_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel}; +use kb_search::LexicalRetriever; +use kb_store_sqlite::SqliteStore; +use rusqlite::Connection; +use tempfile::TempDir; + +// ── Test scaffolding ───────────────────────────────────────────────────── + +struct Env { + _temp: TempDir, + store: Arc, + db_path: std::path::PathBuf, +} + +impl Env { + fn new() -> Self { + let temp = tempfile::tempdir().expect("tempdir"); + let mut config = Config::defaults(); + config.storage.data_dir = temp.path().to_string_lossy().into_owned(); + let store = SqliteStore::open(&config).expect("open store"); + store.run_migrations().expect("run migrations"); + let db_path = temp.path().join("kb.sqlite"); + Self { + _temp: temp, + store: Arc::new(store), + db_path, + } + } + + /// Side-channel raw connection with FK enforcement off — same + /// trick used by P2-1's FTS tests so we can seed `chunks` / + /// `documents` directly without the full ingest graph. + fn raw_conn(&self) -> Connection { + let conn = Connection::open(&self.db_path).expect("open side conn"); + conn.pragma_update(None, "foreign_keys", "OFF").unwrap(); + conn + } + + fn retriever(&self) -> LexicalRetriever { + LexicalRetriever::new( + Arc::clone(&self.store), + IndexVersion("v1.0".to_string()), + ) + } + + fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever { + LexicalRetriever::with_settings( + Arc::clone(&self.store), + IndexVersion("v1.0".to_string()), + snippet_chars, + ) + } +} + +/// Minimal documents row. Many columns are NOT NULL and we don't care +/// about their exact values for retrieval tests, so we wedge in +/// reasonable defaults. 
+#[allow(clippy::too_many_arguments)] +fn insert_document( + conn: &Connection, + doc_id: &str, + workspace_path: &str, + title: &str, + lang: &str, + trust_level: &str, + tags: &[&str], +) { + // assets row first — documents.asset_id has a FK with ON DELETE + // RESTRICT but FKs are OFF on this connection. Still we insert a + // matching row so JOINs pick it up. + let asset_id = format!("{:0>32}", &doc_id[..1.min(doc_id.len())]); // 32-hex-ish + let asset_id = format!("{:0>32}", asset_id.chars().take(32).collect::()); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, 'file:///x', ?, '\"markdown\"', 0, + 'd0', 'reference', '/x', '2024-01-01T00:00:00Z')", + rusqlite::params![asset_id, workspace_path], + ) + .expect("insert asset"); + + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, + source_type, trust_level, parser_version, + doc_version, schema_version, metadata_json, + provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, ?, ?, 'markdown', ?, 'pv1', 1, 1, + '{}', '{\"events\":[]}', + '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')", + rusqlite::params![doc_id, asset_id, workspace_path, title, lang, trust_level], + ) + .expect("insert document"); + + for tag in tags { + conn.execute( + "INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)", + rusqlite::params![doc_id, tag], + ) + .expect("insert tag"); + } +} + +#[allow(clippy::too_many_arguments)] +fn insert_chunk( + conn: &Connection, + chunk_id: &str, + doc_id: &str, + text: &str, + heading_path: &[&str], + section_label: Option<&str>, + source_spans_json: &str, + chunker_version: &str, +) { + let heading_json = serde_json::to_string(heading_path).unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, 
block_ids_json, created_at + ) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z')", + rusqlite::params![ + chunk_id, + doc_id, + text, + heading_json, + section_label, + source_spans_json, + chunker_version, + ], + ) + .expect("insert chunk"); +} + +/// Pad a short ID to the 32-hex shape kb_core newtypes expect. +fn id32(prefix: &str) -> String { + let mut s = prefix.to_string(); + while s.len() < 32 { + s.push('0'); + } + s.truncate(32); + s +} + +// ── Tests ──────────────────────────────────────────────────────────────── + +#[test] +fn lexical_empty_corpus_returns_empty_vec() { + let env = Env::new(); + let r = env.retriever(); + let q = SearchQuery { + text: "rust".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }; + let hits = r.search(&q).expect("search"); + assert!(hits.is_empty(), "empty corpus must yield empty Vec"); +} + +#[test] +fn lexical_empty_query_returns_empty_vec_without_db_hit() { + // Even with rows in the DB, a blank query must short-circuit to []. 
+ let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/a.md", "A", "en", "primary", &[]); + insert_chunk( + &conn, + &id32("c1"), + &id32("d"), + "rust cargo macros", + &["A"], + None, + r#"[{"kind":"line","start":1,"end":3}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + for empty in ["", " ", "''"] { + let q = SearchQuery { + text: empty.to_string(), + mode: SearchMode::Lexical, + k: 5, + filters: SearchFilters::default(), + }; + let hits = r.search(&q).unwrap(); + assert!(hits.is_empty(), "query {empty:?} must yield empty Vec"); + } +} + +#[test] +fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]); + insert_chunk( + &conn, + &id32("c1"), + &id32("d"), + "Rust borrow checker enforces ownership.", + &["Notes"], + Some("Notes"), + r#"[{"kind":"line","start":4,"end":4}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let q = SearchQuery { + text: "borrow".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }; + let hits = r.search(&q).expect("search"); + assert_eq!(hits.len(), 1); + let h = &hits[0]; + assert_eq!(h.rank, 1); + assert_eq!(h.doc_path.0, "notes/rust.md"); + assert_eq!(h.heading_path, vec!["Notes".to_string()]); + assert_eq!(h.section_label.as_deref(), Some("Notes")); + assert_eq!(h.retrieval.method, SearchMode::Lexical); + assert_eq!(h.retrieval.lexical_rank, Some(1)); + assert!(h.retrieval.vector_score.is_none()); + + // Citation round-trips through `to_uri`/`parse` (line variant). + let uri = h.citation.to_uri(); + let parsed = kb_core::Citation::parse(&uri).expect("parse uri"); + // Reparsed citation has section=None (URI fragment doesn't carry it), + // so compare by `to_uri` equivalence rather than struct equality. 
+ assert_eq!(parsed.to_uri(), uri); + // Sanity: this is a Line citation matching the seeded source span. + assert_eq!(uri, "notes/rust.md#L4"); +} + +#[test] +fn lexical_snippet_length_capped_at_snippet_chars() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document( + &conn, + &id32("d"), + "notes/long.md", + "Long", + "en", + "primary", + &[], + ); + // A text long enough that FTS5 might return a snippet > 80 chars + // when given a high word budget. We instead set a tight cap below + // and rely on `trim_snippet` as the backstop. + let mut text = String::new(); + for _ in 0..50 { + text.push_str("alpha beta gamma delta epsilon "); + } + insert_chunk( + &conn, + &id32("c1"), + &id32("d"), + &text, + &["Long"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + drop(conn); + + // Set snippet_chars to a known bound; the retriever clamps + trims + // any snippet to fit. + let r = env.retriever_with_snippet_chars(80); + let hits = r + .search(&SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 1, + filters: SearchFilters::default(), + }) + .unwrap(); + assert_eq!(hits.len(), 1); + assert!( + hits[0].snippet.chars().count() <= 80, + "snippet must be ≤ snippet_chars; got {} chars: {:?}", + hits[0].snippet.chars().count(), + hits[0].snippet + ); +} + +#[test] +fn lexical_filter_tags_any_excludes_untagged_docs() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]); + insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]); + insert_chunk( + &conn, + &id32("c1"), + &id32("d1"), + "ownership and borrow checker", + &["A"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + insert_chunk( + &conn, + &id32("c2"), + &id32("d2"), + "borrow semantics in python", + &["B"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let q = 
SearchQuery { + text: "borrow".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters { + tags_any: vec!["rust".to_string()], + ..Default::default() + }, + }; + let hits = r.search(&q).unwrap(); + assert_eq!(hits.len(), 1, "tags_any=[rust] must exclude python doc"); + assert_eq!(hits[0].doc_path.0, "notes/a.md"); +} + +#[test] +fn lexical_filter_lang_and_trust_min_compose() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d1"), "ko/a.md", "A", "ko", "primary", &[]); + insert_document(&conn, &id32("d2"), "en/b.md", "B", "en", "primary", &[]); + insert_document(&conn, &id32("d3"), "en/c.md", "C", "en", "generated", &[]); + for (cid, did, body) in [ + ("c1", "d1", "검색 키워드 alpha"), + ("c2", "d2", "alpha bravo"), + ("c3", "d3", "alpha gamma"), + ] { + insert_chunk( + &conn, + &id32(cid), + &id32(did), + body, + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + } + drop(conn); + + let r = env.retriever(); + // lang=en + trust_min=secondary → only d2 (primary ≥ secondary). 
+ let hits = r + .search(&SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters { + lang: Some(Lang("en".to_string())), + trust_min: Some(TrustLevel::Secondary), + ..Default::default() + }, + }) + .unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0].doc_path.0, "en/b.md"); +} + +#[test] +fn lexical_filter_path_glob_does_not_cross_slash() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &[]); + insert_document(&conn, &id32("d2"), "notes/sub/b.md", "B", "en", "primary", &[]); + insert_chunk( + &conn, + &id32("c1"), + &id32("d1"), + "shared keyword", + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + insert_chunk( + &conn, + &id32("c2"), + &id32("d2"), + "shared keyword", + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "keyword".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters { + path_glob: Some("notes/*.md".to_string()), + ..Default::default() + }, + }) + .unwrap(); + let paths: Vec<&str> = hits.iter().map(|h| h.doc_path.0.as_str()).collect(); + assert_eq!(paths, vec!["notes/a.md"], "* must not match across `/`"); +} + +#[test] +fn lexical_citation_round_trip_against_first_source_span() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/m.md", "M", "en", "primary", &[]); + insert_chunk( + &conn, + &id32("c1"), + &id32("d"), + "echo bravo", + &[], + None, + // Two spans; the citation uses the first. 
+ r#"[{"kind":"line","start":12,"end":34},{"kind":"line","start":60,"end":61}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "bravo".to_string(), + mode: SearchMode::Lexical, + k: 1, + filters: SearchFilters::default(), + }) + .unwrap(); + assert_eq!(hits.len(), 1); + let uri = hits[0].citation.to_uri(); + assert_eq!(uri, "notes/m.md#L12-L34"); + let parsed = kb_core::Citation::parse(&uri).unwrap(); + assert_eq!(parsed.to_uri(), uri); +} + +#[test] +fn lexical_top_score_within_unit_interval_three_chunks() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]); + // Three chunks of varying relevance to the query 'alpha': + // c1: alpha alpha alpha (best) + // c2: alpha bravo + // c3: bravo charlie alpha (one occurrence) + for (cid, body) in [ + ("c1", "alpha alpha alpha keyword"), + ("c2", "alpha bravo charlie"), + ("c3", "bravo charlie alpha"), + ] { + insert_chunk( + &conn, + &id32(cid), + &id32("d"), + body, + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + } + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }) + .unwrap(); + assert!(!hits.is_empty(), "must surface at least one hit"); + let top = hits[0].retrieval.fusion_score; + assert!( + top > 0.0 && top <= 1.0, + "top normalized score must be in (0, 1]; got {top}" + ); + // All scores in [0, 1]. + for h in &hits { + let s = h.retrieval.fusion_score; + assert!((0.0..=1.0).contains(&s), "hit score {s} out of [0, 1]"); + // lexical_score and fusion_score equal in lexical-only mode. + assert_eq!(h.retrieval.lexical_score, Some(s)); + } + // bm25 should rank c1 (3 occurrences) above c2 / c3. 
+ assert!(hits[0].chunk_id.0.starts_with("c1")); +} + +#[test] +fn lexical_determinism_same_query_twice() { + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]); + for (cid, body) in [ + ("c1", "alpha alpha"), + ("c2", "alpha bravo"), + ("c3", "alpha charlie"), + ("c4", "alpha delta"), + ] { + insert_chunk( + &conn, + &id32(cid), + &id32("d"), + body, + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + } + drop(conn); + + let r = env.retriever(); + let q = SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }; + let a = r.search(&q).unwrap(); + let b = r.search(&q).unwrap(); + assert_eq!(a, b, "same DB + same query must yield identical Vec"); +} + +#[test] +fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() { + // Two chunks with byte-identical text + length → identical bm25 scores + // for any `MATCH` against them. The retriever must fall back to + // `chunk_id` ordering so the result is stable across runs. + let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]); + let cid_a = id32("aaaa"); + let cid_b = id32("bbbb"); + assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id"); + for cid in [&cid_a, &cid_b] { + insert_chunk( + &conn, + cid, + &id32("d"), + "alpha bravo charlie", + &[], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + } + drop(conn); + + let r = env.retriever(); + let q = SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }; + let a = r.search(&q).unwrap(); + let b = r.search(&q).unwrap(); + assert_eq!(a.len(), 2, "both chunks should match"); + // bm25 must be equal for byte-identical chunks; the secondary sort + // by chunk_id pins the order. 
+ assert!( + (a[0].retrieval.fusion_score - a[1].retrieval.fusion_score).abs() < 1e-9, + "byte-identical chunks must score equally; got {} vs {}", + a[0].retrieval.fusion_score, + a[1].retrieval.fusion_score + ); + assert!( + a[0].chunk_id.0 < a[1].chunk_id.0, + "tiebreaker must order by chunk_id ascending; got {} then {}", + a[0].chunk_id.0, + a[1].chunk_id.0 + ); + assert_eq!(a, b, "tiebreaker order must be stable across runs"); +} + +#[test] +fn lexical_index_version_is_returned_unchanged() { + let env = Env::new(); + let r = LexicalRetriever::new( + Arc::clone(&env.store), + IndexVersion("custom-label-1".to_string()), + ); + assert_eq!(r.index_version().0, "custom-label-1"); +} + +#[test] +fn lexical_snapshot_run_1() { + // Pinned snapshot. A small, deterministic corpus; the JSON shape of + // `Vec` for a fixed query is checked verbatim against + // `tests/fixtures/search/lexical/run-1.json`. Update both sides in + // the same commit when intentional changes ship. + // Stable because rusqlite ships bundled SQLite — a tokenizer/bm25 algorithm change in a future SQLite bump will require regenerating run-1.json via `KB_UPDATE_SNAPSHOTS=1`. 
+ let env = Env::new(); + let conn = env.raw_conn(); + insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]); + for (cid, body, span) in [ + ( + "c1", + "alpha bravo charlie", + r#"[{"kind":"line","start":1,"end":2}]"#, + ), + ( + "c2", + "bravo only here", + r#"[{"kind":"line","start":4,"end":5}]"#, + ), + ( + "c3", + "alpha alpha", + r#"[{"kind":"line","start":7,"end":8}]"#, + ), + ] { + insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1"); + } + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "alpha".to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: SearchFilters::default(), + }) + .unwrap(); + let actual = serde_json::to_value(&hits).unwrap(); + + let baseline_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/search/lexical/run-1.json"); + if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() { + std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap()).unwrap(); + } + let baseline_text = std::fs::read_to_string(&baseline_path) + .expect("baseline snapshot must exist; run with KB_UPDATE_SNAPSHOTS=1 to seed"); + let expected: serde_json::Value = serde_json::from_str(&baseline_text).unwrap(); + assert_eq!(actual, expected, "lexical run-1 snapshot drift"); +} diff --git a/crates/kb-store-sqlite/src/store.rs b/crates/kb-store-sqlite/src/store.rs index 83d0182..b7f57d6 100644 --- a/crates/kb-store-sqlite/src/store.rs +++ b/crates/kb-store-sqlite/src/store.rs @@ -109,6 +109,22 @@ impl SqliteStore { self.conn.lock().unwrap_or_else(|p| p.into_inner()) } + /// Read-only borrow of the connection. + /// + /// Provided so sibling crates (e.g. `kb-search`) can run SELECTs + /// against the schema owned by this crate without re-opening the + /// SQLite file. 
Callers MUST treat the returned `Connection` as + /// read-only — issuing mutating SQL (INSERT / UPDATE / DELETE / DDL) + /// through this guard bypasses the per-document transaction discipline + /// (`put_*` methods) and the FTS5 backfill helpers that the store + /// layer enforces. Mutating callers must use `kb-store-sqlite`'s own + /// public write methods instead. + /// + /// Poisoning is recovered the same way as [`Self::lock_conn`]. + pub fn read_conn(&self) -> MutexGuard<'_, Connection> { + self.conn.lock().unwrap_or_else(|p| p.into_inner()) + } + /// Persist a `RawAsset` *with its raw bytes*: row goes into `assets`, /// bytes go to `data_dir/assets//` if `byte_len ≤ /// copy_threshold_mb`, otherwise the row records the source URI's -- 2.49.1