feat(p2-2): lexical-retriever — kb-search crate + LexicalRetriever (FTS5 + bm25) #13

Merged
altair823 merged 1 commits from feat/p2-2-lexical-retriever into main 2026-05-01 08:03:25 +00:00
8 changed files with 1450 additions and 0 deletions

16
Cargo.lock generated
View File

@@ -801,6 +801,22 @@ dependencies = [
"serde",
]
[[package]]
name = "kb-search"
version = "0.1.0"
dependencies = [
"anyhow",
"globset",
"kb-config",
"kb-core",
"kb-store-sqlite",
"rusqlite",
"serde_json",
"tempfile",
"thiserror 2.0.18",
"tracing",
]
[[package]]
name = "kb-source-fs"
version = "0.1.0"

View File

@@ -9,6 +9,7 @@ members = [
"crates/kb-normalize",
"crates/kb-chunk",
"crates/kb-store-sqlite",
"crates/kb-search",
"crates/kb-app",
"crates/kb-cli",
]
@@ -29,3 +30,8 @@ time = { version = "0.3", features = ["serde", "macros", "formatting", "
uuid = { version = "1", features = ["v7", "serde"] }
blake3 = "1"
tracing = "0.1"
# `bundled` ships SQLite source so the workspace doesn't depend on a
# system libsqlite3 (matches the kb-store-sqlite feature set).
rusqlite = { version = "0.32", features = ["bundled"] }
globset = "0.4"
tempfile = "3"

View File

@@ -0,0 +1,22 @@
[package]
name = "kb-search"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Retriever implementations for kb (P2-2 lexical FTS5; P3 vector / hybrid will follow)"
[dependencies]
kb-core = { path = "../kb-core" }
kb-config = { path = "../kb-config" }
kb-store-sqlite = { path = "../kb-store-sqlite" }
rusqlite = { workspace = true }
globset = { workspace = true }
serde_json = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }
anyhow = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }

View File

@@ -0,0 +1,649 @@
//! Lexical (FTS5 + bm25) retriever — design §3.7 / §1.5 / §2.2 / §6.4.
//!
//! Owns the SQL pattern documented in `tasks/p2/p2-2-lexical-retriever.md`
//! and constructs `kb_core::SearchHit` values directly from the joined
//! `chunks_fts` / `chunks` / `documents` rows. Reads only — never mutates
//! the underlying SQLite file.
use std::sync::Arc;
use anyhow::{Context, Result};
use globset::GlobMatcher;
use kb_core::{
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail,
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan,
TrustLevel, WorkspacePath,
};
use kb_store_sqlite::SqliteStore;
use rusqlite::{params_from_iter, Connection, Row, ToSql};
// ── Tunables ─────────────────────────────────────────────────────────────
/// FTS5 hard limit on the `snippet()` `nToken` argument.
/// See SQLite's FTS5 docs: snippet() rejects nToken > 64.
const FTS5_SNIPPET_MAX_WORDS: usize = 64;
/// Floor for the snippet word budget. `snippet_chars / 4` may yield 0 for
/// pathologically small configs; we always ask FTS5 for at least one word
/// so it can still return something matchable for the test harness.
const FTS5_SNIPPET_MIN_WORDS: usize = 1;
/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10.
const DEFAULT_K: usize = 10;
/// When `path_glob` is set we have to over-fetch and post-filter in Rust,
/// because SQLite's GLOB operator treats `*` as "any chars including `/`",
/// which contradicts the design rule that `*` must NOT cross path
/// separators. Empirically `+128` is generous for any realistic workspace
/// and bounded enough to keep memory predictable.
const PATH_GLOB_OVERFETCH: usize = 128;
// ── Public surface ───────────────────────────────────────────────────────
/// Lexical retriever backed by SQLite FTS5 + bm25.
pub struct LexicalRetriever {
store: Arc<SqliteStore>,
index_version: IndexVersion,
/// Number of `snippet()` words derived from `kb-config::search.snippet_chars`,
/// clamped into `[FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS]`.
snippet_words: usize,
/// Hard cap on the returned snippet's character length per design §6.4.
snippet_chars: usize,
}
impl LexicalRetriever {
/// Construct with default settings derived from `kb-config`'s defaults.
/// Snippet width is computed from `Config::defaults().search.snippet_chars`.
pub fn new(store: Arc<SqliteStore>, index_version: IndexVersion) -> Self {
let cfg = kb_config::Config::defaults();
Self::with_settings(store, index_version, cfg.search.snippet_chars)
}
/// Construct with explicit `snippet_chars`. Used by tests / callers
/// that have already loaded a `Config`.
pub fn with_settings(
store: Arc<SqliteStore>,
index_version: IndexVersion,
snippet_chars: usize,
) -> Self {
// Heuristic: 1 token ≈ 4 chars (English-leaning estimate; Korean
// tokens average shorter, so the cap-by-chars trim below is what
// actually enforces the contract). The `/4` keeps us well below
// FTS5's nToken=64 limit for typical snippet_chars=220 budgets.
let raw = snippet_chars / 4;
let snippet_words = raw.clamp(FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS);
Self {
store,
index_version,
snippet_words,
snippet_chars,
}
}
}
impl Retriever for LexicalRetriever {
fn search(&self, query: &SearchQuery) -> Result<Vec<SearchHit>> {
let match_opt = build_match_string(&query.text);
let k = if query.k == 0 { DEFAULT_K } else { query.k };
let filters = &query.filters;
// One-line summary at request entry. Filter shape only — no
// tag/lang/path values, which could be PII-sensitive.

search 진입과 종료 양쪽에 tracing::debug!을 둔 점이 좋습니다. kb search --mode lexical 디버깅 시 match string 자체와 필터 형태(개수/플래그)를 별도로 분리해서 로그한 부분 — 사용자 입력 값을 PII로 누설하지 않으면서도 "필터가 적용되긴 했나?"를 확인할 수 있게 만든 균형이 적절합니다.

search 진입과 종료 양쪽에 `tracing::debug!`을 둔 점이 좋습니다. `kb search --mode lexical` 디버깅 시 match string 자체와 필터 형태(개수/플래그)를 별도로 분리해서 로그한 부분 — 사용자 입력 값을 PII로 누설하지 않으면서도 "필터가 적용되긴 했나?"를 확인할 수 있게 만든 균형이 적절합니다.
tracing::debug!(
match_str = match_opt.as_deref().unwrap_or("<empty>"),
tags_any = filters.tags_any.len(),
has_lang = filters.lang.is_some(),
has_trust_min = filters.trust_min.is_some(),
has_path_glob = filters.path_glob.is_some(),
k,
"kb-search lexical: search start"
);
// Empty / whitespace-only query → nothing to do. Per spec we
// succeed with an empty hit list rather than erroring.
let match_str = match match_opt {
Some(s) => s,
None => return Ok(Vec::new()),
};
// Pre-compile the path_glob once. The `Glob` produced rejects
// syntactically invalid patterns at construction time so the
// caller gets a clear error rather than a silent empty result.
let path_matcher = match &filters.path_glob {
Some(g) => Some(compile_glob(g)?),
None => None,
};
// Fetch budget: when post-filtering by glob we need to over-fetch
// so that the final `take(k)` still has enough rows after culling.
let fetch_limit = if path_matcher.is_some() {
k.saturating_add(PATH_GLOB_OVERFETCH)
} else {
k
};
let conn = self.store.read_conn();
let raw_rows = run_query(
&conn,
&match_str,
self.snippet_words,
filters,
fetch_limit,
)?;
let mut hits: Vec<SearchHit> = Vec::with_capacity(raw_rows.len().min(k));
let mut rank: u32 = 0;
for row in raw_rows {
// Path glob is the only filter we evaluate in Rust because the
// semantics differ from SQLite's GLOB (no `/` crossing).
if let Some(m) = &path_matcher {
if !m.is_match(&row.workspace_path) {
continue;
}
}
rank = rank.saturating_add(1);
let hit = build_hit(row, rank, &self.index_version, self.snippet_chars)?;
hits.push(hit);
if hits.len() >= k {
break;
}
}
tracing::debug!(rows = hits.len(), "kb-search lexical: search done");
Ok(hits)
}
fn index_version(&self) -> IndexVersion {
self.index_version.clone()
}
}
// ── Match-string construction ────────────────────────────────────────────
/// Translate a user-typed query into an FTS5 match string.
///
/// Rules (from the task spec):
///
/// - The query is wrapped in a single pair of `'...'` → strip the quotes
/// and pass the inner text through verbatim. The user has explicitly
/// opted into FTS5 syntax (e.g. `'rust AND cargo'`, `'foo*'`).
///
/// - Otherwise: split on whitespace, escape every token by wrapping it
/// in `"..."` (FTS5 string literal), with any inner `"` doubled. Join
/// with spaces — FTS5 default operator is implicit AND.
///
/// - An empty / whitespace-only token list → return `None` (caller
/// short-circuits to `Ok(vec![])`).
fn build_match_string(text: &str) -> Option<String> {
let trimmed = text.trim();
if trimmed.is_empty() {
return None;
}
if let Some(inner) = strip_single_quotes(trimmed) {
let inner_trim = inner.trim();
if inner_trim.is_empty() {
return None;
}
return Some(inner_trim.to_string());
}
let tokens: Vec<String> = trimmed
.split_whitespace()
.map(escape_fts5_token)
.collect();
if tokens.is_empty() {
None
} else {
Some(tokens.join(" "))
}
}
/// Return `Some(inner)` if `s` is wrapped in a matching pair of single
/// quotes (`'...'`), otherwise `None`. We require the closing quote to
/// be the last character so `'foo' bar` doesn't accidentally engage

FTS5 매치 빌더의 escape 전략이 단순하면서도 안전합니다. 모든 토큰을 "..."로 감싸고 내부 "만 이중화하면 FTS5 metacharacter 전체(*, ^, :, (, ))가 한 번에 무력화됩니다. parsing 없이 grammar 우회를 닫는 가장 boring한 정답입니다. 사용자가 의도적으로 FTS5 문법을 쓰고 싶을 때는 '...'로 wrap하는 explicit opt-in이 있고요.

FTS5 매치 빌더의 escape 전략이 단순하면서도 안전합니다. 모든 토큰을 `"..."`로 감싸고 내부 `"`만 이중화하면 FTS5 metacharacter 전체(`*`, `^`, `:`, `(`, `)`)가 한 번에 무력화됩니다. parsing 없이 grammar 우회를 닫는 가장 boring한 정답입니다. 사용자가 의도적으로 FTS5 문법을 쓰고 싶을 때는 `'...'`로 wrap하는 explicit opt-in이 있고요.
/// raw-FTS5 mode.
fn strip_single_quotes(s: &str) -> Option<&str> {
let bytes = s.as_bytes();
if bytes.len() >= 2 && bytes[0] == b'\'' && bytes[bytes.len() - 1] == b'\'' {
Some(&s[1..s.len() - 1])
} else {
None
}
}
/// FTS5-escape one token by wrapping it in double quotes (FTS5 string
/// literal). Inner `"` are escaped by doubling per FTS5 grammar. This is
/// the simple-and-safe approach that defangs every special character —
/// `(`, `)`, `*`, `^`, `:`, `"`, etc. — without trying to parse FTS5
/// expressions.
fn escape_fts5_token(tok: &str) -> String {
let mut out = String::with_capacity(tok.len() + 2);
out.push('"');
for ch in tok.chars() {
if ch == '"' {
out.push('"');
out.push('"');
} else {
out.push(ch);
}
}
out.push('"');
out
}
// ── SQL execution ────────────────────────────────────────────────────────
/// Raw row shape mirroring the columns selected by [`run_query`]. Kept
/// internal — every public path constructs `SearchHit` from this.
struct RawRow {
chunk_id: String,
doc_id: String,
bm25_raw: f64,
snippet: String,
heading_path_json: String,
section_label: Option<String>,
source_spans_json: String,
chunker_version: String,
workspace_path: String,
}
/// Build + execute the FTS5 query. The SQL pattern is the one documented
/// in `tasks/p2/p2-2-lexical-retriever.md` (§Behavior contract).
fn run_query(
conn: &Connection,
match_str: &str,
snippet_words: usize,
filters: &SearchFilters,
fetch_limit: usize,
) -> Result<Vec<RawRow>> {
// Build the dynamic SQL + positional parameter vector. Positional `?`
// is used (not named bindings) because the dynamic IN-list for
// `tags_any` is most natural with `params_from_iter`.
let mut sql = String::from(
"SELECT \
f.chunk_id, f.doc_id, \
bm25(chunks_fts) AS score, \
snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
c.heading_path_json, c.section_label, c.source_spans_json, \
c.chunker_version, \
d.workspace_path \
FROM chunks_fts f \
JOIN chunks c ON c.chunk_id = f.chunk_id \
JOIN documents d ON d.doc_id = f.doc_id",
);
let mut params: Vec<Box<dyn ToSql>> = Vec::new();
// 1) snippet word count.
params.push(Box::new(snippet_words as i64));
// 2) MATCH expression.
sql.push_str(" WHERE chunks_fts MATCH ?");
params.push(Box::new(match_str.to_owned()));
// tags_any: doc must own at least one of the requested tags.
if !filters.tags_any.is_empty() {
sql.push_str(
" AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (",
);
for (i, tag) in filters.tags_any.iter().enumerate() {
if i > 0 {
sql.push(',');
}
sql.push('?');
params.push(Box::new(tag.clone()));
}
sql.push_str("))");
}
if let Some(lang) = &filters.lang {
sql.push_str(" AND d.lang = ?");
params.push(Box::new(lang.0.clone()));
}
if let Some(trust_min) = &filters.trust_min {
// Mirror `kb_store_sqlite::documents::list_documents` ranking:
// Generated < Secondary < Primary. Doing the rank in SQL
// (rather than post-filtering) keeps the row stream short
// when the workspace contains many low-trust docs.
sql.push_str(
" AND CASE d.trust_level \
WHEN 'primary' THEN 3 \
WHEN 'secondary' THEN 2 \
WHEN 'generated' THEN 1 \
ELSE 0 \
END >= ?",
);
let rank: i64 = match trust_min {
TrustLevel::Primary => 3,
TrustLevel::Secondary => 2,
TrustLevel::Generated => 1,
};
params.push(Box::new(rank));
}
// path_glob is intentionally NOT applied here — see module comment

i64::try_from(fetch_limit).unwrap_or(i64::MAX) 캐스트 — usize::MAX 입력에서 negative LIMIT으로 wrap되어 SQLite가 에러 내는 시나리오를 차단합니다. 실제로 발생할 일은 거의 없지만 robustness side에 둔 게 옳습니다.

`i64::try_from(fetch_limit).unwrap_or(i64::MAX)` 캐스트 — `usize::MAX` 입력에서 negative LIMIT으로 wrap되어 SQLite가 에러 내는 시나리오를 차단합니다. 실제로 발생할 일은 거의 없지만 robustness side에 둔 게 옳습니다.
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
// Determinism: tie-break on chunk_id so equal bm25 scores produce a
// stable order across runs. `f.chunk_id` is the FTS row's UNINDEXED
// copy of the same value as `c.chunk_id`; either side works.
sql.push_str(" ORDER BY score, f.chunk_id LIMIT ?");
params.push(Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX)));
let mut stmt = conn
.prepare(&sql)
.context("kb-search lexical: prepare FTS5 statement")?;
let rows = stmt
.query_map(params_from_iter(params.iter().map(|b| b.as_ref())), row_from_sql)
.context("kb-search lexical: execute FTS5 query")?;
let mut out: Vec<RawRow> = Vec::new();
for r in rows {
out.push(r.context("kb-search lexical: read row")?);
}
Ok(out)
}
fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
Ok(RawRow {
chunk_id: row.get(0)?,
doc_id: row.get(1)?,
bm25_raw: row.get(2)?,
snippet: row.get(3)?,
heading_path_json: row.get(4)?,
section_label: row.get(5)?,
source_spans_json: row.get(6)?,
chunker_version: row.get(7)?,
workspace_path: row.get(8)?,
})
}
// ── Hit construction ─────────────────────────────────────────────────────
fn build_hit(
raw: RawRow,
rank: u32,
index_version: &IndexVersion,
snippet_chars: usize,
) -> Result<SearchHit> {
let normalized = normalize_bm25(raw.bm25_raw);
let heading_path: Vec<String> = serde_json::from_str(&raw.heading_path_json)
.context("kb-search lexical: deserialize heading_path_json")?;
let source_spans: Vec<SourceSpan> = serde_json::from_str(&raw.source_spans_json)
.context("kb-search lexical: deserialize source_spans_json")?;
let workspace_path = WorkspacePath::new(raw.workspace_path)
.context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?;
let citation = build_citation(
&raw.chunk_id,
workspace_path.clone(),
raw.section_label.clone(),
source_spans.first(),
);
// FTS5's snippet() respects the word budget but produces a
// character-length we can't predict precisely (token boundaries vary
// with the tokenizer). The contract caps at `snippet_chars`; trim
// defensively if SQLite ever returns a longer string.
let snippet = trim_snippet(&raw.snippet, snippet_chars);
Ok(SearchHit {
rank,
chunk_id: ChunkId(raw.chunk_id),
doc_id: DocumentId(raw.doc_id),
doc_path: workspace_path,
heading_path,
section_label: raw.section_label,
snippet,
citation,
retrieval: RetrievalDetail {
method: SearchMode::Lexical,
fusion_score: normalized,
lexical_score: Some(normalized),
vector_score: None,
lexical_rank: Some(rank),
vector_rank: None,
},
index_version: index_version.clone(),
embedding_model: None,
chunker_version: ChunkerVersion(raw.chunker_version),
})
}
/// Map the raw bm25 score (FTS5 returns a *negative* number; lower is
/// better) into a positive score in `(0, 1]`. The formula
/// `score = -bm25 / (1 + |bm25|)` is monotonic, smooth, and bounded —
/// suitable both for human display and for use as an RRF input.
fn normalize_bm25(bm25_raw: f64) -> f32 {
let abs = bm25_raw.abs();
let normalized = -bm25_raw / (1.0_f64 + abs);
normalized as f32
}
/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown
/// only emits `Line`, so the other variants are mostly defensive — we
/// forward them as faithfully as possible so a future PDF / image
/// extractor can flow through without churn.
fn build_citation(
chunk_id: &str,
path: WorkspacePath,
section: Option<String>,
first_span: Option<&SourceSpan>,
) -> Citation {
match first_span {
Some(SourceSpan::Line { start, end }) => Citation::Line {
path,
start: *start,
end: *end,
section,
},
Some(SourceSpan::Page { page, .. }) => Citation::Page {
path,
page: *page,
section,
},
Some(SourceSpan::Region { x, y, w, h }) => Citation::Region {
path,
x: *x,
y: *y,
w: *w,
h: *h,
},
Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time {
path,
start_ms: *start_ms,
end_ms: *end_ms,
speaker: None,
},
// Byte-spans don't have a Citation variant. Fall back to a Line
// citation pointing at the document head — better than fabricating
// a position. Spans-empty falls into the same branch.
other @ (Some(SourceSpan::Byte { .. }) | None) => {
let span_shape = match other {
Some(_) => "Byte",
None => "empty array",
};
tracing::warn!(
chunk_id,

trim_snippet이 .chars() (Unicode scalar value) 단위로 자르는 trade-off를 코멘트로 남긴 점이 좋습니다. design §6.4의 "characters" 정의가 USV 기준이라 spec에 부합하고, 동시에 Hebrew niqqud / Devanagari combining mark가 orphan될 수 있는 corner case를 미래 reader가 "버그"로 오해해서 grapheme cluster로 "고치는" 회귀를 막아둡니다.

trim_snippet이 `.chars()` (Unicode scalar value) 단위로 자르는 trade-off를 코멘트로 남긴 점이 좋습니다. design §6.4의 "characters" 정의가 USV 기준이라 spec에 부합하고, 동시에 Hebrew niqqud / Devanagari combining mark가 orphan될 수 있는 corner case를 미래 reader가 "버그"로 오해해서 grapheme cluster로 "고치는" 회귀를 막아둡니다.
span_shape,
"kb-search lexical: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}"
);
Citation::Line {
path,
start: 1,
end: 1,
section,
}
}

Byte/empty-array source_span에 Citation::Line { 1, 1 } fallback + tracing::warn! 패턴이 의도를 ��� 표현합니다. 데이터 무결성 issue를 silent하게 가리지 않고 (warn으로 surface) 동시에 retrieval 자체는 멈추지 않게 (fallback으로 forward) — forward-compat regression이 로그에 잡힙니다.

Byte/empty-array source_span에 `Citation::Line { 1, 1 }` fallback + `tracing::warn!` 패턴이 의도를 ��� 표현합니다. 데이터 무결성 issue를 silent하게 가리지 않고 (warn으로 surface) 동시에 retrieval 자체는 멈추지 않게 (fallback으로 forward) — forward-compat regression이 로그에 잡힙니다.
}
}

globset의 literal_separator(true). 이 한 줄이 spec Risks/notes의 "*/을 cross하면 안 됨" 요건을 만족시킵니다. globset default가 cross하는지 empirical하게 확인하고 명시적으로 켠 점이 좋고, compile_glob_star_does_not_cross_slash 단위 테스트로 invariant을 pin한 것도 적절합니다.

globset의 `literal_separator(true)`. 이 한 줄이 spec Risks/notes의 "`*`이 `/`을 cross하면 안 됨" 요건을 만족시킵니다. globset default가 cross하는지 empirical하게 확인하고 명시적으로 켠 점이 좋고, `compile_glob_star_does_not_cross_slash` 단위 테스트로 invariant을 pin한 것도 적절합니다.
/// Cap the snippet at `max_chars` characters (Unicode scalar values, not
/// bytes — matches the §6.4 setting's "characters" semantics). Returns
/// the input unchanged when already short enough.
fn trim_snippet(s: &str, max_chars: usize) -> String {
// We slice on Unicode scalar values per §6.4's "characters" semantics; this
// can orphan a combining mark in extreme cases (Hebrew niqqud, Devanagari)
// but matches the spec's char-budget definition.
if s.chars().count() <= max_chars {
return s.to_string();
}
s.chars().take(max_chars).collect()
}
// ── path_glob ────────────────────────────────────────────────────────────
/// Compile a `path_glob` pattern. We enable `literal_separator` so `*`
/// does NOT cross `/` — design requires `*` to match within a single
/// path segment, not across them. (`globset`'s default is to let `*`
/// span separators.)
fn compile_glob(pattern: &str) -> Result<GlobMatcher> {
let g = globset::GlobBuilder::new(pattern)
.literal_separator(true)
.build()
.with_context(|| format!("kb-search lexical: invalid path_glob {pattern:?}"))?;
Ok(g.compile_matcher())
}
// ── Unit tests for pure helpers ──────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn build_match_string_empty_returns_none() {
assert!(build_match_string("").is_none());
assert!(build_match_string(" ").is_none());
assert!(build_match_string("''").is_none());
assert!(build_match_string("' '").is_none());
}
#[test]
fn build_match_string_default_is_quoted_and_anded() {
let s = build_match_string("rust cargo").unwrap();
// Two tokens, each quoted, joined by a space (implicit AND).
assert_eq!(s, r#""rust" "cargo""#);
}
#[test]
fn build_match_string_escapes_special_chars() {
// `*`, `(`, `)`, `:`, `^`, `"` should all be wrapped inside
// FTS5 string-literal quotes so they're treated as literal
// text rather than FTS5 operators.
let s = build_match_string(r#"foo* (bar) baz:qux ^head he"llo"#).unwrap();
assert_eq!(
s,
r#""foo*" "(bar)" "baz:qux" "^head" "he""llo""#
);
// The doubled `""` is FTS5's way of embedding a literal quote
// inside a string literal.
assert!(s.contains(r#"he""llo"#));
// Sanity: every special character lives between matching `"`
// delimiters — there is no bare-token (unquoted) span anywhere.
// We check this by confirming the string starts and ends with `"`
// and the count of unescaped `"` is even (each token is wrapped).
assert!(s.starts_with('"') && s.ends_with('"'));
}
#[test]
fn build_match_string_passthrough_when_single_quoted() {
// The FTS5 expression is preserved verbatim.
let s = build_match_string("'foo OR bar*'").unwrap();
assert_eq!(s, "foo OR bar*");
}
#[test]
fn normalize_bm25_top_score_in_unit_interval() {
// A "perfect" hit is bm25 = -1.0 → normalized 0.5.
// A high-relevance hit (bm25 = -10.0) → 10/11 ≈ 0.909.
let high = normalize_bm25(-10.0);
assert!(high > 0.0 && high <= 1.0, "got {high}");
let medium = normalize_bm25(-1.0);
assert!((medium - 0.5).abs() < 1e-6);
}
#[test]
fn normalize_bm25_monotonic() {
// Lower (more-negative) bm25 must map to a higher normalized score.
let a = normalize_bm25(-2.0);
let b = normalize_bm25(-1.0);
assert!(a > b, "{a} should exceed {b}");
}
#[test]
fn trim_snippet_caps_at_char_count() {
let s = "a".repeat(300);
let trimmed = trim_snippet(&s, 220);
assert_eq!(trimmed.chars().count(), 220);
}
#[test]
fn trim_snippet_passthrough_when_short() {
let s = "short";
assert_eq!(trim_snippet(s, 220), "short");
}
#[test]
fn build_citation_line_round_trip() {
let p = WorkspacePath::new("a/b.md".to_string()).unwrap();
let span = SourceSpan::Line { start: 7, end: 12 };
let c = build_citation("c1", p.clone(), Some("S1".to_string()), Some(&span));
match c {
Citation::Line {
start,
end,
ref section,
path: ref pp,
} => {
assert_eq!(start, 7);
assert_eq!(end, 12);
assert_eq!(section.as_deref(), Some("S1"));
assert_eq!(pp, &p);
}
other => panic!("expected Citation::Line, got {other:?}"),
}
}
#[test]
fn build_citation_page_forwards_section() {
let p = WorkspacePath::new("doc.pdf".to_string()).unwrap();
let span = SourceSpan::Page {
page: 4,
char_start: None,
char_end: None,
};
let c = build_citation("c1", p, Some("Intro".to_string()), Some(&span));
match c {
Citation::Page {
page,
ref section,
..
} => {
assert_eq!(page, 4);
assert_eq!(section.as_deref(), Some("Intro"));
}
other => panic!("expected Citation::Page, got {other:?}"),
}
}
#[test]
fn build_citation_none_falls_back_to_line_one() {
let p = WorkspacePath::new("x.md".to_string()).unwrap();
let c = build_citation("c1", p, None, None);
match c {
Citation::Line { start, end, .. } => {
assert_eq!((start, end), (1, 1));
}
other => panic!("expected fallback Citation::Line, got {other:?}"),
}
}
#[test]
fn compile_glob_rejects_invalid_pattern() {
// `[` is a character-class opener; an unclosed class is invalid.
let r = compile_glob("notes/[abc");
assert!(r.is_err());
}
#[test]
fn compile_glob_star_does_not_cross_slash() {
// This is the design invariant: `*` must NOT match `/`.
let m = compile_glob("notes/*.md").unwrap();
assert!(m.is_match("notes/foo.md"));
assert!(!m.is_match("notes/sub/foo.md"));
}
}

View File

@@ -0,0 +1,15 @@
//! `kb-search` — `kb_core::Retriever` implementations.
//!
//! P2-2 ships [`LexicalRetriever`], a SQLite-FTS5-backed retriever for
//! `SearchMode::Lexical`. Vector + Hybrid retrievers land in P3-3 / P3-4.
//!
//! Allowed deps per task spec: `kb-core`, `kb-config`, `kb-store-sqlite`,
//! `rusqlite`, `globset`, `tracing`, `thiserror`, `anyhow`. Forbidden:
//! `kb-source-fs`, `kb-parse-md`, `kb-normalize`, `kb-chunk`,
//! `kb-store-vector`, `kb-embed*`, `kb-llm*`, `kb-rag`, `kb-tui`,
//! `kb-desktop`. Only `serde_json` is a transitive helper used to decode
//! JSON-typed columns from `chunks` / `documents`.
mod lexical;
pub use lexical::LexicalRetriever;

View File

@@ -0,0 +1,60 @@
[

snapshot baseline의 stability 메모(rusqlite bundled SQLite + KB_UPDATE_SNAPSHOTS=1 재생성 절차)를 테스트 코드 측에 둔 점이 적절합니다. 향후 SQLite bump으로 bm25 알고리즘이나 tokenizer가 바뀔 때 reviewer가 무엇을 해야 하는지 즉시 파악 가능합니다.

snapshot baseline의 stability 메모(rusqlite bundled SQLite + KB_UPDATE_SNAPSHOTS=1 재생성 절차)를 테스트 코드 측에 둔 점이 적절합니다. 향후 SQLite bump으로 bm25 알고리즘이나 tokenizer가 바뀔 때 reviewer가 무엇을 해야 하는지 즉시 파악 가능합니다.
{
"chunk_id": "c3000000000000000000000000000000",
"chunker_version": "v1",
"citation": {
"end": 8,
"kind": "line",
"path": "notes/snap.md",
"section": "Snap",
"start": 7
},
"doc_id": "d0000000000000000000000000000000",
"doc_path": "notes/snap.md",
"embedding_model": null,
"heading_path": [
"Snap"
],
"index_version": "v1.0",
"rank": 1,
"retrieval": {
"fusion_score": 1.4490997273242101e-6,
"lexical_rank": 1,
"lexical_score": 1.4490997273242101e-6,
"method": "lexical",
"vector_rank": null,
"vector_score": null
},
"section_label": "Snap",
"snippet": "alpha alpha"
},
{
"chunk_id": "c1000000000000000000000000000000",
"chunker_version": "v1",
"citation": {
"end": 2,
"kind": "line",
"path": "notes/snap.md",
"section": "Snap",
"start": 1
},
"doc_id": "d0000000000000000000000000000000",
"doc_path": "notes/snap.md",
"embedding_model": null,
"heading_path": [
"Snap"
],
"index_version": "v1.0",
"rank": 2,
"retrieval": {
"fusion_score": 9.641424867368187e-7,
"lexical_rank": 2,
"lexical_score": 9.641424867368187e-7,
"method": "lexical",
"vector_rank": null,
"vector_score": null
},
"section_label": "Snap",
"snippet": "alpha bravo charlie"
}
]

View File

@@ -0,0 +1,666 @@
//! P2-2 integration tests for `LexicalRetriever`.
//!
//! Strategy: seed the SQLite store via raw inserts with `foreign_keys =
//! OFF` (mirroring the P2-1 FTS tests). This avoids dragging
//! `kb-parse-md` / `kb-normalize` / `kb-chunk` into kb-search's dev-deps,
//! which would violate the task's "Allowed deps" list.
use std::sync::Arc;
use kb_config::Config;
use kb_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
use kb_search::LexicalRetriever;
use kb_store_sqlite::SqliteStore;
use rusqlite::Connection;
use tempfile::TempDir;
// ── Test scaffolding ─────────────────────────────────────────────────────
struct Env {
_temp: TempDir,
store: Arc<SqliteStore>,
db_path: std::path::PathBuf,
}
impl Env {
fn new() -> Self {
let temp = tempfile::tempdir().expect("tempdir");
let mut config = Config::defaults();
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
let store = SqliteStore::open(&config).expect("open store");
store.run_migrations().expect("run migrations");
let db_path = temp.path().join("kb.sqlite");
Self {
_temp: temp,
store: Arc::new(store),
db_path,
}
}
/// Side-channel raw connection with FK enforcement off — same
/// trick used by P2-1's FTS tests so we can seed `chunks` /
/// `documents` directly without the full ingest graph.
fn raw_conn(&self) -> Connection {
let conn = Connection::open(&self.db_path).expect("open side conn");
conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
conn
}
fn retriever(&self) -> LexicalRetriever {
LexicalRetriever::new(
Arc::clone(&self.store),
IndexVersion("v1.0".to_string()),
)
}
fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever {
LexicalRetriever::with_settings(
Arc::clone(&self.store),
IndexVersion("v1.0".to_string()),
snippet_chars,
)
}
}
/// Minimal documents row. Many columns are NOT NULL and we don't care
/// about their exact values for retrieval tests, so we wedge in
/// reasonable defaults.
#[allow(clippy::too_many_arguments)]
fn insert_document(
conn: &Connection,
doc_id: &str,
workspace_path: &str,
title: &str,
lang: &str,
trust_level: &str,
tags: &[&str],
) {
// assets row first — documents.asset_id has a FK with ON DELETE
// RESTRICT but FKs are OFF on this connection. Still we insert a
// matching row so JOINs pick it up.
let asset_id = format!("{:0>32}", &doc_id[..1.min(doc_id.len())]); // 32-hex-ish
let asset_id = format!("{:0>32}", asset_id.chars().take(32).collect::<String>());
conn.execute(
"INSERT OR IGNORE INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, 'file:///x', ?, '\"markdown\"', 0,
'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
rusqlite::params![asset_id, workspace_path],
)
.expect("insert asset");
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang,
source_type, trust_level, parser_version,
doc_version, schema_version, metadata_json,
provenance_json, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, 'markdown', ?, 'pv1', 1, 1,
'{}', '{\"events\":[]}',
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')",
rusqlite::params![doc_id, asset_id, workspace_path, title, lang, trust_level],
)
.expect("insert document");
for tag in tags {
conn.execute(
"INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)",
rusqlite::params![doc_id, tag],
)
.expect("insert tag");
}
}
#[allow(clippy::too_many_arguments)]
fn insert_chunk(
conn: &Connection,
chunk_id: &str,
doc_id: &str,
text: &str,
heading_path: &[&str],
section_label: Option<&str>,
source_spans_json: &str,
chunker_version: &str,
) {
let heading_json = serde_json::to_string(heading_path).unwrap();
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z')",
rusqlite::params![
chunk_id,
doc_id,
text,
heading_json,
section_label,
source_spans_json,
chunker_version,
],
)
.expect("insert chunk");
}
/// Pad a short ID to the 32-hex shape kb_core newtypes expect.
fn id32(prefix: &str) -> String {
let mut s = prefix.to_string();
while s.len() < 32 {
s.push('0');
}
s.truncate(32);
s
}
// ── Tests ────────────────────────────────────────────────────────────────
#[test]
fn lexical_empty_corpus_returns_empty_vec() {
let env = Env::new();
let r = env.retriever();
let q = SearchQuery {
text: "rust".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
};
let hits = r.search(&q).expect("search");
assert!(hits.is_empty(), "empty corpus must yield empty Vec");
}
#[test]
fn lexical_empty_query_returns_empty_vec_without_db_hit() {
// Even with rows in the DB, a blank query must short-circuit to [].
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/a.md", "A", "en", "primary", &[]);
insert_chunk(
&conn,
&id32("c1"),
&id32("d"),
"rust cargo macros",
&["A"],
None,
r#"[{"kind":"line","start":1,"end":3}]"#,
"v1",
);
drop(conn);
let r = env.retriever();
for empty in ["", " ", "''"] {
let q = SearchQuery {
text: empty.to_string(),
mode: SearchMode::Lexical,
k: 5,
filters: SearchFilters::default(),
};
let hits = r.search(&q).unwrap();
assert!(hits.is_empty(), "query {empty:?} must yield empty Vec");
}
}
#[test]
fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]);
insert_chunk(
&conn,
&id32("c1"),
&id32("d"),
"Rust borrow checker enforces ownership.",
&["Notes"],
Some("Notes"),
r#"[{"kind":"line","start":4,"end":4}]"#,
"v1",
);
drop(conn);
let r = env.retriever();
let q = SearchQuery {
text: "borrow".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
};
let hits = r.search(&q).expect("search");
assert_eq!(hits.len(), 1);
let h = &hits[0];
assert_eq!(h.rank, 1);
assert_eq!(h.doc_path.0, "notes/rust.md");
assert_eq!(h.heading_path, vec!["Notes".to_string()]);
assert_eq!(h.section_label.as_deref(), Some("Notes"));
assert_eq!(h.retrieval.method, SearchMode::Lexical);
assert_eq!(h.retrieval.lexical_rank, Some(1));
assert!(h.retrieval.vector_score.is_none());
// Citation round-trips through `to_uri`/`parse` (line variant).
let uri = h.citation.to_uri();
let parsed = kb_core::Citation::parse(&uri).expect("parse uri");
// Reparsed citation has section=None (URI fragment doesn't carry it),
// so compare by `to_uri` equivalence rather than struct equality.
assert_eq!(parsed.to_uri(), uri);
// Sanity: this is a Line citation matching the seeded source span.
assert_eq!(uri, "notes/rust.md#L4");
}
#[test]
fn lexical_snippet_length_capped_at_snippet_chars() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(
&conn,
&id32("d"),
"notes/long.md",
"Long",
"en",
"primary",
&[],
);
// A text long enough that FTS5 might return a snippet > 80 chars
// when given a high word budget. We instead set a tight cap below
// and rely on `trim_snippet` as the backstop.
let mut text = String::new();
for _ in 0..50 {
text.push_str("alpha beta gamma delta epsilon ");
}
insert_chunk(
&conn,
&id32("c1"),
&id32("d"),
&text,
&["Long"],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
drop(conn);
// Set snippet_chars to a known bound; the retriever clamps + trims
// any snippet to fit.
let r = env.retriever_with_snippet_chars(80);
let hits = r
.search(&SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 1,
filters: SearchFilters::default(),
})
.unwrap();
assert_eq!(hits.len(), 1);
assert!(
hits[0].snippet.chars().count() <= 80,
"snippet must be ≤ snippet_chars; got {} chars: {:?}",
hits[0].snippet.chars().count(),
hits[0].snippet
);
}
#[test]
fn lexical_filter_tags_any_excludes_untagged_docs() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]);
insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]);
insert_chunk(
&conn,
&id32("c1"),
&id32("d1"),
"ownership and borrow checker",
&["A"],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
insert_chunk(
&conn,
&id32("c2"),
&id32("d2"),
"borrow semantics in python",
&["B"],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
drop(conn);
let r = env.retriever();
let q = SearchQuery {
text: "borrow".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters {
tags_any: vec!["rust".to_string()],
..Default::default()
},
};
let hits = r.search(&q).unwrap();
assert_eq!(hits.len(), 1, "tags_any=[rust] must exclude python doc");
assert_eq!(hits[0].doc_path.0, "notes/a.md");
}
#[test]
fn lexical_filter_lang_and_trust_min_compose() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d1"), "ko/a.md", "A", "ko", "primary", &[]);
insert_document(&conn, &id32("d2"), "en/b.md", "B", "en", "primary", &[]);
insert_document(&conn, &id32("d3"), "en/c.md", "C", "en", "generated", &[]);
for (cid, did, body) in [
("c1", "d1", "검색 키워드 alpha"),
("c2", "d2", "alpha bravo"),
("c3", "d3", "alpha gamma"),
] {
insert_chunk(
&conn,
&id32(cid),
&id32(did),
body,
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
}
drop(conn);
let r = env.retriever();
// lang=en + trust_min=secondary → only d2 (primary ≥ secondary).
let hits = r
.search(&SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters {
lang: Some(Lang("en".to_string())),
trust_min: Some(TrustLevel::Secondary),
..Default::default()
},
})
.unwrap();
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].doc_path.0, "en/b.md");
}
#[test]
fn lexical_filter_path_glob_does_not_cross_slash() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &[]);
insert_document(&conn, &id32("d2"), "notes/sub/b.md", "B", "en", "primary", &[]);
insert_chunk(
&conn,
&id32("c1"),
&id32("d1"),
"shared keyword",
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
insert_chunk(
&conn,
&id32("c2"),
&id32("d2"),
"shared keyword",
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
drop(conn);
let r = env.retriever();
let hits = r
.search(&SearchQuery {
text: "keyword".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters {
path_glob: Some("notes/*.md".to_string()),
..Default::default()
},
})
.unwrap();
let paths: Vec<&str> = hits.iter().map(|h| h.doc_path.0.as_str()).collect();
assert_eq!(paths, vec!["notes/a.md"], "* must not match across `/`");
}
#[test]
fn lexical_citation_round_trip_against_first_source_span() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/m.md", "M", "en", "primary", &[]);
insert_chunk(
&conn,
&id32("c1"),
&id32("d"),
"echo bravo",
&[],
None,
// Two spans; the citation uses the first.
r#"[{"kind":"line","start":12,"end":34},{"kind":"line","start":60,"end":61}]"#,
"v1",
);
drop(conn);
let r = env.retriever();
let hits = r
.search(&SearchQuery {
text: "bravo".to_string(),
mode: SearchMode::Lexical,
k: 1,
filters: SearchFilters::default(),
})
.unwrap();
assert_eq!(hits.len(), 1);
let uri = hits[0].citation.to_uri();
assert_eq!(uri, "notes/m.md#L12-L34");
let parsed = kb_core::Citation::parse(&uri).unwrap();
assert_eq!(parsed.to_uri(), uri);
}
#[test]
fn lexical_top_score_within_unit_interval_three_chunks() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
// Three chunks of varying relevance to the query 'alpha':
// c1: alpha alpha alpha (best)
// c2: alpha bravo
// c3: bravo charlie alpha (one occurrence)
for (cid, body) in [
("c1", "alpha alpha alpha keyword"),
("c2", "alpha bravo charlie"),
("c3", "bravo charlie alpha"),
] {
insert_chunk(
&conn,
&id32(cid),
&id32("d"),
body,
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
}
drop(conn);
let r = env.retriever();
let hits = r
.search(&SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
})
.unwrap();
assert!(!hits.is_empty(), "must surface at least one hit");
let top = hits[0].retrieval.fusion_score;
assert!(
top > 0.0 && top <= 1.0,
"top normalized score must be in (0, 1]; got {top}"
);
// All scores in [0, 1].
for h in &hits {
let s = h.retrieval.fusion_score;
assert!((0.0..=1.0).contains(&s), "hit score {s} out of [0, 1]");
// lexical_score and fusion_score equal in lexical-only mode.
assert_eq!(h.retrieval.lexical_score, Some(s));
}
// bm25 should rank c1 (3 occurrences) above c2 / c3.
assert!(hits[0].chunk_id.0.starts_with("c1"));
}
#[test]
fn lexical_determinism_same_query_twice() {
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
for (cid, body) in [
("c1", "alpha alpha"),
("c2", "alpha bravo"),
("c3", "alpha charlie"),
("c4", "alpha delta"),
] {
insert_chunk(
&conn,
&id32(cid),
&id32("d"),
body,
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
}
drop(conn);
let r = env.retriever();
let q = SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
};
let a = r.search(&q).unwrap();
let b = r.search(&q).unwrap();
assert_eq!(a, b, "same DB + same query must yield identical Vec<SearchHit>");
}
#[test]
fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() {
// Two chunks with byte-identical text + length → identical bm25 scores

동일 bm25를 발생시키는 별도 결정성 테스트를 추가한 게 결정적입니다. 일반 결정성 테스트(varied scores)만 있으면 chunk_id tiebreaker 경로가 한 번도 실행되지 않은 채로 "determinism PASS"가 나오는데, 동일 텍스트 두 청크 케이스로 그 경로를 실제로 exercise합니다.

동일 bm25를 발생시키는 별도 결정성 테스트를 추가한 게 결정적입니다. 일반 결정성 테스트(varied scores)만 있으면 chunk_id tiebreaker 경로가 한 번도 실행되지 않은 채로 "determinism PASS"가 나오는데, 동일 텍스트 두 청크 케이스로 그 경로를 실제로 exercise합니다.
// for any `MATCH` against them. The retriever must fall back to
// `chunk_id` ordering so the result is stable across runs.
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]);
let cid_a = id32("aaaa");
let cid_b = id32("bbbb");
assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id");
for cid in [&cid_a, &cid_b] {
insert_chunk(
&conn,
cid,
&id32("d"),
"alpha bravo charlie",
&[],
None,
r#"[{"kind":"line","start":1,"end":1}]"#,
"v1",
);
}
drop(conn);
let r = env.retriever();
let q = SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
};
let a = r.search(&q).unwrap();
let b = r.search(&q).unwrap();
assert_eq!(a.len(), 2, "both chunks should match");
// bm25 must be equal for byte-identical chunks; the secondary sort
// by chunk_id pins the order.
assert!(
(a[0].retrieval.fusion_score - a[1].retrieval.fusion_score).abs() < 1e-9,
"byte-identical chunks must score equally; got {} vs {}",
a[0].retrieval.fusion_score,
a[1].retrieval.fusion_score
);
assert!(
a[0].chunk_id.0 < a[1].chunk_id.0,
"tiebreaker must order by chunk_id ascending; got {} then {}",
a[0].chunk_id.0,
a[1].chunk_id.0
);
assert_eq!(a, b, "tiebreaker order must be stable across runs");
}
#[test]
fn lexical_index_version_is_returned_unchanged() {
let env = Env::new();
let r = LexicalRetriever::new(
Arc::clone(&env.store),
IndexVersion("custom-label-1".to_string()),
);
assert_eq!(r.index_version().0, "custom-label-1");
}
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
// `Vec<SearchHit>` for a fixed query is checked verbatim against
// `tests/fixtures/search/lexical/run-1.json`. Update both sides in
// the same commit when intentional changes ship.
// Stable because rusqlite ships bundled SQLite — a tokenizer/bm25 algorithm change in a future SQLite bump will require regenerating run-1.json via `KB_UPDATE_SNAPSHOTS=1`.
let env = Env::new();
let conn = env.raw_conn();
insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]);
for (cid, body, span) in [
(
"c1",
"alpha bravo charlie",
r#"[{"kind":"line","start":1,"end":2}]"#,
),
(
"c2",
"bravo only here",
r#"[{"kind":"line","start":4,"end":5}]"#,
),
(
"c3",
"alpha alpha",
r#"[{"kind":"line","start":7,"end":8}]"#,
),
] {
insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1");
}
drop(conn);
let r = env.retriever();
let hits = r
.search(&SearchQuery {
text: "alpha".to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: SearchFilters::default(),
})
.unwrap();
let actual = serde_json::to_value(&hits).unwrap();
let baseline_path =
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/search/lexical/run-1.json");
if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() {
std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap()).unwrap();
}
let baseline_text = std::fs::read_to_string(&baseline_path)
.expect("baseline snapshot must exist; run with KB_UPDATE_SNAPSHOTS=1 to seed");
let expected: serde_json::Value = serde_json::from_str(&baseline_text).unwrap();
assert_eq!(actual, expected, "lexical run-1 snapshot drift");
}

View File

@@ -109,6 +109,22 @@ impl SqliteStore {
self.conn.lock().unwrap_or_else(|p| p.into_inner())
}
/// Read-only borrow of the connection.
///
/// Provided so sibling crates (e.g. `kb-search`) can run SELECTs
/// against the schema owned by this crate without re-opening the
/// SQLite file. Callers MUST treat the returned `Connection` as
/// read-only — issuing mutating SQL (INSERT / UPDATE / DELETE / DDL)
/// through this guard bypasses the per-document transaction discipline
/// (`put_*` methods) and the FTS5 backfill helpers that the store
/// layer enforces. Mutating callers must use `kb-store-sqlite`'s own
/// public write methods instead.
///
/// Poisoning is recovered the same way as [`Self::lock_conn`].
pub fn read_conn(&self) -> MutexGuard<'_, Connection> {
self.conn.lock().unwrap_or_else(|p| p.into_inner())

read_conn doc-comment가 contract 의도는 명확하지만 &Connection 타입으로는 mutation을 막을 수 없다는 점은 PR 본문에 솔직하게 적어두셨습니다. closure scope (with_read_conn<R>(&self, f: impl FnOnce(&Connection) -> R) -> R) 변형이 type-system 차원에서 더 단단하긴 한데 prepare_cached + 행 iterator 패턴과 lifetimes가 awkward해진다는 trade-off도 있습니다 — P3 follow-up으로 미룬 결정에 동의합니다.

`read_conn` doc-comment가 contract 의도는 명확하지만 `&Connection` 타입으로는 mutation을 막을 수 없다는 점은 PR 본문에 솔직하게 적어두셨습니다. closure scope (`with_read_conn<R>(&self, f: impl FnOnce(&Connection) -> R) -> R`) 변형이 type-system 차원에서 더 단단하긴 한데 prepare_cached + 행 iterator 패턴과 lifetimes가 awkward해진다는 trade-off도 있습니다 — P3 follow-up으로 미룬 결정에 동의합니다.
}
/// Persist a `RawAsset` *with its raw bytes*: row goes into `assets`,
/// bytes go to `data_dir/assets/<aa>/<asset_id>` if `byte_len ≤
/// copy_threshold_mb`, otherwise the row records the source URI's