feat(p2-2): lexical-retriever — kb-search crate + LexicalRetriever (FTS5 + bm25) #13
16
Cargo.lock
generated
@@ -801,6 +801,22 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-search"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"kb-store-sqlite",
|
||||
"rusqlite",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-source-fs"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -9,6 +9,7 @@ members = [
|
||||
"crates/kb-normalize",
|
||||
"crates/kb-chunk",
|
||||
"crates/kb-store-sqlite",
|
||||
"crates/kb-search",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
@@ -29,3 +30,8 @@ time = { version = "0.3", features = ["serde", "macros", "formatting", "
|
||||
uuid = { version = "1", features = ["v7", "serde"] }
|
||||
blake3 = "1"
|
||||
tracing = "0.1"
|
||||
# `bundled` ships SQLite source so the workspace doesn't depend on a
|
||||
# system libsqlite3 (matches the kb-store-sqlite feature set).
|
||||
rusqlite = { version = "0.32", features = ["bundled"] }
|
||||
globset = "0.4"
|
||||
tempfile = "3"
|
||||
|
||||
22
crates/kb-search/Cargo.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[package]
|
||||
name = "kb-search"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Retriever implementations for kb (P2-2 lexical FTS5; P3 vector / hybrid will follow)"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
kb-store-sqlite = { path = "../kb-store-sqlite" }
|
||||
rusqlite = { workspace = true }
|
||||
globset = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
649
crates/kb-search/src/lexical.rs
Normal file
@@ -0,0 +1,649 @@
|
||||
//! Lexical (FTS5 + bm25) retriever — design §3.7 / §1.5 / §2.2 / §6.4.
|
||||
//!
|
||||
//! Owns the SQL pattern documented in `tasks/p2/p2-2-lexical-retriever.md`
|
||||
//! and constructs `kb_core::SearchHit` values directly from the joined
|
||||
//! `chunks_fts` / `chunks` / `documents` rows. Reads only — never mutates
|
||||
//! the underlying SQLite file.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use globset::GlobMatcher;
|
||||
use kb_core::{
|
||||
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail,
|
||||
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan,
|
||||
TrustLevel, WorkspacePath,
|
||||
};
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
use rusqlite::{params_from_iter, Connection, Row, ToSql};
|
||||
|
||||
// ── Tunables ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// FTS5 hard limit on the `snippet()` `nToken` argument.
|
||||
/// See SQLite's FTS5 docs: snippet() rejects nToken > 64.
|
||||
const FTS5_SNIPPET_MAX_WORDS: usize = 64;
|
||||
|
||||
/// Floor for the snippet word budget. `snippet_chars / 4` may yield 0 for
|
||||
/// pathologically small configs; we always ask FTS5 for at least one word
|
||||
/// so it can still return something matchable for the test harness.
|
||||
const FTS5_SNIPPET_MIN_WORDS: usize = 1;
|
||||
|
||||
/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10.
|
||||
const DEFAULT_K: usize = 10;
|
||||
|
||||
/// When `path_glob` is set we have to over-fetch and post-filter in Rust,
|
||||
/// because SQLite's GLOB operator treats `*` as "any chars including `/`",
|
||||
/// which contradicts the design rule that `*` must NOT cross path
|
||||
/// separators. Empirically `+128` is generous for any realistic workspace
|
||||
/// and bounded enough to keep memory predictable.
|
||||
const PATH_GLOB_OVERFETCH: usize = 128;
|
||||
|
||||
// ── Public surface ───────────────────────────────────────────────────────
|
||||
|
||||
/// Lexical retriever backed by SQLite FTS5 + bm25.
|
||||
pub struct LexicalRetriever {
|
||||
store: Arc<SqliteStore>,
|
||||
index_version: IndexVersion,
|
||||
/// Number of `snippet()` words derived from `kb-config::search.snippet_chars`,
|
||||
/// clamped into `[FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS]`.
|
||||
snippet_words: usize,
|
||||
/// Hard cap on the returned snippet's character length per design §6.4.
|
||||
snippet_chars: usize,
|
||||
}
|
||||
|
||||
impl LexicalRetriever {
|
||||
/// Construct with default settings derived from `kb-config`'s defaults.
|
||||
/// Snippet width is computed from `Config::defaults().search.snippet_chars`.
|
||||
pub fn new(store: Arc<SqliteStore>, index_version: IndexVersion) -> Self {
|
||||
let cfg = kb_config::Config::defaults();
|
||||
Self::with_settings(store, index_version, cfg.search.snippet_chars)
|
||||
}
|
||||
|
||||
/// Construct with explicit `snippet_chars`. Used by tests / callers
|
||||
/// that have already loaded a `Config`.
|
||||
pub fn with_settings(
|
||||
store: Arc<SqliteStore>,
|
||||
index_version: IndexVersion,
|
||||
snippet_chars: usize,
|
||||
) -> Self {
|
||||
// Heuristic: 1 token ≈ 4 chars (English-leaning estimate; Korean
|
||||
// tokens average shorter, so the cap-by-chars trim below is what
|
||||
// actually enforces the contract). The `/4` keeps us well below
|
||||
// FTS5's nToken=64 limit for typical snippet_chars=220 budgets.
|
||||
let raw = snippet_chars / 4;
|
||||
let snippet_words = raw.clamp(FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS);
|
||||
Self {
|
||||
store,
|
||||
index_version,
|
||||
snippet_words,
|
||||
snippet_chars,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Retriever for LexicalRetriever {
|
||||
fn search(&self, query: &SearchQuery) -> Result<Vec<SearchHit>> {
|
||||
let match_opt = build_match_string(&query.text);
|
||||
let k = if query.k == 0 { DEFAULT_K } else { query.k };
|
||||
let filters = &query.filters;
|
||||
// One-line summary at request entry. Filter shape only — no
|
||||
// tag/lang/path values, which could be PII-sensitive.
|
||||
|
|
||||
tracing::debug!(
|
||||
match_str = match_opt.as_deref().unwrap_or("<empty>"),
|
||||
tags_any = filters.tags_any.len(),
|
||||
has_lang = filters.lang.is_some(),
|
||||
has_trust_min = filters.trust_min.is_some(),
|
||||
has_path_glob = filters.path_glob.is_some(),
|
||||
k,
|
||||
"kb-search lexical: search start"
|
||||
);
|
||||
|
||||
// Empty / whitespace-only query → nothing to do. Per spec we
|
||||
// succeed with an empty hit list rather than erroring.
|
||||
let match_str = match match_opt {
|
||||
Some(s) => s,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
// Pre-compile the path_glob once. The `Glob` produced rejects
|
||||
// syntactically invalid patterns at construction time so the
|
||||
// caller gets a clear error rather than a silent empty result.
|
||||
let path_matcher = match &filters.path_glob {
|
||||
Some(g) => Some(compile_glob(g)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
// Fetch budget: when post-filtering by glob we need to over-fetch
|
||||
// so that the final `take(k)` still has enough rows after culling.
|
||||
let fetch_limit = if path_matcher.is_some() {
|
||||
k.saturating_add(PATH_GLOB_OVERFETCH)
|
||||
} else {
|
||||
k
|
||||
};
|
||||
|
||||
let conn = self.store.read_conn();
|
||||
let raw_rows = run_query(
|
||||
&conn,
|
||||
&match_str,
|
||||
self.snippet_words,
|
||||
filters,
|
||||
fetch_limit,
|
||||
)?;
|
||||
|
||||
let mut hits: Vec<SearchHit> = Vec::with_capacity(raw_rows.len().min(k));
|
||||
let mut rank: u32 = 0;
|
||||
for row in raw_rows {
|
||||
// Path glob is the only filter we evaluate in Rust because the
|
||||
// semantics differ from SQLite's GLOB (no `/` crossing).
|
||||
if let Some(m) = &path_matcher {
|
||||
if !m.is_match(&row.workspace_path) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
rank = rank.saturating_add(1);
|
||||
let hit = build_hit(row, rank, &self.index_version, self.snippet_chars)?;
|
||||
hits.push(hit);
|
||||
if hits.len() >= k {
|
||||
break;
|
||||
}
|
||||
}
|
||||
tracing::debug!(rows = hits.len(), "kb-search lexical: search done");
|
||||
Ok(hits)
|
||||
}
|
||||
|
||||
fn index_version(&self) -> IndexVersion {
|
||||
self.index_version.clone()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Match-string construction ────────────────────────────────────────────
|
||||
|
||||
/// Translate a user-typed query into an FTS5 match string.
|
||||
///
|
||||
/// Rules (from the task spec):
|
||||
///
|
||||
/// - The query is wrapped in a single pair of `'...'` → strip the quotes
|
||||
/// and pass the inner text through verbatim. The user has explicitly
|
||||
/// opted into FTS5 syntax (e.g. `'rust AND cargo'`, `'foo*'`).
|
||||
///
|
||||
/// - Otherwise: split on whitespace, escape every token by wrapping it
|
||||
/// in `"..."` (FTS5 string literal), with any inner `"` doubled. Join
|
||||
/// with spaces — FTS5 default operator is implicit AND.
|
||||
///
|
||||
/// - An empty / whitespace-only token list → return `None` (caller
|
||||
/// short-circuits to `Ok(vec![])`).
|
||||
fn build_match_string(text: &str) -> Option<String> {
|
||||
let trimmed = text.trim();
|
||||
if trimmed.is_empty() {
|
||||
return None;
|
||||
}
|
||||
if let Some(inner) = strip_single_quotes(trimmed) {
|
||||
let inner_trim = inner.trim();
|
||||
if inner_trim.is_empty() {
|
||||
return None;
|
||||
}
|
||||
return Some(inner_trim.to_string());
|
||||
}
|
||||
let tokens: Vec<String> = trimmed
|
||||
.split_whitespace()
|
||||
.map(escape_fts5_token)
|
||||
.collect();
|
||||
if tokens.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(tokens.join(" "))
|
||||
}
|
||||
}
|
||||
|
||||
/// Return `Some(inner)` if `s` is wrapped in a matching pair of single
|
||||
/// quotes (`'...'`), otherwise `None`. We require the closing quote to
|
||||
/// be the last character so `'foo' bar` doesn't accidentally engage
|
||||
|
claude-reviewer-01
commented
FTS5 매치 빌더의 escape 전략이 단순하면서도 안전합니다. 모든 토큰을 FTS5 매치 빌더의 escape 전략이 단순하면서도 안전합니다. 모든 토큰을 `"..."`로 감싸고 내부 `"`만 이중화하면 FTS5 metacharacter 전체(`*`, `^`, `:`, `(`, `)`)가 한 번에 무력화됩니다. parsing 없이 grammar 우회를 닫는 가장 boring한 정답입니다. 사용자가 의도적으로 FTS5 문법을 쓰고 싶을 때는 `'...'`로 wrap하는 explicit opt-in이 있고요.
|
||||
/// raw-FTS5 mode.
|
||||
fn strip_single_quotes(s: &str) -> Option<&str> {
|
||||
let bytes = s.as_bytes();
|
||||
if bytes.len() >= 2 && bytes[0] == b'\'' && bytes[bytes.len() - 1] == b'\'' {
|
||||
Some(&s[1..s.len() - 1])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// FTS5-escape one token by wrapping it in double quotes (FTS5 string
|
||||
/// literal). Inner `"` are escaped by doubling per FTS5 grammar. This is
|
||||
/// the simple-and-safe approach that defangs every special character —
|
||||
/// `(`, `)`, `*`, `^`, `:`, `"`, etc. — without trying to parse FTS5
|
||||
/// expressions.
|
||||
fn escape_fts5_token(tok: &str) -> String {
|
||||
let mut out = String::with_capacity(tok.len() + 2);
|
||||
out.push('"');
|
||||
for ch in tok.chars() {
|
||||
if ch == '"' {
|
||||
out.push('"');
|
||||
out.push('"');
|
||||
} else {
|
||||
out.push(ch);
|
||||
}
|
||||
}
|
||||
out.push('"');
|
||||
out
|
||||
}
|
||||
|
||||
// ── SQL execution ────────────────────────────────────────────────────────
|
||||
|
||||
/// Raw row shape mirroring the columns selected by [`run_query`]. Kept
|
||||
/// internal — every public path constructs `SearchHit` from this.
|
||||
struct RawRow {
|
||||
chunk_id: String,
|
||||
doc_id: String,
|
||||
bm25_raw: f64,
|
||||
snippet: String,
|
||||
heading_path_json: String,
|
||||
section_label: Option<String>,
|
||||
source_spans_json: String,
|
||||
chunker_version: String,
|
||||
workspace_path: String,
|
||||
}
|
||||
|
||||
/// Build + execute the FTS5 query. The SQL pattern is the one documented
|
||||
/// in `tasks/p2/p2-2-lexical-retriever.md` (§Behavior contract).
|
||||
fn run_query(
|
||||
conn: &Connection,
|
||||
match_str: &str,
|
||||
snippet_words: usize,
|
||||
filters: &SearchFilters,
|
||||
fetch_limit: usize,
|
||||
) -> Result<Vec<RawRow>> {
|
||||
// Build the dynamic SQL + positional parameter vector. Positional `?`
|
||||
// is used (not named bindings) because the dynamic IN-list for
|
||||
// `tags_any` is most natural with `params_from_iter`.
|
||||
let mut sql = String::from(
|
||||
"SELECT \
|
||||
f.chunk_id, f.doc_id, \
|
||||
bm25(chunks_fts) AS score, \
|
||||
snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
|
||||
c.heading_path_json, c.section_label, c.source_spans_json, \
|
||||
c.chunker_version, \
|
||||
d.workspace_path \
|
||||
FROM chunks_fts f \
|
||||
JOIN chunks c ON c.chunk_id = f.chunk_id \
|
||||
JOIN documents d ON d.doc_id = f.doc_id",
|
||||
);
|
||||
|
||||
let mut params: Vec<Box<dyn ToSql>> = Vec::new();
|
||||
// 1) snippet word count.
|
||||
params.push(Box::new(snippet_words as i64));
|
||||
// 2) MATCH expression.
|
||||
sql.push_str(" WHERE chunks_fts MATCH ?");
|
||||
params.push(Box::new(match_str.to_owned()));
|
||||
|
||||
// tags_any: doc must own at least one of the requested tags.
|
||||
if !filters.tags_any.is_empty() {
|
||||
sql.push_str(
|
||||
" AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (",
|
||||
);
|
||||
for (i, tag) in filters.tags_any.iter().enumerate() {
|
||||
if i > 0 {
|
||||
sql.push(',');
|
||||
}
|
||||
sql.push('?');
|
||||
params.push(Box::new(tag.clone()));
|
||||
}
|
||||
sql.push_str("))");
|
||||
}
|
||||
if let Some(lang) = &filters.lang {
|
||||
sql.push_str(" AND d.lang = ?");
|
||||
params.push(Box::new(lang.0.clone()));
|
||||
}
|
||||
if let Some(trust_min) = &filters.trust_min {
|
||||
// Mirror `kb_store_sqlite::documents::list_documents` ranking:
|
||||
// Generated < Secondary < Primary. Doing the rank in SQL
|
||||
// (rather than post-filtering) keeps the row stream short
|
||||
// when the workspace contains many low-trust docs.
|
||||
sql.push_str(
|
||||
" AND CASE d.trust_level \
|
||||
WHEN 'primary' THEN 3 \
|
||||
WHEN 'secondary' THEN 2 \
|
||||
WHEN 'generated' THEN 1 \
|
||||
ELSE 0 \
|
||||
END >= ?",
|
||||
);
|
||||
let rank: i64 = match trust_min {
|
||||
TrustLevel::Primary => 3,
|
||||
TrustLevel::Secondary => 2,
|
||||
TrustLevel::Generated => 1,
|
||||
};
|
||||
params.push(Box::new(rank));
|
||||
}
|
||||
// path_glob is intentionally NOT applied here — see module comment
|
||||
|
claude-reviewer-01
commented
`i64::try_from(fetch_limit).unwrap_or(i64::MAX)` 캐스트 — `usize::MAX` 입력에서 negative LIMIT으로 wrap되어 SQLite가 에러 내는 시나리오를 차단합니다. 실제로 발생할 일은 거의 없지만 robustness side에 둔 게 옳습니다.
|
||||
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
|
||||
|
||||
// Determinism: tie-break on chunk_id so equal bm25 scores produce a
|
||||
// stable order across runs. `f.chunk_id` is the FTS row's UNINDEXED
|
||||
// copy of the same value as `c.chunk_id`; either side works.
|
||||
sql.push_str(" ORDER BY score, f.chunk_id LIMIT ?");
|
||||
params.push(Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX)));
|
||||
|
||||
let mut stmt = conn
|
||||
.prepare(&sql)
|
||||
.context("kb-search lexical: prepare FTS5 statement")?;
|
||||
let rows = stmt
|
||||
.query_map(params_from_iter(params.iter().map(|b| b.as_ref())), row_from_sql)
|
||||
.context("kb-search lexical: execute FTS5 query")?;
|
||||
let mut out: Vec<RawRow> = Vec::new();
|
||||
for r in rows {
|
||||
out.push(r.context("kb-search lexical: read row")?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
|
||||
Ok(RawRow {
|
||||
chunk_id: row.get(0)?,
|
||||
doc_id: row.get(1)?,
|
||||
bm25_raw: row.get(2)?,
|
||||
snippet: row.get(3)?,
|
||||
heading_path_json: row.get(4)?,
|
||||
section_label: row.get(5)?,
|
||||
source_spans_json: row.get(6)?,
|
||||
chunker_version: row.get(7)?,
|
||||
workspace_path: row.get(8)?,
|
||||
})
|
||||
}
|
||||
|
||||
// ── Hit construction ─────────────────────────────────────────────────────
|
||||
|
||||
fn build_hit(
|
||||
raw: RawRow,
|
||||
rank: u32,
|
||||
index_version: &IndexVersion,
|
||||
snippet_chars: usize,
|
||||
) -> Result<SearchHit> {
|
||||
let normalized = normalize_bm25(raw.bm25_raw);
|
||||
let heading_path: Vec<String> = serde_json::from_str(&raw.heading_path_json)
|
||||
.context("kb-search lexical: deserialize heading_path_json")?;
|
||||
let source_spans: Vec<SourceSpan> = serde_json::from_str(&raw.source_spans_json)
|
||||
.context("kb-search lexical: deserialize source_spans_json")?;
|
||||
|
||||
let workspace_path = WorkspacePath::new(raw.workspace_path)
|
||||
.context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?;
|
||||
|
||||
let citation = build_citation(
|
||||
&raw.chunk_id,
|
||||
workspace_path.clone(),
|
||||
raw.section_label.clone(),
|
||||
source_spans.first(),
|
||||
);
|
||||
|
||||
// FTS5's snippet() respects the word budget but produces a
|
||||
// character-length we can't predict precisely (token boundaries vary
|
||||
// with the tokenizer). The contract caps at `snippet_chars`; trim
|
||||
// defensively if SQLite ever returns a longer string.
|
||||
let snippet = trim_snippet(&raw.snippet, snippet_chars);
|
||||
|
||||
Ok(SearchHit {
|
||||
rank,
|
||||
chunk_id: ChunkId(raw.chunk_id),
|
||||
doc_id: DocumentId(raw.doc_id),
|
||||
doc_path: workspace_path,
|
||||
heading_path,
|
||||
section_label: raw.section_label,
|
||||
snippet,
|
||||
citation,
|
||||
retrieval: RetrievalDetail {
|
||||
method: SearchMode::Lexical,
|
||||
fusion_score: normalized,
|
||||
lexical_score: Some(normalized),
|
||||
vector_score: None,
|
||||
lexical_rank: Some(rank),
|
||||
vector_rank: None,
|
||||
},
|
||||
index_version: index_version.clone(),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion(raw.chunker_version),
|
||||
})
|
||||
}
|
||||
|
||||
/// Map the raw bm25 score (FTS5 returns a *negative* number; lower is
|
||||
/// better) into a positive score in `(0, 1]`. The formula
|
||||
/// `score = -bm25 / (1 + |bm25|)` is monotonic, smooth, and bounded —
|
||||
/// suitable both for human display and for use as an RRF input.
|
||||
fn normalize_bm25(bm25_raw: f64) -> f32 {
|
||||
let abs = bm25_raw.abs();
|
||||
let normalized = -bm25_raw / (1.0_f64 + abs);
|
||||
normalized as f32
|
||||
}
|
||||
|
||||
/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown
|
||||
/// only emits `Line`, so the other variants are mostly defensive — we
|
||||
/// forward them as faithfully as possible so a future PDF / image
|
||||
/// extractor can flow through without churn.
|
||||
fn build_citation(
|
||||
chunk_id: &str,
|
||||
path: WorkspacePath,
|
||||
section: Option<String>,
|
||||
first_span: Option<&SourceSpan>,
|
||||
) -> Citation {
|
||||
match first_span {
|
||||
Some(SourceSpan::Line { start, end }) => Citation::Line {
|
||||
path,
|
||||
start: *start,
|
||||
end: *end,
|
||||
section,
|
||||
},
|
||||
Some(SourceSpan::Page { page, .. }) => Citation::Page {
|
||||
path,
|
||||
page: *page,
|
||||
section,
|
||||
},
|
||||
Some(SourceSpan::Region { x, y, w, h }) => Citation::Region {
|
||||
path,
|
||||
x: *x,
|
||||
y: *y,
|
||||
w: *w,
|
||||
h: *h,
|
||||
},
|
||||
Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time {
|
||||
path,
|
||||
start_ms: *start_ms,
|
||||
end_ms: *end_ms,
|
||||
speaker: None,
|
||||
},
|
||||
// Byte-spans don't have a Citation variant. Fall back to a Line
|
||||
// citation pointing at the document head — better than fabricating
|
||||
// a position. Spans-empty falls into the same branch.
|
||||
other @ (Some(SourceSpan::Byte { .. }) | None) => {
|
||||
let span_shape = match other {
|
||||
Some(_) => "Byte",
|
||||
None => "empty array",
|
||||
};
|
||||
tracing::warn!(
|
||||
chunk_id,
|
||||
|
claude-reviewer-01
commented
trim_snippet이 trim_snippet이 `.chars()` (Unicode scalar value) 단위로 자르는 trade-off를 코멘트로 남긴 점이 좋습니다. design §6.4의 "characters" 정의가 USV 기준이라 spec에 부합하고, 동시에 Hebrew niqqud / Devanagari combining mark가 orphan될 수 있는 corner case를 미래 reader가 "버그"로 오해해서 grapheme cluster로 "고치는" 회귀를 막아둡니다.
|
||||
span_shape,
|
||||
"kb-search lexical: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}"
|
||||
);
|
||||
Citation::Line {
|
||||
path,
|
||||
start: 1,
|
||||
end: 1,
|
||||
section,
|
||||
}
|
||||
}
|
||||
|
claude-reviewer-01
commented
Byte/empty-array source_span에 Byte/empty-array source_span에 `Citation::Line { 1, 1 }` fallback + `tracing::warn!` 패턴이 의도를 ��� 표현합니다. 데이터 무결성 issue를 silent하게 가리지 않고 (warn으로 surface) 동시에 retrieval 자체는 멈추지 않게 (fallback으로 forward) — forward-compat regression이 로그에 잡힙니다.
|
||||
}
|
||||
}
|
||||
|
claude-reviewer-01
commented
globset의 globset의 `literal_separator(true)`. 이 한 줄이 spec Risks/notes의 "`*`이 `/`을 cross하면 안 됨" 요건을 만족시킵니다. globset default가 cross하는지 empirical하게 확인하고 명시적으로 켠 점이 좋고, `compile_glob_star_does_not_cross_slash` 단위 테스트로 invariant을 pin한 것도 적절합니다.
|
||||
|
||||
/// Cap the snippet at `max_chars` characters (Unicode scalar values, not
|
||||
/// bytes — matches the §6.4 setting's "characters" semantics). Returns
|
||||
/// the input unchanged when already short enough.
|
||||
fn trim_snippet(s: &str, max_chars: usize) -> String {
|
||||
// We slice on Unicode scalar values per §6.4's "characters" semantics; this
|
||||
// can orphan a combining mark in extreme cases (Hebrew niqqud, Devanagari)
|
||||
// but matches the spec's char-budget definition.
|
||||
if s.chars().count() <= max_chars {
|
||||
return s.to_string();
|
||||
}
|
||||
s.chars().take(max_chars).collect()
|
||||
}
|
||||
|
||||
// ── path_glob ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Compile a `path_glob` pattern. We enable `literal_separator` so `*`
|
||||
/// does NOT cross `/` — design requires `*` to match within a single
|
||||
/// path segment, not across them. (`globset`'s default is to let `*`
|
||||
/// span separators.)
|
||||
fn compile_glob(pattern: &str) -> Result<GlobMatcher> {
|
||||
let g = globset::GlobBuilder::new(pattern)
|
||||
.literal_separator(true)
|
||||
.build()
|
||||
.with_context(|| format!("kb-search lexical: invalid path_glob {pattern:?}"))?;
|
||||
Ok(g.compile_matcher())
|
||||
}
|
||||
|
||||
// ── Unit tests for pure helpers ──────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn build_match_string_empty_returns_none() {
|
||||
assert!(build_match_string("").is_none());
|
||||
assert!(build_match_string(" ").is_none());
|
||||
assert!(build_match_string("''").is_none());
|
||||
assert!(build_match_string("' '").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_match_string_default_is_quoted_and_anded() {
|
||||
let s = build_match_string("rust cargo").unwrap();
|
||||
// Two tokens, each quoted, joined by a space (implicit AND).
|
||||
assert_eq!(s, r#""rust" "cargo""#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_match_string_escapes_special_chars() {
|
||||
// `*`, `(`, `)`, `:`, `^`, `"` should all be wrapped inside
|
||||
// FTS5 string-literal quotes so they're treated as literal
|
||||
// text rather than FTS5 operators.
|
||||
let s = build_match_string(r#"foo* (bar) baz:qux ^head he"llo"#).unwrap();
|
||||
assert_eq!(
|
||||
s,
|
||||
r#""foo*" "(bar)" "baz:qux" "^head" "he""llo""#
|
||||
);
|
||||
// The doubled `""` is FTS5's way of embedding a literal quote
|
||||
// inside a string literal.
|
||||
assert!(s.contains(r#"he""llo"#));
|
||||
// Sanity: every special character lives between matching `"`
|
||||
// delimiters — there is no bare-token (unquoted) span anywhere.
|
||||
// We check this by confirming the string starts and ends with `"`
|
||||
// and the count of unescaped `"` is even (each token is wrapped).
|
||||
assert!(s.starts_with('"') && s.ends_with('"'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_match_string_passthrough_when_single_quoted() {
|
||||
// The FTS5 expression is preserved verbatim.
|
||||
let s = build_match_string("'foo OR bar*'").unwrap();
|
||||
assert_eq!(s, "foo OR bar*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_bm25_top_score_in_unit_interval() {
|
||||
// A "perfect" hit is bm25 = -1.0 → normalized 0.5.
|
||||
// A high-relevance hit (bm25 = -10.0) → 10/11 ≈ 0.909.
|
||||
let high = normalize_bm25(-10.0);
|
||||
assert!(high > 0.0 && high <= 1.0, "got {high}");
|
||||
let medium = normalize_bm25(-1.0);
|
||||
assert!((medium - 0.5).abs() < 1e-6);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_bm25_monotonic() {
|
||||
// Lower (more-negative) bm25 must map to a higher normalized score.
|
||||
let a = normalize_bm25(-2.0);
|
||||
let b = normalize_bm25(-1.0);
|
||||
assert!(a > b, "{a} should exceed {b}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trim_snippet_caps_at_char_count() {
|
||||
let s = "a".repeat(300);
|
||||
let trimmed = trim_snippet(&s, 220);
|
||||
assert_eq!(trimmed.chars().count(), 220);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trim_snippet_passthrough_when_short() {
|
||||
let s = "short";
|
||||
assert_eq!(trim_snippet(s, 220), "short");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_citation_line_round_trip() {
|
||||
let p = WorkspacePath::new("a/b.md".to_string()).unwrap();
|
||||
let span = SourceSpan::Line { start: 7, end: 12 };
|
||||
let c = build_citation("c1", p.clone(), Some("S1".to_string()), Some(&span));
|
||||
match c {
|
||||
Citation::Line {
|
||||
start,
|
||||
end,
|
||||
ref section,
|
||||
path: ref pp,
|
||||
} => {
|
||||
assert_eq!(start, 7);
|
||||
assert_eq!(end, 12);
|
||||
assert_eq!(section.as_deref(), Some("S1"));
|
||||
assert_eq!(pp, &p);
|
||||
}
|
||||
other => panic!("expected Citation::Line, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_citation_page_forwards_section() {
|
||||
let p = WorkspacePath::new("doc.pdf".to_string()).unwrap();
|
||||
let span = SourceSpan::Page {
|
||||
page: 4,
|
||||
char_start: None,
|
||||
char_end: None,
|
||||
};
|
||||
let c = build_citation("c1", p, Some("Intro".to_string()), Some(&span));
|
||||
match c {
|
||||
Citation::Page {
|
||||
page,
|
||||
ref section,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(page, 4);
|
||||
assert_eq!(section.as_deref(), Some("Intro"));
|
||||
}
|
||||
other => panic!("expected Citation::Page, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_citation_none_falls_back_to_line_one() {
|
||||
let p = WorkspacePath::new("x.md".to_string()).unwrap();
|
||||
let c = build_citation("c1", p, None, None);
|
||||
match c {
|
||||
Citation::Line { start, end, .. } => {
|
||||
assert_eq!((start, end), (1, 1));
|
||||
}
|
||||
other => panic!("expected fallback Citation::Line, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compile_glob_rejects_invalid_pattern() {
|
||||
// `[` is a character-class opener; an unclosed class is invalid.
|
||||
let r = compile_glob("notes/[abc");
|
||||
assert!(r.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compile_glob_star_does_not_cross_slash() {
|
||||
// This is the design invariant: `*` must NOT match `/`.
|
||||
let m = compile_glob("notes/*.md").unwrap();
|
||||
assert!(m.is_match("notes/foo.md"));
|
||||
assert!(!m.is_match("notes/sub/foo.md"));
|
||||
}
|
||||
}
|
||||
15
crates/kb-search/src/lib.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
//! `kb-search` — `kb_core::Retriever` implementations.
|
||||
//!
|
||||
//! P2-2 ships [`LexicalRetriever`], a SQLite-FTS5-backed retriever for
|
||||
//! `SearchMode::Lexical`. Vector + Hybrid retrievers land in P3-3 / P3-4.
|
||||
//!
|
||||
//! Allowed deps per task spec: `kb-core`, `kb-config`, `kb-store-sqlite`,
|
||||
//! `rusqlite`, `globset`, `tracing`, `thiserror`, `anyhow`. Forbidden:
|
||||
//! `kb-source-fs`, `kb-parse-md`, `kb-normalize`, `kb-chunk`,
|
||||
//! `kb-store-vector`, `kb-embed*`, `kb-llm*`, `kb-rag`, `kb-tui`,
|
||||
//! `kb-desktop`. Only `serde_json` is a transitive helper used to decode
|
||||
//! JSON-typed columns from `chunks` / `documents`.
|
||||
|
||||
mod lexical;
|
||||
|
||||
pub use lexical::LexicalRetriever;
|
||||
60
crates/kb-search/tests/fixtures/search/lexical/run-1.json
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
[
|
||||
|
claude-reviewer-01
commented
snapshot baseline의 stability 메모(rusqlite bundled SQLite + KB_UPDATE_SNAPSHOTS=1 재생성 절차)를 테스트 코드 측에 둔 점이 적절합니다. 향후 SQLite bump으로 bm25 알고리즘이나 tokenizer가 바뀔 때 reviewer가 무엇을 해야 하는지 즉시 파악 가능합니다. snapshot baseline의 stability 메모(rusqlite bundled SQLite + KB_UPDATE_SNAPSHOTS=1 재생성 절차)를 테스트 코드 측에 둔 점이 적절합니다. 향후 SQLite bump으로 bm25 알고리즘이나 tokenizer가 바뀔 때 reviewer가 무엇을 해야 하는지 즉시 파악 가능합니다.
|
||||
{
|
||||
"chunk_id": "c3000000000000000000000000000000",
|
||||
"chunker_version": "v1",
|
||||
"citation": {
|
||||
"end": 8,
|
||||
"kind": "line",
|
||||
"path": "notes/snap.md",
|
||||
"section": "Snap",
|
||||
"start": 7
|
||||
},
|
||||
"doc_id": "d0000000000000000000000000000000",
|
||||
"doc_path": "notes/snap.md",
|
||||
"embedding_model": null,
|
||||
"heading_path": [
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"rank": 1,
|
||||
"retrieval": {
|
||||
"fusion_score": 1.4490997273242101e-6,
|
||||
"lexical_rank": 1,
|
||||
"lexical_score": 1.4490997273242101e-6,
|
||||
"method": "lexical",
|
||||
"vector_rank": null,
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha alpha"
|
||||
},
|
||||
{
|
||||
"chunk_id": "c1000000000000000000000000000000",
|
||||
"chunker_version": "v1",
|
||||
"citation": {
|
||||
"end": 2,
|
||||
"kind": "line",
|
||||
"path": "notes/snap.md",
|
||||
"section": "Snap",
|
||||
"start": 1
|
||||
},
|
||||
"doc_id": "d0000000000000000000000000000000",
|
||||
"doc_path": "notes/snap.md",
|
||||
"embedding_model": null,
|
||||
"heading_path": [
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"rank": 2,
|
||||
"retrieval": {
|
||||
"fusion_score": 9.641424867368187e-7,
|
||||
"lexical_rank": 2,
|
||||
"lexical_score": 9.641424867368187e-7,
|
||||
"method": "lexical",
|
||||
"vector_rank": null,
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha bravo charlie"
|
||||
}
|
||||
]
|
||||
666
crates/kb-search/tests/lexical.rs
Normal file
@@ -0,0 +1,666 @@
|
||||
//! P2-2 integration tests for `LexicalRetriever`.
|
||||
//!
|
||||
//! Strategy: seed the SQLite store via raw inserts with `foreign_keys =
|
||||
//! OFF` (mirroring the P2-1 FTS tests). This avoids dragging
|
||||
//! `kb-parse-md` / `kb-normalize` / `kb-chunk` into kb-search's dev-deps,
|
||||
//! which would violate the task's "Allowed deps" list.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
|
||||
use kb_search::LexicalRetriever;
|
||||
use kb_store_sqlite::SqliteStore;
|
||||
use rusqlite::Connection;
|
||||
use tempfile::TempDir;
|
||||
|
||||
// ── Test scaffolding ─────────────────────────────────────────────────────
|
||||
|
||||
struct Env {
|
||||
_temp: TempDir,
|
||||
store: Arc<SqliteStore>,
|
||||
db_path: std::path::PathBuf,
|
||||
}
|
||||
|
||||
impl Env {
|
||||
fn new() -> Self {
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
|
||||
let store = SqliteStore::open(&config).expect("open store");
|
||||
store.run_migrations().expect("run migrations");
|
||||
let db_path = temp.path().join("kb.sqlite");
|
||||
Self {
|
||||
_temp: temp,
|
||||
store: Arc::new(store),
|
||||
db_path,
|
||||
}
|
||||
}
|
||||
|
||||
/// Side-channel raw connection with FK enforcement off — same
|
||||
/// trick used by P2-1's FTS tests so we can seed `chunks` /
|
||||
/// `documents` directly without the full ingest graph.
|
||||
fn raw_conn(&self) -> Connection {
|
||||
let conn = Connection::open(&self.db_path).expect("open side conn");
|
||||
conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
|
||||
conn
|
||||
}
|
||||
|
||||
fn retriever(&self) -> LexicalRetriever {
|
||||
LexicalRetriever::new(
|
||||
Arc::clone(&self.store),
|
||||
IndexVersion("v1.0".to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever {
|
||||
LexicalRetriever::with_settings(
|
||||
Arc::clone(&self.store),
|
||||
IndexVersion("v1.0".to_string()),
|
||||
snippet_chars,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal documents row. Many columns are NOT NULL and we don't care
|
||||
/// about their exact values for retrieval tests, so we wedge in
|
||||
/// reasonable defaults.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_document(
|
||||
conn: &Connection,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
title: &str,
|
||||
lang: &str,
|
||||
trust_level: &str,
|
||||
tags: &[&str],
|
||||
) {
|
||||
// assets row first — documents.asset_id has a FK with ON DELETE
|
||||
// RESTRICT but FKs are OFF on this connection. Still we insert a
|
||||
// matching row so JOINs pick it up.
|
||||
let asset_id = format!("{:0>32}", &doc_id[..1.min(doc_id.len())]); // 32-hex-ish
|
||||
let asset_id = format!("{:0>32}", asset_id.chars().take(32).collect::<String>());
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, 'file:///x', ?, '\"markdown\"', 0,
|
||||
'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![asset_id, workspace_path],
|
||||
)
|
||||
.expect("insert asset");
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, 'markdown', ?, 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![doc_id, asset_id, workspace_path, title, lang, trust_level],
|
||||
)
|
||||
.expect("insert document");
|
||||
|
||||
for tag in tags {
|
||||
conn.execute(
|
||||
"INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)",
|
||||
rusqlite::params![doc_id, tag],
|
||||
)
|
||||
.expect("insert tag");
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_chunk(
|
||||
conn: &Connection,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
section_label: Option<&str>,
|
||||
source_spans_json: &str,
|
||||
chunker_version: &str,
|
||||
) {
|
||||
let heading_json = serde_json::to_string(heading_path).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![
|
||||
chunk_id,
|
||||
doc_id,
|
||||
text,
|
||||
heading_json,
|
||||
section_label,
|
||||
source_spans_json,
|
||||
chunker_version,
|
||||
],
|
||||
)
|
||||
.expect("insert chunk");
|
||||
}
|
||||
|
||||
/// Pad a short ID to the 32-hex shape kb_core newtypes expect.
|
||||
fn id32(prefix: &str) -> String {
|
||||
let mut s = prefix.to_string();
|
||||
while s.len() < 32 {
|
||||
s.push('0');
|
||||
}
|
||||
s.truncate(32);
|
||||
s
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn lexical_empty_corpus_returns_empty_vec() {
|
||||
let env = Env::new();
|
||||
let r = env.retriever();
|
||||
let q = SearchQuery {
|
||||
text: "rust".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
let hits = r.search(&q).expect("search");
|
||||
assert!(hits.is_empty(), "empty corpus must yield empty Vec");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_empty_query_returns_empty_vec_without_db_hit() {
|
||||
// Even with rows in the DB, a blank query must short-circuit to [].
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/a.md", "A", "en", "primary", &[]);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d"),
|
||||
"rust cargo macros",
|
||||
&["A"],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":3}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
for empty in ["", " ", "''"] {
|
||||
let q = SearchQuery {
|
||||
text: empty.to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 5,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
let hits = r.search(&q).unwrap();
|
||||
assert!(hits.is_empty(), "query {empty:?} must yield empty Vec");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d"),
|
||||
"Rust borrow checker enforces ownership.",
|
||||
&["Notes"],
|
||||
Some("Notes"),
|
||||
r#"[{"kind":"line","start":4,"end":4}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let q = SearchQuery {
|
||||
text: "borrow".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
let hits = r.search(&q).expect("search");
|
||||
assert_eq!(hits.len(), 1);
|
||||
let h = &hits[0];
|
||||
assert_eq!(h.rank, 1);
|
||||
assert_eq!(h.doc_path.0, "notes/rust.md");
|
||||
assert_eq!(h.heading_path, vec!["Notes".to_string()]);
|
||||
assert_eq!(h.section_label.as_deref(), Some("Notes"));
|
||||
assert_eq!(h.retrieval.method, SearchMode::Lexical);
|
||||
assert_eq!(h.retrieval.lexical_rank, Some(1));
|
||||
assert!(h.retrieval.vector_score.is_none());
|
||||
|
||||
// Citation round-trips through `to_uri`/`parse` (line variant).
|
||||
let uri = h.citation.to_uri();
|
||||
let parsed = kb_core::Citation::parse(&uri).expect("parse uri");
|
||||
// Reparsed citation has section=None (URI fragment doesn't carry it),
|
||||
// so compare by `to_uri` equivalence rather than struct equality.
|
||||
assert_eq!(parsed.to_uri(), uri);
|
||||
// Sanity: this is a Line citation matching the seeded source span.
|
||||
assert_eq!(uri, "notes/rust.md#L4");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snippet_length_capped_at_snippet_chars() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(
|
||||
&conn,
|
||||
&id32("d"),
|
||||
"notes/long.md",
|
||||
"Long",
|
||||
"en",
|
||||
"primary",
|
||||
&[],
|
||||
);
|
||||
// A text long enough that FTS5 might return a snippet > 80 chars
|
||||
// when given a high word budget. We instead set a tight cap below
|
||||
// and rely on `trim_snippet` as the backstop.
|
||||
let mut text = String::new();
|
||||
for _ in 0..50 {
|
||||
text.push_str("alpha beta gamma delta epsilon ");
|
||||
}
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d"),
|
||||
&text,
|
||||
&["Long"],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
// Set snippet_chars to a known bound; the retriever clamps + trims
|
||||
// any snippet to fit.
|
||||
let r = env.retriever_with_snippet_chars(80);
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 1,
|
||||
filters: SearchFilters::default(),
|
||||
})
|
||||
.unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
assert!(
|
||||
hits[0].snippet.chars().count() <= 80,
|
||||
"snippet must be ≤ snippet_chars; got {} chars: {:?}",
|
||||
hits[0].snippet.chars().count(),
|
||||
hits[0].snippet
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_filter_tags_any_excludes_untagged_docs() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]);
|
||||
insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d1"),
|
||||
"ownership and borrow checker",
|
||||
&["A"],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c2"),
|
||||
&id32("d2"),
|
||||
"borrow semantics in python",
|
||||
&["B"],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let q = SearchQuery {
|
||||
text: "borrow".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters {
|
||||
tags_any: vec!["rust".to_string()],
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = r.search(&q).unwrap();
|
||||
assert_eq!(hits.len(), 1, "tags_any=[rust] must exclude python doc");
|
||||
assert_eq!(hits[0].doc_path.0, "notes/a.md");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_filter_lang_and_trust_min_compose() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d1"), "ko/a.md", "A", "ko", "primary", &[]);
|
||||
insert_document(&conn, &id32("d2"), "en/b.md", "B", "en", "primary", &[]);
|
||||
insert_document(&conn, &id32("d3"), "en/c.md", "C", "en", "generated", &[]);
|
||||
for (cid, did, body) in [
|
||||
("c1", "d1", "검색 키워드 alpha"),
|
||||
("c2", "d2", "alpha bravo"),
|
||||
("c3", "d3", "alpha gamma"),
|
||||
] {
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32(cid),
|
||||
&id32(did),
|
||||
body,
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
}
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
// lang=en + trust_min=secondary → only d2 (primary ≥ secondary).
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters {
|
||||
lang: Some(Lang("en".to_string())),
|
||||
trust_min: Some(TrustLevel::Secondary),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
assert_eq!(hits[0].doc_path.0, "en/b.md");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_filter_path_glob_does_not_cross_slash() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &[]);
|
||||
insert_document(&conn, &id32("d2"), "notes/sub/b.md", "B", "en", "primary", &[]);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d1"),
|
||||
"shared keyword",
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c2"),
|
||||
&id32("d2"),
|
||||
"shared keyword",
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "keyword".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters {
|
||||
path_glob: Some("notes/*.md".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.unwrap();
|
||||
let paths: Vec<&str> = hits.iter().map(|h| h.doc_path.0.as_str()).collect();
|
||||
assert_eq!(paths, vec!["notes/a.md"], "* must not match across `/`");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_citation_round_trip_against_first_source_span() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/m.md", "M", "en", "primary", &[]);
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&id32("d"),
|
||||
"echo bravo",
|
||||
&[],
|
||||
None,
|
||||
// Two spans; the citation uses the first.
|
||||
r#"[{"kind":"line","start":12,"end":34},{"kind":"line","start":60,"end":61}]"#,
|
||||
"v1",
|
||||
);
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "bravo".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 1,
|
||||
filters: SearchFilters::default(),
|
||||
})
|
||||
.unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
let uri = hits[0].citation.to_uri();
|
||||
assert_eq!(uri, "notes/m.md#L12-L34");
|
||||
let parsed = kb_core::Citation::parse(&uri).unwrap();
|
||||
assert_eq!(parsed.to_uri(), uri);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_top_score_within_unit_interval_three_chunks() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
|
||||
// Three chunks of varying relevance to the query 'alpha':
|
||||
// c1: alpha alpha alpha (best)
|
||||
// c2: alpha bravo
|
||||
// c3: bravo charlie alpha (one occurrence)
|
||||
for (cid, body) in [
|
||||
("c1", "alpha alpha alpha keyword"),
|
||||
("c2", "alpha bravo charlie"),
|
||||
("c3", "bravo charlie alpha"),
|
||||
] {
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32(cid),
|
||||
&id32("d"),
|
||||
body,
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
}
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
})
|
||||
.unwrap();
|
||||
assert!(!hits.is_empty(), "must surface at least one hit");
|
||||
let top = hits[0].retrieval.fusion_score;
|
||||
assert!(
|
||||
top > 0.0 && top <= 1.0,
|
||||
"top normalized score must be in (0, 1]; got {top}"
|
||||
);
|
||||
// All scores in [0, 1].
|
||||
for h in &hits {
|
||||
let s = h.retrieval.fusion_score;
|
||||
assert!((0.0..=1.0).contains(&s), "hit score {s} out of [0, 1]");
|
||||
// lexical_score and fusion_score equal in lexical-only mode.
|
||||
assert_eq!(h.retrieval.lexical_score, Some(s));
|
||||
}
|
||||
// bm25 should rank c1 (3 occurrences) above c2 / c3.
|
||||
assert!(hits[0].chunk_id.0.starts_with("c1"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_determinism_same_query_twice() {
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);
|
||||
for (cid, body) in [
|
||||
("c1", "alpha alpha"),
|
||||
("c2", "alpha bravo"),
|
||||
("c3", "alpha charlie"),
|
||||
("c4", "alpha delta"),
|
||||
] {
|
||||
insert_chunk(
|
||||
&conn,
|
||||
&id32(cid),
|
||||
&id32("d"),
|
||||
body,
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
}
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let q = SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
let a = r.search(&q).unwrap();
|
||||
let b = r.search(&q).unwrap();
|
||||
assert_eq!(a, b, "same DB + same query must yield identical Vec<SearchHit>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() {
|
||||
// Two chunks with byte-identical text + length → identical bm25 scores
|
||||
|
claude-reviewer-01
commented
동일 bm25를 발생시키는 별도 결정성 테스트를 추가한 게 결정적입니다. 일반 결정성 테스트(varied scores)만 있으면 chunk_id tiebreaker 경로가 한 번도 실행되지 않은 채로 "determinism PASS"가 나오는데, 동일 텍스트 두 청크 케이스로 그 경로를 실제로 exercise합니다. 동일 bm25를 발생시키는 별도 결정성 테스트를 추가한 게 결정적입니다. 일반 결정성 테스트(varied scores)만 있으면 chunk_id tiebreaker 경로가 한 번도 실행되지 않은 채로 "determinism PASS"가 나오는데, 동일 텍스트 두 청크 케이스로 그 경로를 실제로 exercise합니다.
|
||||
// for any `MATCH` against them. The retriever must fall back to
|
||||
// `chunk_id` ordering so the result is stable across runs.
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]);
|
||||
let cid_a = id32("aaaa");
|
||||
let cid_b = id32("bbbb");
|
||||
assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id");
|
||||
for cid in [&cid_a, &cid_b] {
|
||||
insert_chunk(
|
||||
&conn,
|
||||
cid,
|
||||
&id32("d"),
|
||||
"alpha bravo charlie",
|
||||
&[],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
}
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let q = SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
};
|
||||
let a = r.search(&q).unwrap();
|
||||
let b = r.search(&q).unwrap();
|
||||
assert_eq!(a.len(), 2, "both chunks should match");
|
||||
// bm25 must be equal for byte-identical chunks; the secondary sort
|
||||
// by chunk_id pins the order.
|
||||
assert!(
|
||||
(a[0].retrieval.fusion_score - a[1].retrieval.fusion_score).abs() < 1e-9,
|
||||
"byte-identical chunks must score equally; got {} vs {}",
|
||||
a[0].retrieval.fusion_score,
|
||||
a[1].retrieval.fusion_score
|
||||
);
|
||||
assert!(
|
||||
a[0].chunk_id.0 < a[1].chunk_id.0,
|
||||
"tiebreaker must order by chunk_id ascending; got {} then {}",
|
||||
a[0].chunk_id.0,
|
||||
a[1].chunk_id.0
|
||||
);
|
||||
assert_eq!(a, b, "tiebreaker order must be stable across runs");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_index_version_is_returned_unchanged() {
|
||||
let env = Env::new();
|
||||
let r = LexicalRetriever::new(
|
||||
Arc::clone(&env.store),
|
||||
IndexVersion("custom-label-1".to_string()),
|
||||
);
|
||||
assert_eq!(r.index_version().0, "custom-label-1");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snapshot_run_1() {
|
||||
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
|
||||
// `Vec<SearchHit>` for a fixed query is checked verbatim against
|
||||
// `tests/fixtures/search/lexical/run-1.json`. Update both sides in
|
||||
// the same commit when intentional changes ship.
|
||||
// Stable because rusqlite ships bundled SQLite — a tokenizer/bm25 algorithm change in a future SQLite bump will require regenerating run-1.json via `KB_UPDATE_SNAPSHOTS=1`.
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]);
|
||||
for (cid, body, span) in [
|
||||
(
|
||||
"c1",
|
||||
"alpha bravo charlie",
|
||||
r#"[{"kind":"line","start":1,"end":2}]"#,
|
||||
),
|
||||
(
|
||||
"c2",
|
||||
"bravo only here",
|
||||
r#"[{"kind":"line","start":4,"end":5}]"#,
|
||||
),
|
||||
(
|
||||
"c3",
|
||||
"alpha alpha",
|
||||
r#"[{"kind":"line","start":7,"end":8}]"#,
|
||||
),
|
||||
] {
|
||||
insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1");
|
||||
}
|
||||
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "alpha".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: SearchFilters::default(),
|
||||
})
|
||||
.unwrap();
|
||||
let actual = serde_json::to_value(&hits).unwrap();
|
||||
|
||||
let baseline_path =
|
||||
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/search/lexical/run-1.json");
|
||||
if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() {
|
||||
std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap()).unwrap();
|
||||
}
|
||||
let baseline_text = std::fs::read_to_string(&baseline_path)
|
||||
.expect("baseline snapshot must exist; run with KB_UPDATE_SNAPSHOTS=1 to seed");
|
||||
let expected: serde_json::Value = serde_json::from_str(&baseline_text).unwrap();
|
||||
assert_eq!(actual, expected, "lexical run-1 snapshot drift");
|
||||
}
|
||||
@@ -109,6 +109,22 @@ impl SqliteStore {
|
||||
self.conn.lock().unwrap_or_else(|p| p.into_inner())
|
||||
}
|
||||
|
||||
/// Read-only borrow of the connection.
|
||||
///
|
||||
/// Provided so sibling crates (e.g. `kb-search`) can run SELECTs
|
||||
/// against the schema owned by this crate without re-opening the
|
||||
/// SQLite file. Callers MUST treat the returned `Connection` as
|
||||
/// read-only — issuing mutating SQL (INSERT / UPDATE / DELETE / DDL)
|
||||
/// through this guard bypasses the per-document transaction discipline
|
||||
/// (`put_*` methods) and the FTS5 backfill helpers that the store
|
||||
/// layer enforces. Mutating callers must use `kb-store-sqlite`'s own
|
||||
/// public write methods instead.
|
||||
///
|
||||
/// Poisoning is recovered the same way as [`Self::lock_conn`].
|
||||
pub fn read_conn(&self) -> MutexGuard<'_, Connection> {
|
||||
self.conn.lock().unwrap_or_else(|p| p.into_inner())
|
||||
|
claude-reviewer-01
commented
`read_conn` doc-comment가 contract 의도는 명확하지만 `&Connection` 타입으로는 mutation을 막을 수 없다는 점은 PR 본문에 솔직하게 적어두셨습니다. closure scope (`with_read_conn<R>(&self, f: impl FnOnce(&Connection) -> R) -> R`) 변형이 type-system 차원에서 더 단단하긴 한데 prepare_cached + 행 iterator 패턴과 lifetimes가 awkward해진다는 trade-off도 있습니다 — P3 follow-up으로 미룬 결정에 동의합니다.
|
||||
}
|
||||
|
||||
/// Persist a `RawAsset` *with its raw bytes*: row goes into `assets`,
|
||||
/// bytes go to `data_dir/assets/<aa>/<asset_id>` if `byte_len ≤
|
||||
/// copy_threshold_mb`, otherwise the row records the source URI's
|
||||
|
||||
search 진입과 종료 양쪽에
tracing::debug!을 둔 점이 좋습니다.kb search --mode lexical디버깅 시 match string 자체와 필터 형태(개수/플래그)를 별도로 분리해서 로그한 부분 — 사용자 입력 값을 PII로 누설하지 않으면서도 "필터가 적용되긴 했나?"를 확인할 수 있게 만든 균형이 적절합니다.