p10-1A-1 (PR #139) added SearchFilters.code_lang + .repo fields and the CLI --code-lang / --repo flags propagate them correctly into SearchFilters, but neither the lexical retriever's FTS SQL nor the shared filter_chunks helper (used by the vector retriever) ever applied them — so a code-lang-filtered search returned all-doc hits (markdown / pdf / code mixed). Discovered while dogfooding p10-1B with httpx + zod + lodash clones: `kebab search 'AsyncClient' --code-lang python --json` returned markdown hits from httpx/docs/ first. Fix: add IN-list filters on json_extract(d.metadata_json, '$.code_lang') and '$.repo' to both lexical.rs and filters.rs, mirroring the existing media filter pattern. Two regression tests added in each crate covering the new filter behavior. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
695 lines
26 KiB
Rust
695 lines
26 KiB
Rust
//! Lexical (FTS5 + bm25) retriever — design §3.7 / §1.5 / §2.2 / §6.4.
|
|
//!
|
|
//! Owns the SQL pattern documented in `tasks/p2/p2-2-lexical-retriever.md`
|
|
//! and constructs `kebab_core::SearchHit` values directly from the joined
|
|
//! `chunks_fts` / `chunks` / `documents` rows. Reads only — never mutates
|
|
//! the underlying SQLite file.
|
|
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Context, Result};
|
|
use globset::GlobMatcher;
|
|
use kebab_core::{
|
|
ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, Retriever,
|
|
ScoreKind, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, TrustLevel,
|
|
WorkspacePath,
|
|
};
|
|
use kebab_store_sqlite::SqliteStore;
|
|
use rusqlite::{params_from_iter, Connection, Row, ToSql};
|
|
|
|
use crate::citation_helper::citation_from_first_span;
|
|
|
|
// ── Tunables ─────────────────────────────────────────────────────────────
|
|
|
|
/// Largest value FTS5 accepts for the `nToken` argument of `snippet()`.
/// Per SQLite's FTS5 documentation, `snippet()` rejects `nToken > 64`.
const FTS5_SNIPPET_MAX_WORDS: usize = 64;

/// Smallest snippet word budget we ever request. `snippet_chars / 4` can
/// round down to 0 for very small configs; asking for at least one word
/// keeps FTS5 returning something matchable (the test harness relies on it).
const FTS5_SNIPPET_MIN_WORDS: usize = 1;

/// Result count used when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10.
const DEFAULT_K: usize = 10;

/// Extra rows fetched on top of `k` whenever `path_glob` is set.
///
/// The glob is evaluated in Rust after the SQL query, because SQLite's GLOB
/// operator lets `*` match any characters including `/`, whereas the design
/// requires that `*` must NOT cross path separators. Over-fetching leaves
/// room for rows that the post-filter discards. Empirically `+128` is
/// generous for any realistic workspace while keeping memory predictable.
const PATH_GLOB_OVERFETCH: usize = 128;
|
|
|
|
// ── Public surface ───────────────────────────────────────────────────────
|
|
|
|
/// Lexical retriever backed by SQLite FTS5 + bm25.
|
|
pub struct LexicalRetriever {
|
|
store: Arc<SqliteStore>,
|
|
index_version: IndexVersion,
|
|
/// Number of `snippet()` words derived from `kb-config::search.snippet_chars`,
|
|
/// clamped into `[FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS]`.
|
|
snippet_words: usize,
|
|
/// Hard cap on the returned snippet's character length per design §6.4.
|
|
snippet_chars: usize,
|
|
}
|
|
|
|
impl LexicalRetriever {
|
|
/// Construct with default settings derived from `kb-config`'s defaults.
|
|
/// Snippet width is computed from `Config::defaults().search.snippet_chars`.
|
|
pub fn new(store: Arc<SqliteStore>, index_version: IndexVersion) -> Self {
|
|
let cfg = kebab_config::Config::defaults();
|
|
Self::with_settings(store, index_version, cfg.search.snippet_chars)
|
|
}
|
|
|
|
/// Construct with explicit `snippet_chars`. Used by tests / callers
|
|
/// that have already loaded a `Config`.
|
|
pub fn with_settings(
|
|
store: Arc<SqliteStore>,
|
|
index_version: IndexVersion,
|
|
snippet_chars: usize,
|
|
) -> Self {
|
|
// Heuristic: 1 token ≈ 4 chars (English-leaning estimate; Korean
|
|
// tokens average shorter, so the cap-by-chars trim below is what
|
|
// actually enforces the contract). The `/4` keeps us well below
|
|
// FTS5's nToken=64 limit for typical snippet_chars=220 budgets.
|
|
let raw = snippet_chars / 4;
|
|
let snippet_words = raw.clamp(FTS5_SNIPPET_MIN_WORDS, FTS5_SNIPPET_MAX_WORDS);
|
|
Self {
|
|
store,
|
|
index_version,
|
|
snippet_words,
|
|
snippet_chars,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Retriever for LexicalRetriever {
|
|
fn search(&self, query: &SearchQuery) -> Result<Vec<SearchHit>> {
|
|
let match_opt = build_match_string(&query.text);
|
|
let k = if query.k == 0 { DEFAULT_K } else { query.k };
|
|
let filters = &query.filters;
|
|
// One-line summary at request entry. Filter shape only — no
|
|
// tag/lang/path values, which could be PII-sensitive.
|
|
tracing::debug!(
|
|
match_str = match_opt.as_deref().unwrap_or("<empty>"),
|
|
tags_any = filters.tags_any.len(),
|
|
has_lang = filters.lang.is_some(),
|
|
has_trust_min = filters.trust_min.is_some(),
|
|
has_path_glob = filters.path_glob.is_some(),
|
|
k,
|
|
"kb-search lexical: search start"
|
|
);
|
|
|
|
// Empty / whitespace-only query → nothing to do. Per spec we
|
|
// succeed with an empty hit list rather than erroring.
|
|
let match_str = match match_opt {
|
|
Some(s) => s,
|
|
None => return Ok(Vec::new()),
|
|
};
|
|
|
|
// Pre-compile the path_glob once. The `Glob` produced rejects
|
|
// syntactically invalid patterns at construction time so the
|
|
// caller gets a clear error rather than a silent empty result.
|
|
let path_matcher = match &filters.path_glob {
|
|
Some(g) => Some(compile_glob(g)?),
|
|
None => None,
|
|
};
|
|
|
|
// Fetch budget: when post-filtering by glob we need to over-fetch
|
|
// so that the final `take(k)` still has enough rows after culling.
|
|
let fetch_limit = if path_matcher.is_some() {
|
|
k.saturating_add(PATH_GLOB_OVERFETCH)
|
|
} else {
|
|
k
|
|
};
|
|
|
|
let conn = self.store.read_conn();
|
|
let raw_rows = run_query(
|
|
&conn,
|
|
&match_str,
|
|
self.snippet_words,
|
|
filters,
|
|
fetch_limit,
|
|
)?;
|
|
|
|
let mut hits: Vec<SearchHit> = Vec::with_capacity(raw_rows.len().min(k));
|
|
let mut rank: u32 = 0;
|
|
for row in raw_rows {
|
|
// Path glob is the only filter we evaluate in Rust because the
|
|
// semantics differ from SQLite's GLOB (no `/` crossing).
|
|
if let Some(m) = &path_matcher {
|
|
if !m.is_match(&row.workspace_path) {
|
|
continue;
|
|
}
|
|
}
|
|
rank = rank.saturating_add(1);
|
|
let hit = build_hit(row, rank, &self.index_version, self.snippet_chars)?;
|
|
hits.push(hit);
|
|
if hits.len() >= k {
|
|
break;
|
|
}
|
|
}
|
|
tracing::debug!(rows = hits.len(), "kb-search lexical: search done");
|
|
Ok(hits)
|
|
}
|
|
|
|
fn index_version(&self) -> IndexVersion {
|
|
self.index_version.clone()
|
|
}
|
|
}
|
|
|
|
// ── Match-string construction ────────────────────────────────────────────
|
|
|
|
/// Translate a user-typed query into an FTS5 match string.
|
|
///
|
|
/// Rules (from the task spec):
|
|
///
|
|
/// - The query is wrapped in a single pair of `'...'` → strip the quotes
|
|
/// and pass the inner text through verbatim. The user has explicitly
|
|
/// opted into FTS5 syntax (e.g. `'rust AND cargo'`, `'foo*'`).
|
|
///
|
|
/// - Otherwise: split on whitespace, escape every token by wrapping it
|
|
/// in `"..."` (FTS5 string literal), with any inner `"` doubled. Join
|
|
/// with spaces — FTS5 default operator is implicit AND.
|
|
///
|
|
/// - An empty / whitespace-only token list → return `None` (caller
|
|
/// short-circuits to `Ok(vec![])`).
|
|
fn build_match_string(text: &str) -> Option<String> {
|
|
let trimmed = text.trim();
|
|
if trimmed.is_empty() {
|
|
return None;
|
|
}
|
|
if let Some(inner) = strip_single_quotes(trimmed) {
|
|
let inner_trim = inner.trim();
|
|
if inner_trim.is_empty() {
|
|
return None;
|
|
}
|
|
return Some(inner_trim.to_string());
|
|
}
|
|
let tokens: Vec<String> = trimmed
|
|
.split_whitespace()
|
|
.map(escape_fts5_token)
|
|
.collect();
|
|
if tokens.is_empty() {
|
|
None
|
|
} else {
|
|
Some(tokens.join(" "))
|
|
}
|
|
}
|
|
|
|
/// If `s` both starts and ends with a single quote (two distinct quote
/// characters, so a lone `'` does not qualify), return the text between
/// them; otherwise return `None`.
///
/// The closing quote must be the final character, so an input such as
/// `'foo' bar` does not accidentally switch on raw-FTS5 mode.
fn strip_single_quotes(s: &str) -> Option<&str> {
    s.strip_prefix('\'')?.strip_suffix('\'')
}
|
|
|
|
/// Turn one token into an FTS5 string literal: surround it with double
/// quotes and double every `"` inside it, as the FTS5 grammar requires.
///
/// Quoting the whole token is the simple-and-safe way to neutralise every
/// special character (`(`, `)`, `*`, `^`, `:`, `"`, …) without having to
/// parse FTS5 expressions.
fn escape_fts5_token(tok: &str) -> String {
    format!("\"{}\"", tok.replace('"', "\"\""))
}
|
|
|
|
// ── SQL execution ────────────────────────────────────────────────────────
|
|
|
|
/// One undecoded result row, field-for-field in the column order selected
/// by [`run_query`]. Internal only: every public path converts it into a
/// `SearchHit`.
struct RawRow {
    /// `chunks_fts.chunk_id` of the matching chunk.
    chunk_id: String,
    /// `chunks_fts.doc_id` of the owning document.
    doc_id: String,
    /// Raw `bm25(chunks_fts)` value, before normalisation.
    bm25_raw: f64,
    /// Output of FTS5 `snippet()`, not yet capped to the character budget.
    snippet: String,
    /// `chunks.heading_path_json`, still JSON-encoded.
    heading_path_json: String,
    /// `chunks.section_label`; nullable.
    section_label: Option<String>,
    /// `chunks.source_spans_json`, still JSON-encoded.
    source_spans_json: String,
    /// `chunks.chunker_version`.
    chunker_version: String,
    /// `documents.workspace_path`.
    workspace_path: String,
    /// p9-fb-32: `documents.updated_at` (RFC3339 text).
    updated_at: String,
}
|
|
|
|
/// Build + execute the FTS5 query. The SQL pattern is the one documented
|
|
/// in `tasks/p2/p2-2-lexical-retriever.md` (§Behavior contract).
|
|
fn run_query(
|
|
conn: &Connection,
|
|
match_str: &str,
|
|
snippet_words: usize,
|
|
filters: &SearchFilters,
|
|
fetch_limit: usize,
|
|
) -> Result<Vec<RawRow>> {
|
|
// Build the dynamic SQL + positional parameter vector. Positional `?`
|
|
// is used (not named bindings) because the dynamic IN-list for
|
|
// `tags_any` is most natural with `params_from_iter`.
|
|
let mut sql = String::from(
|
|
"SELECT \
|
|
f.chunk_id, f.doc_id, \
|
|
bm25(chunks_fts) AS score, \
|
|
snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
|
|
c.heading_path_json, c.section_label, c.source_spans_json, \
|
|
c.chunker_version, \
|
|
d.workspace_path, \
|
|
d.updated_at \
|
|
FROM chunks_fts f \
|
|
JOIN chunks c ON c.chunk_id = f.chunk_id \
|
|
JOIN documents d ON d.doc_id = f.doc_id",
|
|
);
|
|
|
|
let mut params: Vec<Box<dyn ToSql>> = Vec::new();
|
|
// 1) snippet word count.
|
|
params.push(Box::new(snippet_words as i64));
|
|
// 2) MATCH expression.
|
|
sql.push_str(" WHERE chunks_fts MATCH ?");
|
|
params.push(Box::new(match_str.to_owned()));
|
|
|
|
// tags_any: doc must own at least one of the requested tags.
|
|
if !filters.tags_any.is_empty() {
|
|
sql.push_str(
|
|
" AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (",
|
|
);
|
|
for (i, tag) in filters.tags_any.iter().enumerate() {
|
|
if i > 0 {
|
|
sql.push(',');
|
|
}
|
|
sql.push('?');
|
|
params.push(Box::new(tag.clone()));
|
|
}
|
|
sql.push_str("))");
|
|
}
|
|
if let Some(lang) = &filters.lang {
|
|
sql.push_str(" AND d.lang = ?");
|
|
params.push(Box::new(lang.0.clone()));
|
|
}
|
|
if let Some(trust_min) = &filters.trust_min {
|
|
// Mirror `kebab_store_sqlite::documents::list_documents` ranking:
|
|
// Generated < Secondary < Primary. Doing the rank in SQL
|
|
// (rather than post-filtering) keeps the row stream short
|
|
// when the workspace contains many low-trust docs.
|
|
sql.push_str(
|
|
" AND CASE d.trust_level \
|
|
WHEN 'primary' THEN 3 \
|
|
WHEN 'secondary' THEN 2 \
|
|
WHEN 'generated' THEN 1 \
|
|
ELSE 0 \
|
|
END >= ?",
|
|
);
|
|
let rank: i64 = match trust_min {
|
|
TrustLevel::Primary => 3,
|
|
TrustLevel::Secondary => 2,
|
|
TrustLevel::Generated => 1,
|
|
};
|
|
params.push(Box::new(rank));
|
|
}
|
|
// p9-fb-36: media_type filter (IN-list).
|
|
// `assets.media_type` JSON has two shapes:
|
|
// - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
|
|
// - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
|
|
// e.g. `{"image": "png"}`
|
|
// Extract a unified "kind" string for both shapes via:
|
|
// CASE WHEN json_type = 'text' THEN json_extract($)
|
|
// ELSE (first object key)
|
|
// END IN (?, ...)
|
|
if !filters.media.is_empty() {
|
|
let placeholders: Vec<&str> =
|
|
std::iter::repeat_n("?", filters.media.len()).collect();
|
|
let placeholders = placeholders.join(",");
|
|
sql.push_str(&format!(
|
|
" AND f.doc_id IN (\
|
|
SELECT d2.doc_id FROM documents d2 \
|
|
JOIN assets a ON a.asset_id = d2.asset_id \
|
|
WHERE CASE \
|
|
WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
|
|
ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
|
|
END IN ({placeholders}))"
|
|
));
|
|
for kind in &filters.media {
|
|
params.push(Box::new(kind.clone()));
|
|
}
|
|
}
|
|
|
|
// p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter
|
|
// (IN-list on metadata_json.$.code_lang). Empty Vec = no filter.
|
|
if !filters.code_lang.is_empty() {
|
|
let placeholders = std::iter::repeat_n("?", filters.code_lang.len())
|
|
.collect::<Vec<_>>()
|
|
.join(",");
|
|
sql.push_str(&format!(
|
|
" AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})"
|
|
));
|
|
for lang in &filters.code_lang {
|
|
params.push(Box::new(lang.clone()));
|
|
}
|
|
}
|
|
|
|
// p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter
|
|
// (IN-list on metadata_json.$.repo). Empty Vec = no filter.
|
|
if !filters.repo.is_empty() {
|
|
let placeholders = std::iter::repeat_n("?", filters.repo.len())
|
|
.collect::<Vec<_>>()
|
|
.join(",");
|
|
sql.push_str(&format!(
|
|
" AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})"
|
|
));
|
|
for repo in &filters.repo {
|
|
params.push(Box::new(repo.clone()));
|
|
}
|
|
}
|
|
|
|
// p9-fb-36: ingested_after filter.
|
|
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
|
|
// fb-32 ingest path), so lexicographic >= compare is correct — but only
|
|
// when the filter instant is also formatted as UTC `Z`. A non-UTC offset
|
|
// (e.g. `+09:00`) would compare as ASCII after `Z` (0x2B < 0x5A) and
|
|
// produce wrong results. Convert to UTC before formatting.
|
|
if let Some(after) = &filters.ingested_after {
|
|
let formatted = after
|
|
.to_offset(time::UtcOffset::UTC)
|
|
.format(&time::format_description::well_known::Rfc3339)
|
|
.expect("OffsetDateTime (UTC) formats to RFC3339");
|
|
sql.push_str(" AND d.updated_at >= ?");
|
|
params.push(Box::new(formatted));
|
|
}
|
|
|
|
// p9-fb-36: doc_id filter — single-doc scoping.
|
|
if let Some(id) = &filters.doc_id {
|
|
sql.push_str(" AND d.doc_id = ?");
|
|
params.push(Box::new(id.0.clone()));
|
|
}
|
|
|
|
// path_glob is intentionally NOT applied here — see module comment
|
|
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
|
|
|
|
// Determinism: tie-break on chunk_id so equal bm25 scores produce a
|
|
// stable order across runs. `f.chunk_id` is the FTS row's UNINDEXED
|
|
// copy of the same value as `c.chunk_id`; either side works.
|
|
sql.push_str(" ORDER BY score, f.chunk_id LIMIT ?");
|
|
params.push(Box::new(i64::try_from(fetch_limit).unwrap_or(i64::MAX)));
|
|
|
|
let mut stmt = conn
|
|
.prepare(&sql)
|
|
.context("kb-search lexical: prepare FTS5 statement")?;
|
|
let rows = stmt
|
|
.query_map(params_from_iter(params.iter().map(|b| b.as_ref())), row_from_sql)
|
|
.context("kb-search lexical: execute FTS5 query")?;
|
|
let mut out: Vec<RawRow> = Vec::new();
|
|
for r in rows {
|
|
out.push(r.context("kb-search lexical: read row")?);
|
|
}
|
|
Ok(out)
|
|
}
|
|
|
|
fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
|
|
Ok(RawRow {
|
|
chunk_id: row.get(0)?,
|
|
doc_id: row.get(1)?,
|
|
bm25_raw: row.get(2)?,
|
|
snippet: row.get(3)?,
|
|
heading_path_json: row.get(4)?,
|
|
section_label: row.get(5)?,
|
|
source_spans_json: row.get(6)?,
|
|
chunker_version: row.get(7)?,
|
|
workspace_path: row.get(8)?,
|
|
updated_at: row.get(9)?,
|
|
})
|
|
}
|
|
|
|
// ── Hit construction ─────────────────────────────────────────────────────
|
|
|
|
fn build_hit(
|
|
raw: RawRow,
|
|
rank: u32,
|
|
index_version: &IndexVersion,
|
|
snippet_chars: usize,
|
|
) -> Result<SearchHit> {
|
|
let normalized = normalize_bm25(raw.bm25_raw);
|
|
let heading_path: Vec<String> = serde_json::from_str(&raw.heading_path_json)
|
|
.context("kb-search lexical: deserialize heading_path_json")?;
|
|
let source_spans: Vec<SourceSpan> = serde_json::from_str(&raw.source_spans_json)
|
|
.context("kb-search lexical: deserialize source_spans_json")?;
|
|
|
|
let workspace_path = WorkspacePath::new(raw.workspace_path)
|
|
.context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?;
|
|
|
|
let citation = citation_from_first_span(
|
|
&raw.chunk_id,
|
|
workspace_path.clone(),
|
|
raw.section_label.clone(),
|
|
source_spans.first(),
|
|
);
|
|
|
|
// FTS5's snippet() respects the word budget but produces a
|
|
// character-length we can't predict precisely (token boundaries vary
|
|
// with the tokenizer). The contract caps at `snippet_chars`; trim
|
|
// defensively if SQLite ever returns a longer string.
|
|
let snippet = trim_snippet(&raw.snippet, snippet_chars);
|
|
|
|
// p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001
|
|
// migration; written by put_document via OffsetDateTime::now_utc).
|
|
// fb-23 incremental ingest's skip path does not call put_document,
|
|
// so this naturally reflects the last actual re-process.
|
|
let indexed_at = time::OffsetDateTime::parse(
|
|
&raw.updated_at,
|
|
&time::format_description::well_known::Rfc3339,
|
|
)
|
|
.context("kb-search lexical: parse documents.updated_at as RFC3339")?;
|
|
|
|
Ok(SearchHit {
|
|
rank,
|
|
chunk_id: ChunkId(raw.chunk_id),
|
|
doc_id: DocumentId(raw.doc_id),
|
|
doc_path: workspace_path,
|
|
heading_path,
|
|
section_label: raw.section_label,
|
|
snippet,
|
|
citation,
|
|
retrieval: RetrievalDetail {
|
|
method: SearchMode::Lexical,
|
|
fusion_score: normalized,
|
|
lexical_score: Some(normalized),
|
|
vector_score: None,
|
|
lexical_rank: Some(rank),
|
|
vector_rank: None,
|
|
},
|
|
index_version: index_version.clone(),
|
|
embedding_model: None,
|
|
chunker_version: ChunkerVersion(raw.chunker_version),
|
|
indexed_at,
|
|
// Placeholder — overwritten by `kebab_app::staleness::mark_stale_in_place`
|
|
// (called from `App::search` / `App::search_uncached`) and the equivalent
|
|
// in `RagPipeline::ask` against the configured threshold.
|
|
stale: false,
|
|
score_kind: ScoreKind::Bm25,
|
|
repo: None,
|
|
code_lang: None,
|
|
})
|
|
}
|
|
|
|
/// Rescale a raw FTS5 bm25 score into a bounded display/fusion score.
///
/// FTS5 reports bm25 as a *negative* number where lower means a better
/// match. The mapping `score = -bm25 / (1 + |bm25|)` is monotonic, smooth
/// and bounded: negative inputs land strictly between 0 and 1 (more
/// negative → closer to 1), zero maps to zero, and a positive input would
/// map to a negative score. The result is suitable both for human display
/// and as an RRF input.
fn normalize_bm25(bm25_raw: f64) -> f32 {
    let magnitude = bm25_raw.abs();
    (-bm25_raw / (1.0_f64 + magnitude)) as f32
}
|
|
|
|
/// Limit a snippet to at most `max_chars` characters, counted as Unicode
/// scalar values rather than bytes (the §6.4 setting is defined in
/// "characters"). A snippet that already fits is returned as an unchanged
/// copy.
fn trim_snippet(s: &str, max_chars: usize) -> String {
    // Cutting on scalar values can, in extreme cases, separate a combining
    // mark from its base (Hebrew niqqud, Devanagari), but that is what the
    // spec's char-budget definition asks for.
    match s.char_indices().nth(max_chars) {
        // A character exists at index `max_chars`, so the text is too long:
        // keep everything before that character's byte offset.
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}
|
|
|
|
// ── path_glob ────────────────────────────────────────────────────────────
|
|
|
|
/// Compile a `path_glob` pattern. We enable `literal_separator` so `*`
|
|
/// does NOT cross `/` — design requires `*` to match within a single
|
|
/// path segment, not across them. (`globset`'s default is to let `*`
|
|
/// span separators.)
|
|
fn compile_glob(pattern: &str) -> Result<GlobMatcher> {
|
|
let g = globset::GlobBuilder::new(pattern)
|
|
.literal_separator(true)
|
|
.build()
|
|
.with_context(|| format!("kb-search lexical: invalid path_glob {pattern:?}"))?;
|
|
Ok(g.compile_matcher())
|
|
}
|
|
|
|
// ── Unit tests for pure helpers ──────────────────────────────────────────
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for the pure helpers in this module (match-string
    //! construction, score normalisation, snippet trimming, citation
    //! construction and glob compilation). Nothing here touches SQLite.

    use super::*;

    /// Blank input — with or without the raw-mode quotes — produces no
    /// match expression.
    #[test]
    fn build_match_string_empty_returns_none() {
        assert!(build_match_string("").is_none());
        assert!(build_match_string(" ").is_none());
        assert!(build_match_string("''").is_none());
        assert!(build_match_string("' '").is_none());
    }

    #[test]
    fn build_match_string_default_is_quoted_and_anded() {
        let s = build_match_string("rust cargo").unwrap();
        // Two tokens, each quoted, joined by a space (implicit AND).
        assert_eq!(s, r#""rust" "cargo""#);
    }

    #[test]
    fn build_match_string_escapes_special_chars() {
        // `*`, `(`, `)`, `:`, `^`, `"` should all be wrapped inside FTS5
        // string-literal quotes so they're treated as literal text rather
        // than FTS5 operators.
        let s = build_match_string(r#"foo* (bar) baz:qux ^head he"llo"#).unwrap();
        assert_eq!(s, r#""foo*" "(bar)" "baz:qux" "^head" "he""llo""#);
        // The doubled `""` is FTS5's way of embedding a literal quote inside
        // a string literal.
        assert!(s.contains(r#"he""llo"#));
        // Sanity: every special character lives between matching `"`
        // delimiters — there is no bare-token (unquoted) span anywhere. We
        // check this by confirming the string starts and ends with `"` and
        // that the number of `"` characters is even (each token is wrapped
        // in a pair and each embedded quote is doubled).
        assert!(s.starts_with('"') && s.ends_with('"'));
        assert_eq!(s.matches('"').count() % 2, 0, "unbalanced quotes in {s}");
    }

    #[test]
    fn build_match_string_passthrough_when_single_quoted() {
        // The FTS5 expression is preserved verbatim.
        let s = build_match_string("'foo OR bar*'").unwrap();
        assert_eq!(s, "foo OR bar*");
    }

    #[test]
    fn normalize_bm25_top_score_in_unit_interval() {
        // bm25 = -1.0 → normalized 0.5.
        // A high-relevance hit (bm25 = -10.0) → 10/11 ≈ 0.909.
        let high = normalize_bm25(-10.0);
        assert!(high > 0.0 && high <= 1.0, "got {high}");
        let medium = normalize_bm25(-1.0);
        assert!((medium - 0.5).abs() < 1e-6);
    }

    #[test]
    fn normalize_bm25_monotonic() {
        // Lower (more-negative) bm25 must map to a higher normalized score.
        let a = normalize_bm25(-2.0);
        let b = normalize_bm25(-1.0);
        assert!(a > b, "{a} should exceed {b}");
    }

    #[test]
    fn trim_snippet_caps_at_char_count() {
        let s = "a".repeat(300);
        let trimmed = trim_snippet(&s, 220);
        assert_eq!(trimmed.chars().count(), 220);
    }

    #[test]
    fn trim_snippet_passthrough_when_short() {
        let s = "short";
        assert_eq!(trim_snippet(s, 220), "short");
    }

    #[test]
    fn build_citation_line_round_trip() {
        use kebab_core::Citation;
        let p = WorkspacePath::new("a/b.md".to_string()).unwrap();
        let span = SourceSpan::Line { start: 7, end: 12 };
        let c = citation_from_first_span("c1", p.clone(), Some("S1".to_string()), Some(&span));
        match c {
            Citation::Line {
                start,
                end,
                ref section,
                path: ref pp,
            } => {
                assert_eq!(start, 7);
                assert_eq!(end, 12);
                assert_eq!(section.as_deref(), Some("S1"));
                assert_eq!(pp, &p);
            }
            other => panic!("expected Citation::Line, got {other:?}"),
        }
    }

    #[test]
    fn build_citation_page_forwards_section() {
        use kebab_core::Citation;
        let p = WorkspacePath::new("doc.pdf".to_string()).unwrap();
        let span = SourceSpan::Page {
            page: 4,
            char_start: None,
            char_end: None,
        };
        let c = citation_from_first_span("c1", p, Some("Intro".to_string()), Some(&span));
        match c {
            Citation::Page {
                page,
                ref section,
                ..
            } => {
                assert_eq!(page, 4);
                assert_eq!(section.as_deref(), Some("Intro"));
            }
            other => panic!("expected Citation::Page, got {other:?}"),
        }
    }

    #[test]
    fn build_citation_none_falls_back_to_line_one() {
        use kebab_core::Citation;
        let p = WorkspacePath::new("x.md".to_string()).unwrap();
        let c = citation_from_first_span("c1", p, None, None);
        match c {
            Citation::Line { start, end, .. } => {
                assert_eq!((start, end), (1, 1));
            }
            other => panic!("expected fallback Citation::Line, got {other:?}"),
        }
    }

    #[test]
    fn compile_glob_rejects_invalid_pattern() {
        // `[` is a character-class opener; an unclosed class is invalid.
        let r = compile_glob("notes/[abc");
        assert!(r.is_err());
    }

    #[test]
    fn compile_glob_star_does_not_cross_slash() {
        // This is the design invariant: `*` must NOT match `/`.
        let m = compile_glob("notes/*.md").unwrap();
        assert!(m.is_match("notes/foo.md"));
        assert!(!m.is_match("notes/sub/foo.md"));
    }
}
|