diff --git a/Cargo.lock b/Cargo.lock index 0a4deaa..0fc9172 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3473,7 +3473,9 @@ dependencies = [ "globset", "kb-config", "kb-core", + "kb-embed", "kb-store-sqlite", + "kb-store-vector", "rusqlite", "serde_json", "tempfile", diff --git a/crates/kb-search/Cargo.toml b/crates/kb-search/Cargo.toml index d7a49ab..b5a8295 100644 --- a/crates/kb-search/Cargo.toml +++ b/crates/kb-search/Cargo.toml @@ -11,6 +11,14 @@ description = "Retriever implementations for kb (P2-2 lexical FTS5; P3 vector kb-core = { path = "../kb-core" } kb-config = { path = "../kb-config" } kb-store-sqlite = { path = "../kb-store-sqlite" } +# P3-4 hybrid retriever wraps a `dyn VectorStore` (typically backed by +# `kb-store-vector::LanceVectorStore`) and a `dyn Embedder` (any P3-2 +# adapter). Listed as a runtime dep so callers can construct +# `VectorRetriever::new` against the trait objects without a concrete +# adapter — the concrete adapter (`kb-embed-local`) stays out of this +# crate per the spec's Forbidden deps list. +kb-store-vector = { path = "../kb-store-vector" } +kb-embed = { path = "../kb-embed" } rusqlite = { workspace = true } globset = { workspace = true } serde_json = { workspace = true } @@ -20,3 +28,8 @@ anyhow = { workspace = true } [dev-dependencies] tempfile = { workspace = true } +# Hybrid integration tests inject a `MockEmbedder` (kb-embed `mock` +# feature) and stand up a real `LanceVectorStore` on a tmp directory. +# The mock-retriever unit tests (the bulk of the hybrid suite) do not +# need either, but the integration / snapshot lane does. +kb-embed = { path = "../kb-embed", features = ["mock"] } diff --git a/crates/kb-search/src/citation_helper.rs b/crates/kb-search/src/citation_helper.rs new file mode 100644 index 0000000..d0c0068 --- /dev/null +++ b/crates/kb-search/src/citation_helper.rs @@ -0,0 +1,74 @@ +//! Shared helpers for building `kb_core::Citation` values from a +//! chunk's first `SourceSpan`. +//! +//! Both the lexical and vector retrievers join against the same +//! `chunks.source_spans_json` column and need identical mapping logic +//! so cross-mode citation strings round-trip byte-identically (a +//! requirement for the hybrid retriever's tie-break on chunk_id and +//! for the `search --explain` output documented in design §0 Q3 and +//! §1.6). Living here means a future PDF / image / audio extractor can +//! enrich the mapping in one place rather than two. + +use kb_core::{Citation, SourceSpan, WorkspacePath}; + +/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown +/// only emits `Line`, so the other variants are mostly defensive — we +/// forward them as faithfully as possible so a future PDF / image +/// extractor can flow through without churn. +/// +/// `chunk_id` is taken only for diagnostic logging when the span shape +/// has no Citation mapping (`Byte`-spans, empty arrays). +pub(crate) fn citation_from_first_span( + chunk_id: &str, + path: WorkspacePath, + section: Option, + first_span: Option<&SourceSpan>, +) -> Citation { + match first_span { + Some(SourceSpan::Line { start, end }) => Citation::Line { + path, + start: *start, + end: *end, + section, + }, + Some(SourceSpan::Page { page, .. }) => Citation::Page { + path, + page: *page, + section, + }, + Some(SourceSpan::Region { x, y, w, h }) => Citation::Region { + path, + x: *x, + y: *y, + w: *w, + h: *h, + }, + Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time { + path, + start_ms: *start_ms, + end_ms: *end_ms, + speaker: None, + }, + // Byte-spans don't have a Citation variant. Fall back to a + // Line citation pointing at the document head — better than + // fabricating a position. Spans-empty falls into the same + // branch. + other @ (Some(SourceSpan::Byte { .. }) | None) => { + let span_shape = match other { + Some(_) => "Byte", + None => "empty array", + }; + tracing::warn!( + chunk_id, + span_shape, + "kb-search: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}" + ); + Citation::Line { + path, + start: 1, + end: 1, + section, + } + } + } +} diff --git a/crates/kb-search/src/hybrid.rs b/crates/kb-search/src/hybrid.rs new file mode 100644 index 0000000..b05583b --- /dev/null +++ b/crates/kb-search/src/hybrid.rs @@ -0,0 +1,603 @@ +//! Hybrid retriever — design §3.7 / §6.4 / §0 Q3 / §1.6. +//! +//! Composes a lexical and a vector retriever (both `dyn Retriever`) +//! and dispatches by `SearchMode`. For `Hybrid`, results are fused via +//! Reciprocal Rank Fusion (RRF): +//! +//! ```text +//! score(c) = Σ_{m ∈ {lex, vec}} 1 / (k_rrf + rank_m(c)) +//! ``` +//! +//! where `rank_m(c)` is the 1-based rank of chunk `c` in retriever +//! `m`'s output (chunks not appearing in `m` contribute 0). +//! +//! Each `SearchHit.retrieval` is rebuilt with the per-mode scores / +//! ranks the fusion observed, so `kb search --explain` (§1.6) can +//! show users exactly which retriever contributed what to the final +//! ordering. + +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::Result; +use kb_core::{ + IndexVersion, RetrievalDetail, Retriever, SearchHit, SearchMode, SearchQuery, +}; + +/// Default `k_rrf` if `kb-config::SearchCfg::rrf_k` is misconfigured. +/// Matches §6.4's documented default (60). +const DEFAULT_K_RRF: u32 = 60; + +/// When fanning out for hybrid fusion we ask each side for `k * +/// HYBRID_FANOUT_MULTIPLIER` candidates so the disjoint set of +/// chunks (those a single retriever surfaces but the other does not) +/// is wide enough to feed a useful fused top-k. +/// +/// `2` is the spec-suggested floor; raising it helps recall on +/// adversarial corpora at linear cost. Documented in +/// `tasks/p3/p3-4-hybrid-fusion.md` "Risks / notes". +const HYBRID_FANOUT_MULTIPLIER: usize = 2; + +/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10. +const DEFAULT_K: usize = 10; + +/// Fusion algorithm. Today only Reciprocal Rank Fusion is supported; +/// listing as an enum so future score-calibration policies (P+) can +/// land without an API break. +#[derive(Clone, Copy, Debug)] +pub enum FusionPolicy { + /// Reciprocal Rank Fusion. `k_rrf` is the standard rank-bias + /// hyperparameter (§6.4); larger values flatten the rank-bias + /// curve, smaller values privilege top-of-list hits. + Rrf { k_rrf: u32 }, +} + +/// Hybrid retriever composing a lexical and a vector retriever. +/// +/// For chunks that appear in both retrievers, the lexical-side hit +/// supplies `snippet`, `citation`, `heading_path`, `chunker_version`, +/// and `embedding_model` — lexical search has FTS5 highlighting that's +/// more user-relevant than the vector retriever's truncated text. +/// Vector-only chunks fall through to the vector hit's data verbatim. +/// This matches `kb search --explain` (§1.6) expectations for snippet +/// provenance. +pub struct HybridRetriever { + lexical: Arc, + vector: Arc, + fusion: FusionPolicy, + /// Default `k` for queries that arrive with `k == 0`. Pulled from + /// `config.search.default_k` at construction. + default_k: usize, +} + +impl HybridRetriever { + /// Construct from a `kb-config` Config + the two underlying + /// retrievers. Reads `config.search.hybrid_fusion` (only `"rrf"` + /// is recognised today) and `config.search.rrf_k`. + pub fn new( + config: &kb_config::Config, + lexical: Arc, + vector: Arc, + ) -> Self { + let fusion = parse_fusion(&config.search.hybrid_fusion, config.search.rrf_k); + let default_k = if config.search.default_k == 0 { + DEFAULT_K + } else { + config.search.default_k + }; + // Surface mismatched index_version up front so users see it + // (e.g. lexical at v2, vector at v1 means a stale index that + // the user should refresh). Spec line 144 calls this out as + // a "flag at construction". + let lex_iv = lexical.index_version(); + let vec_iv = vector.index_version(); + if lex_iv.0 != vec_iv.0 { + tracing::warn!( + target: "kb-search", + lexical_index = %lex_iv.0, + vector_index = %vec_iv.0, + "kb-search hybrid: lexical and vector index_version differ; consider re-indexing" + ); + } + Self { + lexical, + vector, + fusion, + default_k, + } + } + + /// Construct with explicit policy / `k`. Used by tests that want + /// to pin RRF parameters without going through `kb-config`. + pub fn with_policy( + lexical: Arc, + vector: Arc, + fusion: FusionPolicy, + default_k: usize, + ) -> Self { + Self { + lexical, + vector, + fusion, + default_k: if default_k == 0 { DEFAULT_K } else { default_k }, + } + } +} + +impl Retriever for HybridRetriever { + fn search(&self, query: &SearchQuery) -> Result> { + match query.mode { + SearchMode::Lexical => self.lexical.search(query), + SearchMode::Vector => self.vector.search(query), + SearchMode::Hybrid => self.fuse(query), + } + } + + fn index_version(&self) -> IndexVersion { + // Composite token so callers (e.g. snapshot tests) can detect + // either side drifting without inspecting both retrievers. + let lex = self.lexical.index_version().0; + let vec = self.vector.index_version().0; + IndexVersion(format!("hybrid:{lex}+{vec}")) + } +} + +impl HybridRetriever { + fn fuse(&self, query: &SearchQuery) -> Result> { + let target_k = if query.k == 0 { self.default_k } else { query.k }; + + // Fanout: ask each retriever for `target_k * MULTIPLIER` so + // the disjoint set of candidates is wide enough. The two + // per-side queries are identical (same text, k, mode, filters); + // only the dispatch differs, so we share one `SearchQuery`. + let fanout_k = target_k.saturating_mul(HYBRID_FANOUT_MULTIPLIER); + let lex_query = SearchQuery { + k: fanout_k, + ..query.clone() + }; + + let lex_hits = self.lexical.search(&lex_query)?; + let vec_hits = self.vector.search(&lex_query)?; + + tracing::debug!( + lex = lex_hits.len(), + vec = vec_hits.len(), + target_k, + "kb-search hybrid: pre-fusion candidate counts" + ); + + // Build (chunk_id → (rank, hit)) maps. The rank stored here + // is the `rank` field on each retriever's output, which is + // already 1-based by both LexicalRetriever and VectorRetriever + // (and any well-behaved Retriever should mirror). + let lex_index: HashMap = lex_hits + .into_iter() + .map(|h| (h.chunk_id.0.clone(), (h.rank, h))) + .collect(); + let vec_index: HashMap = vec_hits + .into_iter() + .map(|h| (h.chunk_id.0.clone(), (h.rank, h))) + .collect(); + + // Union of chunk_ids from both sides. + let mut all_ids: Vec = Vec::with_capacity(lex_index.len() + vec_index.len()); + for k in lex_index.keys() { + all_ids.push(k.clone()); + } + for k in vec_index.keys() { + if !lex_index.contains_key(k) { + all_ids.push(k.clone()); + } + } + + // Compute fused score per chunk. + let FusionPolicy::Rrf { k_rrf } = self.fusion; + let k_rrf_f = f64::from(k_rrf); + + struct Scored { + chunk_id: String, + rrf: f64, + lex_rank: Option, + vec_rank: Option, + } + let mut scored: Vec = all_ids + .into_iter() + .map(|cid| { + let lex_rank = lex_index.get(&cid).map(|(r, _)| *r); + let vec_rank = vec_index.get(&cid).map(|(r, _)| *r); + let mut rrf = 0.0_f64; + if let Some(r) = lex_rank { + rrf += 1.0 / (k_rrf_f + f64::from(r)); + } + if let Some(r) = vec_rank { + rrf += 1.0 / (k_rrf_f + f64::from(r)); + } + Scored { + chunk_id: cid, + rrf, + lex_rank, + vec_rank, + } + }) + .collect(); + + // Sort: rrf DESC, then lex_rank ASC (None last), then chunk_id ASC. + // f64 ordering uses `total_cmp` so NaN stays deterministic + // (won't occur today — k_rrf > 0 → denominators > 0 — but + // total_cmp keeps the sort stable under future tweaks). + scored.sort_by(|a, b| { + b.rrf + .total_cmp(&a.rrf) + .then_with(|| { + let am = a.lex_rank.unwrap_or(u32::MAX); + let bm = b.lex_rank.unwrap_or(u32::MAX); + am.cmp(&bm) + }) + .then_with(|| a.chunk_id.cmp(&b.chunk_id)) + }); + + // Build final SearchHits, taking the top `target_k`. + let mut hits: Vec = Vec::with_capacity(target_k.min(scored.len())); + let mut rank: u32 = 0; + for s in scored.into_iter().take(target_k) { + // Pull the underlying hit. Prefer the lexical side when + // available — its snippet has FTS5 highlighting which + // gives users the most useful preview. Fall back to + // vector if the chunk only appeared in vector results. + let mut base = match (lex_index.get(&s.chunk_id), vec_index.get(&s.chunk_id)) { + (Some((_, lex)), _) => lex.clone(), + (None, Some((_, vec))) => vec.clone(), + // `all_ids` is the union of `lex_index` and + // `vec_index` keys, so this arm cannot fire. + (None, None) => { + unreachable!("chunk_id was in union but absent from both indices") + } + }; + + // `unwrap_or(fusion_score)` covers a defensive-coding case + // that doesn't arise today: when a chunk only appears in + // one retriever, RRF sums a single term so `fusion_score` + // already equals that side's normalized score, making the + // fallback harmless. + let lex_score = lex_index + .get(&s.chunk_id) + .map(|(_, h)| h.retrieval.lexical_score.unwrap_or(h.retrieval.fusion_score)); + let vec_score = vec_index + .get(&s.chunk_id) + .map(|(_, h)| h.retrieval.vector_score.unwrap_or(h.retrieval.fusion_score)); + + rank = rank.saturating_add(1); + base.rank = rank; + base.retrieval = RetrievalDetail { + method: SearchMode::Hybrid, + // RRF is computed in f64 inside `fuse` and cast to f32 + // here at the boundary. `1/(k_rrf+rank)` is bounded + // roughly in `(0, 2/k_rrf]` (≤ ~0.033 at k_rrf=60), so + // the magnitude is well within f32 range and f32 + // precision is more than sufficient for ranking. + fusion_score: s.rrf as f32, + lexical_score: lex_score, + vector_score: vec_score, + lexical_rank: s.lex_rank, + vector_rank: s.vec_rank, + }; + hits.push(base); + } + + tracing::debug!(rows = hits.len(), "kb-search hybrid: search done"); + Ok(hits) + } +} + +/// Parse the `hybrid_fusion` config string into a [`FusionPolicy`]. +/// Today only `"rrf"` is recognised; anything else falls back to RRF +/// with a warn log so misconfiguration is visible but not fatal. +fn parse_fusion(name: &str, k_rrf: u32) -> FusionPolicy { + let k = if k_rrf == 0 { DEFAULT_K_RRF } else { k_rrf }; + match name { + "rrf" => FusionPolicy::Rrf { k_rrf: k }, + other => { + tracing::warn!( + target: "kb-search", + policy = other, + "kb-search hybrid: unknown fusion policy; falling back to RRF" + ); + FusionPolicy::Rrf { k_rrf: k } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, SearchFilters, + SearchHit, SearchMode, WorkspacePath, + }; + use std::sync::Mutex; + + /// Test double: returns a canned `Vec` and records + /// every call so we can assert delegation. + struct CannedRetriever { + hits: Vec, + calls: Mutex>, + version: IndexVersion, + } + + impl CannedRetriever { + fn new(hits: Vec, version: &str) -> Self { + Self { + hits, + calls: Mutex::new(Vec::new()), + version: IndexVersion(version.to_string()), + } + } + } + + impl Retriever for CannedRetriever { + fn search(&self, query: &SearchQuery) -> Result> { + self.calls.lock().unwrap().push(query.clone()); + Ok(self.hits.clone()) + } + fn index_version(&self) -> IndexVersion { + self.version.clone() + } + } + + fn wp(p: &str) -> WorkspacePath { + WorkspacePath::new(p.to_string()).unwrap() + } + + /// Build a synthetic `SearchHit`. Most fields take inert defaults + /// because the hybrid logic only reads `chunk_id`, `rank`, + /// `retrieval.{lexical,vector}_score`, and (transitively) the rest + /// when building the fused output. + fn mk_hit( + chunk_id: &str, + rank: u32, + method: SearchMode, + score: f32, + ) -> SearchHit { + let cid = ChunkId(chunk_id.to_string()); + let did = DocumentId(format!("d-{chunk_id}")); + let path = wp(&format!("notes/{chunk_id}.md")); + SearchHit { + rank, + chunk_id: cid, + doc_id: did, + doc_path: path.clone(), + heading_path: vec![], + section_label: None, + snippet: format!("snippet for {chunk_id}"), + citation: Citation::Line { + path, + start: 1, + end: 1, + section: None, + }, + retrieval: RetrievalDetail { + method, + fusion_score: score, + lexical_score: matches!(method, SearchMode::Lexical | SearchMode::Hybrid) + .then_some(score), + vector_score: matches!(method, SearchMode::Vector | SearchMode::Hybrid) + .then_some(score), + lexical_rank: matches!(method, SearchMode::Lexical | SearchMode::Hybrid) + .then_some(rank), + vector_rank: matches!(method, SearchMode::Vector | SearchMode::Hybrid) + .then_some(rank), + }, + index_version: IndexVersion("v1".to_string()), + embedding_model: None, + chunker_version: ChunkerVersion("v1".to_string()), + } + } + + fn rrf_policy(k_rrf: u32) -> FusionPolicy { + FusionPolicy::Rrf { k_rrf } + } + + fn make_query(mode: SearchMode, k: usize) -> SearchQuery { + SearchQuery { + text: "rust".to_string(), + mode, + k, + filters: SearchFilters::default(), + } + } + + #[test] + fn hybrid_lexical_mode_delegates_to_lexical() { + let lex_hits = vec![mk_hit("aaaa", 1, SearchMode::Lexical, 0.9)]; + let lex = Arc::new(CannedRetriever::new(lex_hits.clone(), "lex-v1")); + let vec = Arc::new(CannedRetriever::new(vec![], "vec-v1")); + let h = HybridRetriever::with_policy(lex.clone(), vec.clone(), rrf_policy(60), 5); + let out = h.search(&make_query(SearchMode::Lexical, 5)).unwrap(); + assert_eq!(out, lex_hits, "lexical mode must pass through verbatim"); + assert_eq!(lex.calls.lock().unwrap().len(), 1, "lexical called once"); + assert_eq!(vec.calls.lock().unwrap().len(), 0, "vector NOT called"); + } + + #[test] + fn hybrid_vector_mode_delegates_to_vector() { + let vec_hits = vec![mk_hit("bbbb", 1, SearchMode::Vector, 0.8)]; + let lex = Arc::new(CannedRetriever::new(vec![], "lex-v1")); + let vec = Arc::new(CannedRetriever::new(vec_hits.clone(), "vec-v1")); + let h = HybridRetriever::with_policy(lex.clone(), vec.clone(), rrf_policy(60), 5); + let out = h.search(&make_query(SearchMode::Vector, 5)).unwrap(); + assert_eq!(out, vec_hits, "vector mode must pass through verbatim"); + assert_eq!(lex.calls.lock().unwrap().len(), 0, "lexical NOT called"); + assert_eq!(vec.calls.lock().unwrap().len(), 1, "vector called once"); + } + + #[test] + fn hybrid_chunk_only_in_lexical_keeps_vector_none() { + // Chunk X is in lexical only. + let lex = Arc::new(CannedRetriever::new( + vec![mk_hit("xxxx", 1, SearchMode::Lexical, 0.9)], + "lex-v1", + )); + let vec = Arc::new(CannedRetriever::new( + vec![mk_hit("yyyy", 1, SearchMode::Vector, 0.8)], + "vec-v1", + )); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 5); + let out = h.search(&make_query(SearchMode::Hybrid, 5)).unwrap(); + // Both X and Y are present. + let xx = out.iter().find(|h| h.chunk_id.0 == "xxxx").unwrap(); + assert_eq!(xx.retrieval.method, SearchMode::Hybrid); + assert!(xx.retrieval.lexical_score.is_some()); + assert_eq!(xx.retrieval.vector_score, None); + assert_eq!(xx.retrieval.lexical_rank, Some(1)); + assert_eq!(xx.retrieval.vector_rank, None); + assert!(xx.retrieval.fusion_score > 0.0); + + let yy = out.iter().find(|h| h.chunk_id.0 == "yyyy").unwrap(); + assert_eq!(yy.retrieval.lexical_score, None); + assert!(yy.retrieval.vector_score.is_some()); + assert_eq!(yy.retrieval.lexical_rank, None); + assert_eq!(yy.retrieval.vector_rank, Some(1)); + } + + #[test] + fn rrf_formula_matches_known_value() { + // chunk A appears at lexical rank 1, vector rank 2; k_rrf=60. + // Expected: 1/(60+1) + 1/(60+2) = 1/61 + 1/62. + let expected = 1.0_f64 / 61.0 + 1.0_f64 / 62.0; + let lex = Arc::new(CannedRetriever::new( + vec![mk_hit("aaaa", 1, SearchMode::Lexical, 0.5)], + "lex-v1", + )); + let vec_hits = vec![ + mk_hit("zzzz", 1, SearchMode::Vector, 0.9), + mk_hit("aaaa", 2, SearchMode::Vector, 0.7), + ]; + let vec = Arc::new(CannedRetriever::new(vec_hits, "vec-v1")); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 5); + let out = h.search(&make_query(SearchMode::Hybrid, 5)).unwrap(); + let a = out.iter().find(|h| h.chunk_id.0 == "aaaa").unwrap(); + let actual = a.retrieval.fusion_score as f64; + // Tolerance: the score is computed in f64 and cast to f32 at + // the API boundary, so any discrepancy must fit within f32 + // precision. `1e-7` is below `f32::EPSILON` (~1.19e-7), which + // makes the check brittle on edge cases. Use a small multiple + // of EPSILON to stay robust. + let tol = f64::from(f32::EPSILON) * 10.0; + assert!( + (actual - expected).abs() < tol, + "RRF score {actual} drifted from expected {expected} (tol {tol})" + ); + } + + #[test] + fn hybrid_tiebreak_prefers_lower_lexical_rank_then_chunk_id() { + // Construct two chunks with identical fused scores. + // Strategy: A appears at lex rank 2 only → score = 1/62. + // B appears at vec rank 2 only → score = 1/62. + // Tie-break: lex_rank ascending (Some(2) < None), so A wins. + let lex = Arc::new(CannedRetriever::new( + vec![ + mk_hit("zzzz", 1, SearchMode::Lexical, 0.9), // rank 1: high RRF, leader + mk_hit("aaaa", 2, SearchMode::Lexical, 0.5), // rank 2 + ], + "lex-v1", + )); + let vec = Arc::new(CannedRetriever::new( + vec![ + mk_hit("zzzz", 1, SearchMode::Vector, 0.9), + mk_hit("bbbb", 2, SearchMode::Vector, 0.5), + ], + "vec-v1", + )); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 5); + let out = h.search(&make_query(SearchMode::Hybrid, 5)).unwrap(); + + // zzzz has both ranks → strictly higher RRF → rank 1. + assert_eq!(out[0].chunk_id.0, "zzzz"); + + // aaaa and bbbb both have a single rank-2 contribution → identical + // RRF. Tie-break: aaaa has lex_rank=Some(2), bbbb has lex_rank=None, + // so aaaa comes first. + assert_eq!(out[1].chunk_id.0, "aaaa"); + assert_eq!(out[2].chunk_id.0, "bbbb"); + + // Now construct two chunks with identical lex rank to verify + // the chunk_id tie-break. CannedRetriever can't produce two + // hits at the same rank via mk_hit's normal flow, so we patch + // `retrieval.lexical_rank` directly after construction. + let mut tied_a = mk_hit("aaaa", 2, SearchMode::Lexical, 0.4); + tied_a.retrieval.lexical_rank = Some(2); + let mut tied_b = mk_hit("bbbb", 2, SearchMode::Lexical, 0.4); + tied_b.retrieval.lexical_rank = Some(2); + let lex3 = Arc::new(CannedRetriever::new( + vec![tied_a, tied_b], + "lex-v1", + )); + let vec3 = Arc::new(CannedRetriever::new(vec![], "vec-v1")); + let h3 = HybridRetriever::with_policy(lex3, vec3, rrf_policy(60), 5); + let out3 = h3.search(&make_query(SearchMode::Hybrid, 5)).unwrap(); + // Same lex_rank=2 → tie-break on chunk_id ascending: aaaa < bbbb. + assert_eq!(out3[0].chunk_id.0, "aaaa"); + assert_eq!(out3[1].chunk_id.0, "bbbb"); + } + + #[test] + fn hybrid_index_version_is_composite() { + let lex = Arc::new(CannedRetriever::new(vec![], "lex-v1")); + let vec = Arc::new(CannedRetriever::new(vec![], "vec-v2")); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 5); + assert_eq!(h.index_version().0, "hybrid:lex-v1+vec-v2"); + } + + #[test] + fn hybrid_disjoint_recall_returns_all_when_k_large_enough() { + // lex returns [A, B], vec returns [C, D]; k=4 → all 4 in result. + let lex = Arc::new(CannedRetriever::new( + vec![ + mk_hit("aaaa", 1, SearchMode::Lexical, 0.9), + mk_hit("bbbb", 2, SearchMode::Lexical, 0.7), + ], + "lex-v1", + )); + let vec = Arc::new(CannedRetriever::new( + vec![ + mk_hit("cccc", 1, SearchMode::Vector, 0.9), + mk_hit("dddd", 2, SearchMode::Vector, 0.7), + ], + "vec-v1", + )); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 4); + let out = h.search(&make_query(SearchMode::Hybrid, 4)).unwrap(); + let mut ids: Vec<&str> = out.iter().map(|h| h.chunk_id.0.as_str()).collect(); + ids.sort(); + assert_eq!(ids, vec!["aaaa", "bbbb", "cccc", "dddd"]); + } + + #[test] + fn hybrid_zero_k_uses_default() { + // With query.k=0, hybrid should use the configured default_k. + let lex = Arc::new(CannedRetriever::new( + (0..20) + .map(|i| mk_hit(&format!("c{i:04}"), i + 1, SearchMode::Lexical, 0.5)) + .collect(), + "lex-v1", + )); + let vec = Arc::new(CannedRetriever::new(vec![], "vec-v1")); + let h = HybridRetriever::with_policy(lex, vec, rrf_policy(60), 7); + let out = h.search(&make_query(SearchMode::Hybrid, 0)).unwrap(); + assert_eq!(out.len(), 7); + } + + #[test] + fn parse_fusion_falls_back_to_rrf_on_unknown() { + let p = parse_fusion("nonsense", 60); + let FusionPolicy::Rrf { k_rrf } = p; + assert_eq!(k_rrf, 60); + } + + #[test] + fn parse_fusion_zero_k_falls_back_to_default() { + let FusionPolicy::Rrf { k_rrf } = parse_fusion("rrf", 0); + assert_eq!(k_rrf, DEFAULT_K_RRF); + } +} diff --git a/crates/kb-search/src/lexical.rs b/crates/kb-search/src/lexical.rs index 6b06dc1..0912348 100644 --- a/crates/kb-search/src/lexical.rs +++ b/crates/kb-search/src/lexical.rs @@ -10,13 +10,15 @@ use std::sync::Arc; use anyhow::{Context, Result}; use globset::GlobMatcher; use kb_core::{ - ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, - Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, - TrustLevel, WorkspacePath, + ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, Retriever, + SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, TrustLevel, + WorkspacePath, }; use kb_store_sqlite::SqliteStore; use rusqlite::{params_from_iter, Connection, Row, ToSql}; +use crate::citation_helper::citation_from_first_span; + // ── Tunables ───────────────────────────────────────────────────────────── /// FTS5 hard limit on the `snippet()` `nToken` argument. @@ -367,7 +369,7 @@ fn build_hit( let workspace_path = WorkspacePath::new(raw.workspace_path) .context("kb-search lexical: documents.workspace_path violates WorkspacePath invariant")?; - let citation = build_citation( + let citation = citation_from_first_span( &raw.chunk_id, workspace_path.clone(), raw.section_label.clone(), @@ -413,64 +415,6 @@ fn normalize_bm25(bm25_raw: f64) -> f32 { normalized as f32 } -/// Build a `Citation` from the chunk's first `SourceSpan`. P1 markdown -/// only emits `Line`, so the other variants are mostly defensive — we -/// forward them as faithfully as possible so a future PDF / image -/// extractor can flow through without churn. -fn build_citation( - chunk_id: &str, - path: WorkspacePath, - section: Option, - first_span: Option<&SourceSpan>, -) -> Citation { - match first_span { - Some(SourceSpan::Line { start, end }) => Citation::Line { - path, - start: *start, - end: *end, - section, - }, - Some(SourceSpan::Page { page, .. }) => Citation::Page { - path, - page: *page, - section, - }, - Some(SourceSpan::Region { x, y, w, h }) => Citation::Region { - path, - x: *x, - y: *y, - w: *w, - h: *h, - }, - Some(SourceSpan::Time { start_ms, end_ms }) => Citation::Time { - path, - start_ms: *start_ms, - end_ms: *end_ms, - speaker: None, - }, - // Byte-spans don't have a Citation variant. Fall back to a Line - // citation pointing at the document head — better than fabricating - // a position. Spans-empty falls into the same branch. - other @ (Some(SourceSpan::Byte { .. }) | None) => { - let span_shape = match other { - Some(_) => "Byte", - None => "empty array", - }; - tracing::warn!( - chunk_id, - span_shape, - "kb-search lexical: SourceSpan has no Citation mapping; falling back to Line {{1, 1}}" - ); - Citation::Line { - path, - start: 1, - end: 1, - section, - } - } - } -} - /// Cap the snippet at `max_chars` characters (Unicode scalar values, not /// bytes — matches the §6.4 setting's "characters" semantics). Returns /// the input unchanged when already short enough. @@ -579,9 +523,10 @@ mod tests { #[test] fn build_citation_line_round_trip() { + use kb_core::Citation; let p = WorkspacePath::new("a/b.md".to_string()).unwrap(); let span = SourceSpan::Line { start: 7, end: 12 }; - let c = build_citation("c1", p.clone(), Some("S1".to_string()), Some(&span)); + let c = citation_from_first_span("c1", p.clone(), Some("S1".to_string()), Some(&span)); match c { Citation::Line { start, @@ -600,13 +545,14 @@ mod tests { #[test] fn build_citation_page_forwards_section() { + use kb_core::Citation; let p = WorkspacePath::new("doc.pdf".to_string()).unwrap(); let span = SourceSpan::Page { page: 4, char_start: None, char_end: None, }; - let c = build_citation("c1", p, Some("Intro".to_string()), Some(&span)); + let c = citation_from_first_span("c1", p, Some("Intro".to_string()), Some(&span)); match c { Citation::Page { page, @@ -622,8 +568,9 @@ mod tests { #[test] fn build_citation_none_falls_back_to_line_one() { + use kb_core::Citation; let p = WorkspacePath::new("x.md".to_string()).unwrap(); - let c = build_citation("c1", p, None, None); + let c = citation_from_first_span("c1", p, None, None); match c { Citation::Line { start, end, .. } => { assert_eq!((start, end), (1, 1)); diff --git a/crates/kb-search/src/lib.rs b/crates/kb-search/src/lib.rs index 76afd55..63708b6 100644 --- a/crates/kb-search/src/lib.rs +++ b/crates/kb-search/src/lib.rs @@ -1,15 +1,26 @@ //! `kb-search` — `kb_core::Retriever` implementations. //! -//! P2-2 ships [`LexicalRetriever`], a SQLite-FTS5-backed retriever for -//! `SearchMode::Lexical`. Vector + Hybrid retrievers land in P3-3 / P3-4. +//! - [`LexicalRetriever`] (P2-2): SQLite-FTS5 + bm25 backed retriever +//! for `SearchMode::Lexical`. +//! - [`VectorRetriever`] (P3-4): wraps a `dyn VectorStore` (typically +//! `kb-store-vector::LanceVectorStore`) and a `dyn Embedder`, +//! hydrating SQLite metadata for full `SearchHit`s. +//! - [`HybridRetriever`] (P3-4): composes lexical + vector retrievers, +//! dispatches by `SearchMode`, fuses Hybrid via [`FusionPolicy::Rrf`]. //! -//! Allowed deps per task spec: `kb-core`, `kb-config`, `kb-store-sqlite`, -//! `rusqlite`, `globset`, `tracing`, `thiserror`, `anyhow`. Forbidden: -//! `kb-source-fs`, `kb-parse-md`, `kb-normalize`, `kb-chunk`, -//! `kb-store-vector`, `kb-embed*`, `kb-llm*`, `kb-rag`, `kb-tui`, -//! `kb-desktop`. Only `serde_json` is a transitive helper used to decode -//! JSON-typed columns from `chunks` / `documents`. +//! Allowed deps per the P2-2 + P3-4 task specs: `kb-core`, `kb-config`, +//! `kb-store-sqlite`, `kb-store-vector`, `kb-embed` (trait re-export +//! only — concrete adapters like `kb-embed-local` are runtime-injected +//! via `Arc`), `rusqlite`, `globset`, `serde_json`, +//! `tracing`, `thiserror`, `anyhow`. Forbidden: `kb-source-fs`, +//! `kb-parse-md`, `kb-normalize`, `kb-chunk`, `kb-embed-local` (concrete +//! adapter), `kb-llm*`, `kb-rag`, `kb-tui`, `kb-desktop`. +mod citation_helper; +mod hybrid; mod lexical; +mod vector; +pub use hybrid::{FusionPolicy, HybridRetriever}; pub use lexical::LexicalRetriever; +pub use vector::VectorRetriever; diff --git a/crates/kb-search/src/vector.rs b/crates/kb-search/src/vector.rs new file mode 100644 index 0000000..8fdd0f9 --- /dev/null +++ b/crates/kb-search/src/vector.rs @@ -0,0 +1,338 @@ +//! Vector retriever — design §3.7 / §7.2 / §1.6. +//! +//! Wraps a `dyn VectorStore` + `dyn Embedder` + the SQLite metadata +//! store into a `kb_core::Retriever`. The vector store knows how to +//! find the nearest chunks by cosine on the embedding column; SQLite +//! owns the human-readable metadata (heading_path / section_label / +//! source_spans / chunker_version / workspace_path) needed for +//! `SearchHit` and `Citation`. The retriever stitches them together +//! per spec §7.2. +//! +//! Snippet policy: this retriever has no FTS5 highlighter to lean on, +//! so the `snippet` field is the chunk text trimmed to +//! `config.search.snippet_chars` Unicode scalar values. The lexical +//! retriever does query-token highlighting; downstream UI code should +//! continue to surface lexical snippets for hybrid hits where the +//! lexical side contributed (handled in `HybridRetriever::search`). + +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use kb_core::{ + ChunkId, ChunkerVersion, DocumentId, Embedder, EmbeddingInput, EmbeddingKind, + IndexVersion, RetrievalDetail, Retriever, SearchHit, SearchMode, SearchQuery, + SourceSpan, VectorHit, VectorStore, WorkspacePath, +}; +use kb_store_sqlite::SqliteStore; +use rusqlite::params_from_iter; + +use crate::citation_helper::citation_from_first_span; + +/// Default `k` when `SearchQuery::k == 0`. Mirrors §6.4 default_k=10 +/// and the lexical retriever's `DEFAULT_K`. +const DEFAULT_K: usize = 10; + +/// Over-fetch multiplier passed to `VectorStore::search` so that +/// SQLite-side filter losses (tags / lang / trust / path_glob) still +/// leave at least `k` candidates. The Lance store already applies the +/// same filters internally; the extra `* 2` is the spec-mandated +/// safety margin for the `Retriever` layer (§7.2 spec line 138). +const VECTOR_OVERFETCH_MULTIPLIER: usize = 2; + +/// Wraps a vector store + embedder into a [`Retriever`]. +/// +/// `VectorStore` is not declared `Send + Sync` in `kb-core::traits`, +/// but `Retriever` requires both. We constrain the trait objects +/// here so callers must hand us implementations that already are +/// (`LanceVectorStore` is `Send + Sync` thanks to its +/// `Connection`/`Runtime` ownership; the trait is sync-method-only). +pub struct VectorRetriever { + store: Arc, + embed: Arc, + sqlite: Arc, + index_version: IndexVersion, + snippet_chars: usize, +} + +impl VectorRetriever { + /// Construct with `index_version` derived from the configured + /// embedding model + dimensions, and snippet width pulled from + /// `kb-config`'s defaults. + /// + /// The explicit `index_version` form is [`Self::with_settings`]. + pub fn new( + store: Arc, + embed: Arc, + sqlite: Arc, + index_version: IndexVersion, + ) -> Self { + let cfg = kb_config::Config::defaults(); + Self::with_settings(store, embed, sqlite, index_version, cfg.search.snippet_chars) + } + + /// Construct with explicit `snippet_chars`. Mirrors the lexical + /// retriever's `with_settings` constructor for callers that have + /// already loaded a `Config`. + pub fn with_settings( + store: Arc, + embed: Arc, + sqlite: Arc, + index_version: IndexVersion, + snippet_chars: usize, + ) -> Self { + Self { + store, + embed, + sqlite, + index_version, + snippet_chars, + } + } +} + +impl Retriever for VectorRetriever { + fn search(&self, query: &SearchQuery) -> Result> { + let k = if query.k == 0 { DEFAULT_K } else { query.k }; + tracing::debug!( + text_len = query.text.len(), + k, + "kb-search vector: search start" + ); + + // Empty / whitespace-only queries — short-circuit. The + // embedder would still produce a vector for an empty string, + // but nearest-neighbours on the centroid of "" is meaningless + // and only forces a wasted Lance scan. + if query.text.trim().is_empty() { + return Ok(Vec::new()); + } + + // 1. Embed the query as `Query` kind (e5-style asymmetry — + // documents and queries have different prefixes). + let inputs = [EmbeddingInput { + text: &query.text, + kind: EmbeddingKind::Query, + }]; + let mut embeddings = self + .embed + .embed(&inputs) + .context("kb-search vector: embed query")?; + if embeddings.len() != 1 { + anyhow::bail!( + "kb-search vector: embedder returned {} vectors for one input", + embeddings.len() + ); + } + let query_vec = embeddings.remove(0); + + // 2. Over-fetch from the vector store. The Lance store + // applies `filter_chunks` internally, so we pass `query.filters` + // through and trust the post-filter pass to honour them. + // `saturating_mul(2)` is always ≥ k for any usize k, so we + // don't need an extra `.max(k)` clamp. + let overfetch = k.saturating_mul(VECTOR_OVERFETCH_MULTIPLIER); + let raw_hits = self + .store + .search(&query_vec, overfetch, &query.filters) + .context("kb-search vector: VectorStore::search")?; + + if raw_hits.is_empty() { + tracing::debug!("kb-search vector: store returned no hits"); + return Ok(Vec::new()); + } + + // 3. Hydrate metadata from SQLite for the candidate ids in + // one round-trip. Order is preserved by the caller via the + // HashMap lookup at hit-construction time. + let candidate_ids: Vec<&str> = + raw_hits.iter().map(|h| h.chunk_id.0.as_str()).collect(); + let hydration = hydrate_chunks(&self.sqlite, &candidate_ids) + .context("kb-search vector: hydrate chunk metadata")?; + + // 4. Build `SearchHit` for the first `k` raw hits that pass + // hydration (a missing row would be a filter-induced drop — + // Lance returned the chunk but SQLite filtered it out, or + // the chunk was deleted between Lance's read and ours). + let model_id = self.embed.model_id(); + let mut hits: Vec = Vec::with_capacity(k.min(raw_hits.len())); + let mut rank: u32 = 0; + for hit in raw_hits { + let Some(meta) = hydration.get(hit.chunk_id.0.as_str()) else { + continue; + }; + rank = rank.saturating_add(1); + hits.push(build_hit( + hit, + meta, + rank, + &self.index_version, + &model_id, + self.snippet_chars, + )?); + if hits.len() >= k { + break; + } + } + + tracing::debug!(rows = hits.len(), "kb-search vector: search done"); + Ok(hits) + } + + fn index_version(&self) -> IndexVersion { + self.index_version.clone() + } +} + +// ── Hydration ──────────────────────────────────────────────────────────── + +/// Subset of `chunks` + `documents` metadata needed to build a +/// `SearchHit` from a `VectorHit`. Pulled in one round-trip so the +/// per-hit construction loop stays O(1) per row. +struct ChunkMeta { + text: String, + heading_path_json: String, + section_label: Option, + source_spans_json: String, + chunker_version: String, + doc_id: String, + workspace_path: String, +} + +fn hydrate_chunks( + sqlite: &SqliteStore, + chunk_ids: &[&str], +) -> Result> { + if chunk_ids.is_empty() { + return Ok(HashMap::new()); + } + // Deduplicate the IN-list — Lance can repeat a chunk_id across + // batches in pathological cases. A HashMap key dedupes in the + // result anyway, but keeping the placeholder count tight is good + // hygiene. + let mut seen = std::collections::HashSet::new(); + let unique: Vec<&str> = chunk_ids + .iter() + .copied() + .filter(|id| seen.insert(*id)) + .collect(); + + let placeholders = vec!["?"; unique.len()].join(","); + let sql = format!( + "SELECT \ + c.chunk_id, c.text, c.heading_path_json, c.section_label, \ + c.source_spans_json, c.chunker_version, \ + c.doc_id, d.workspace_path \ + FROM chunks c \ + JOIN documents d ON d.doc_id = c.doc_id \ + WHERE c.chunk_id IN ({placeholders})" + ); + let conn = sqlite.read_conn(); + let mut stmt = conn + .prepare(&sql) + .context("kb-search vector: prepare hydration statement")?; + let rows = stmt + .query_map( + // `unique` is a `Vec<&str>`; `&str` implements `ToSql` + // directly, so we hand the iterator straight to + // `params_from_iter` without copying. + params_from_iter(unique.iter().copied()), + |row| { + let chunk_id: String = row.get(0)?; + Ok(( + chunk_id, + ChunkMeta { + text: row.get(1)?, + heading_path_json: row.get(2)?, + section_label: row.get(3)?, + source_spans_json: row.get(4)?, + chunker_version: row.get(5)?, + doc_id: row.get(6)?, + workspace_path: row.get(7)?, + }, + )) + }, + ) + .context("kb-search vector: execute hydration query")?; + let mut out: HashMap = HashMap::with_capacity(unique.len()); + for row in rows { + let (chunk_id, meta) = + row.context("kb-search vector: read hydration row")?; + out.insert(chunk_id, meta); + } + Ok(out) +} + +fn build_hit( + hit: VectorHit, + meta: &ChunkMeta, + rank: u32, + index_version: &IndexVersion, + model_id: &kb_core::EmbeddingModelId, + snippet_chars: usize, +) -> Result { + let heading_path: Vec = serde_json::from_str(&meta.heading_path_json) + .context("kb-search vector: deserialize heading_path_json")?; + let source_spans: Vec = serde_json::from_str(&meta.source_spans_json) + .context("kb-search vector: deserialize source_spans_json")?; + + let workspace_path = WorkspacePath::new(meta.workspace_path.clone()).context( + "kb-search vector: documents.workspace_path violates WorkspacePath invariant", + )?; + let citation = citation_from_first_span( + &hit.chunk_id.0, + workspace_path.clone(), + meta.section_label.clone(), + source_spans.first(), + ); + let snippet = trim_snippet(&meta.text, snippet_chars); + + let score = hit.score; + Ok(SearchHit { + rank, + chunk_id: ChunkId(hit.chunk_id.0), + doc_id: DocumentId(meta.doc_id.clone()), + doc_path: workspace_path, + heading_path, + section_label: meta.section_label.clone(), + snippet, + citation, + retrieval: RetrievalDetail { + method: SearchMode::Vector, + fusion_score: score, + lexical_score: None, + vector_score: Some(score), + lexical_rank: None, + vector_rank: Some(rank), + }, + index_version: index_version.clone(), + embedding_model: Some(model_id.clone()), + chunker_version: ChunkerVersion(meta.chunker_version.clone()), + }) +} + +/// Cap the snippet at `max_chars` Unicode scalar values. Mirrors +/// `lexical::trim_snippet` so the two retrievers produce identically +/// shaped snippets for hybrid output. +fn trim_snippet(s: &str, max_chars: usize) -> String { + if s.chars().count() <= max_chars { + return s.to_string(); + } + s.chars().take(max_chars).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn trim_snippet_caps_at_char_count() { + let s = "a".repeat(300); + assert_eq!(trim_snippet(&s, 220).chars().count(), 220); + } + + #[test] + fn trim_snippet_passthrough_when_short() { + assert_eq!(trim_snippet("short", 220), "short"); + } +} diff --git a/crates/kb-search/tests/common/mod.rs b/crates/kb-search/tests/common/mod.rs new file mode 100644 index 0000000..5ff9db8 --- /dev/null +++ b/crates/kb-search/tests/common/mod.rs @@ -0,0 +1,216 @@ +//! Shared scaffolding for kb-search hybrid integration tests. +//! +//! # Test policy +//! +//! Integration tests in `hybrid.rs` that touch `LanceVectorStore` +//! are marked `#[ignore]` AND call [`require_avx_or_panic`] inside +//! the test body so a `--ignored` invocation on a non-AVX host +//! fails loudly with a clear message rather than crashing later +//! inside Lance's f32 SIMD kernel with `SIGILL`. +//! +//! See `crates/kb-store-vector/tests/common/mod.rs` for the +//! original P3-3 rationale; this is a copy because that crate's +//! test commons are test-only and not part of its public surface. + +#![allow(dead_code)] + +use std::sync::Arc; + +use kb_config::Config; +use kb_core::{ + ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind, + EmbeddingModelId, EmbeddingVersion, IndexVersion, VectorRecord, VectorStore, +}; +use kb_embed::{Embedder, MockEmbedder}; +use kb_search::{LexicalRetriever, VectorRetriever}; +use kb_store_sqlite::SqliteStore; +use kb_store_vector::LanceVectorStore; +use rusqlite::params; +use tempfile::TempDir; + +/// Panic if the host CPU lacks AVX. Called from every `#[ignore]`-d +/// integration test body so that `cargo test -- --ignored` on a +/// non-AVX host fails loudly with a clear message instead of crashing +/// later inside a Lance SIMD kernel with `SIGILL`. +pub fn require_avx_or_panic() { + #[cfg(target_arch = "x86_64")] + { + if !std::is_x86_feature_detected!("avx") { + panic!( + "kb-search hybrid integration test requires AVX-capable hardware; \ + host CPU lacks AVX. Run on an AVX-capable machine." + ); + } + } +} + +/// Index version label used by hybrid integration tests so the +/// `index_version()` composite token is predictable in snapshots. +pub const TEST_LEX_INDEX_VERSION: &str = "v1.0-lex"; +pub const TEST_VEC_INDEX_VERSION: &str = "v1.0-vec"; + +/// Embedding dimensions for tests. Kept small so MockEmbedder runs +/// fast and the Lance table stays compact on disk; production uses +/// 384 (multilingual-e5-small) but the retriever code is dim-agnostic. +pub const TEST_DIMENSIONS: usize = 16; +pub const TEST_MODEL_ID: &str = "mock-e5"; + +pub struct HybridEnv { + pub temp: TempDir, + pub config: Config, + pub sqlite: Arc, + pub vector_store: Arc, + pub embedder: Arc, +} + +impl HybridEnv { + pub fn new() -> Self { + let temp = tempfile::tempdir().expect("tempdir"); + let mut config = Config::defaults(); + config.storage.data_dir = temp.path().to_string_lossy().into_owned(); + let sqlite = SqliteStore::open(&config).unwrap(); + sqlite.run_migrations().unwrap(); + let sqlite = Arc::new(sqlite); + let vector_store = + Arc::new(LanceVectorStore::new(&config, sqlite.clone()).unwrap()); + let embedder = Arc::new(MockEmbedder::new( + EmbeddingModelId(TEST_MODEL_ID.to_string()), + EmbeddingVersion("v1".to_string()), + TEST_DIMENSIONS, + )); + Self { + temp, + config, + sqlite, + vector_store, + embedder, + } + } + + /// Build a `LexicalRetriever` over the shared SQLite store. + pub fn lexical_retriever(&self) -> LexicalRetriever { + LexicalRetriever::new( + Arc::clone(&self.sqlite), + IndexVersion(TEST_LEX_INDEX_VERSION.to_string()), + ) + } + + /// Build a `VectorRetriever` over the shared LanceVectorStore + + /// MockEmbedder + SQLite store. + pub fn vector_retriever(&self) -> VectorRetriever { + let store: Arc = + Arc::clone(&self.vector_store) as Arc; + let embed: Arc = + Arc::clone(&self.embedder) as Arc; + VectorRetriever::new( + store, + embed, + Arc::clone(&self.sqlite), + IndexVersion(TEST_VEC_INDEX_VERSION.to_string()), + ) + } + + /// Insert (asset, document, document_tags, chunk) rows directly. + /// We seed without going through `DocumentStore::put_document` + /// to keep this crate's test deps inside the Allowed list (no + /// `kb-parse-md` / `kb-normalize` / `kb-chunk`). The `chunks` row + /// also fires the V002 FTS5 triggers, so the lexical retriever + /// can find the row by `MATCH` without a manual rebuild. + pub fn seed_chunk( + &self, + chunk_id: &str, + doc_id: &str, + workspace_path: &str, + text: &str, + heading_path: &[&str], + tags: &[&str], + ) { + let asset_id = format!("a{}", &doc_id[..31]); + let conn = self.sqlite.read_conn(); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '\"markdown\"', 0, + 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', ?, '1970-01-01T00:00:00Z')", + params![ + asset_id, + format!("file://{workspace_path}"), + workspace_path, + workspace_path, + ], + ) + .unwrap(); + conn.execute( + "INSERT OR IGNORE INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1, + '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, workspace_path], + ) + .unwrap(); + for t in tags { + conn.execute( + "INSERT OR IGNORE INTO document_tags (doc_id, tag) VALUES (?, ?)", + params![doc_id, t], + ) + .unwrap(); + } + let heading_json = serde_json::to_string(heading_path).unwrap(); + conn.execute( + "INSERT OR IGNORE INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, ?, ?, NULL, + '[{\"kind\":\"line\",\"start\":1,\"end\":3}]', + 1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id, text, heading_json], + ) + .unwrap(); + } + + /// Embed `text` as a Document and upsert it as the embedding for + /// `chunk_id`. Drives the same code path production uses: + /// MockEmbedder → VectorRecord → LanceVectorStore::upsert → + /// embedding_records committed. + pub fn embed_and_upsert( + &self, + chunk_id: &str, + doc_id: &str, + text: &str, + heading_path: &[&str], + ) { + let inputs = [EmbeddingInput { + text, + kind: EmbeddingKind::Document, + }]; + let mut vecs = self.embedder.embed(&inputs).unwrap(); + let vector = vecs.remove(0); + let record = VectorRecord { + chunk_id: ChunkId(chunk_id.to_string()), + embedding_id: EmbeddingId(format!("e{}", &chunk_id[..31])), + vector, + doc_id: DocumentId(doc_id.to_string()), + text: text.to_string(), + heading_path: heading_path.iter().map(|s| s.to_string()).collect(), + model_id: EmbeddingModelId(TEST_MODEL_ID.to_string()), + model_version: EmbeddingVersion("v1".to_string()), + dimensions: TEST_DIMENSIONS, + }; + self.vector_store.upsert(&[record]).unwrap(); + } +} + +/// Pad a short prefix to the 32-hex shape `kb_core` newtypes expect. +pub fn id32(prefix: &str) -> String { + let mut s = prefix.to_string(); + while s.len() < 32 { + s.push('0'); + } + s.truncate(32); + s +} diff --git a/crates/kb-search/tests/fixtures/search/hybrid/run-1.json b/crates/kb-search/tests/fixtures/search/hybrid/run-1.json new file mode 100644 index 0000000..6321460 --- /dev/null +++ b/crates/kb-search/tests/fixtures/search/hybrid/run-1.json @@ -0,0 +1,42 @@ +[ + { + "chunk_id": "c1000000000000000000000000000000", + "fusion_score_positive": true, + "lex_some": true, + "lexical_rank": 1, + "method": "hybrid", + "rank": 1, + "vec_some": true, + "vector_rank": 3 + }, + { + "chunk_id": "c2000000000000000000000000000000", + "fusion_score_positive": true, + "lex_some": true, + "lexical_rank": 2, + "method": "hybrid", + "rank": 2, + "vec_some": true, + "vector_rank": 2 + }, + { + "chunk_id": "c4000000000000000000000000000000", + "fusion_score_positive": true, + "lex_some": false, + "lexical_rank": null, + "method": "hybrid", + "rank": 3, + "vec_some": true, + "vector_rank": 1 + }, + { + "chunk_id": "c3000000000000000000000000000000", + "fusion_score_positive": true, + "lex_some": false, + "lexical_rank": null, + "method": "hybrid", + "rank": 4, + "vec_some": true, + "vector_rank": 4 + } +] \ No newline at end of file diff --git a/crates/kb-search/tests/hybrid.rs b/crates/kb-search/tests/hybrid.rs new file mode 100644 index 0000000..b9279a6 --- /dev/null +++ b/crates/kb-search/tests/hybrid.rs @@ -0,0 +1,213 @@ +//! Hybrid integration tests — touch a real `LanceVectorStore` + +//! `SqliteStore` + `MockEmbedder`. These tests are `#[ignore]`-d and +//! AVX-gated; see `tests/common/mod.rs` for the policy rationale. +//! +//! Mock-retriever unit tests live alongside the implementation in +//! `crates/kb-search/src/hybrid.rs` (no Lance, no AVX needed) — the +//! tests here exercise the full plumbing with the real Lance store. + +mod common; + +use std::path::PathBuf; +use std::sync::Arc; + +use common::{ + HybridEnv, id32, require_avx_or_panic, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION, +}; +use kb_core::{ + Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, +}; +use kb_search::{FusionPolicy, HybridRetriever}; +use serde_json::json; + +fn build_hybrid(env: &HybridEnv) -> HybridRetriever { + let lex: Arc = Arc::new(env.lexical_retriever()); + let vec: Arc = Arc::new(env.vector_retriever()); + HybridRetriever::with_policy(lex, vec, FusionPolicy::Rrf { k_rrf: 60 }, 5) +} + +/// Seed a tiny corpus that lets us prove hybrid recall ≥ each side +/// independently. Two chunks are lexical-only matches ("rust cargo"); +/// two chunks are vector-only matches (their text doesn't contain +/// the query token but their embedding still scores nearby because +/// MockEmbedder's hash distributes over all chunks). +fn seed_disjoint_corpus(env: &HybridEnv) -> Vec { + // The lexical side will only match chunks that contain the query + // tokens. The vector side will rank ALL chunks by embedding + // similarity to the query — even ones whose text doesn't share + // a token with the query. + let chunks = [ + // (chunk_id, doc_id, path, text, headings) + (id32("c1"), id32("d1"), "notes/rust1.md", "rust cargo macros", &["A"][..]), + (id32("c2"), id32("d2"), "notes/rust2.md", "rust traits and lifetimes", &["B"][..]), + (id32("c3"), id32("d3"), "notes/python.md", "python dataclasses tutorial", &["C"][..]), + (id32("c4"), id32("d4"), "notes/go.md", "go interfaces and channels", &["D"][..]), + ]; + let mut ids = Vec::new(); + for (cid, did, path, text, headings) in &chunks { + env.seed_chunk(cid, did, path, text, headings, &[]); + env.embed_and_upsert(cid, did, text, headings); + ids.push(cid.clone()); + } + ids +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn hybrid_recall_disjoint_returns_union() { + require_avx_or_panic(); + let env = HybridEnv::new(); + let _ids = seed_disjoint_corpus(&env); + let h = build_hybrid(&env); + + let q = SearchQuery { + text: "rust".to_string(), + mode: SearchMode::Hybrid, + k: 4, + filters: SearchFilters::default(), + }; + let hits = h.search(&q).unwrap(); + + // The vector side will return up to 4 candidates regardless of + // text overlap; the lexical side will return only the rust* ones. + // Together the union must cover at least the lexical hits AND + // include at least one non-lexical chunk if vector found one. + assert!(!hits.is_empty(), "hybrid must return at least one hit"); + // Every hit's RetrievalDetail.method must be Hybrid. + for h in &hits { + assert_eq!(h.retrieval.method, SearchMode::Hybrid); + // At least one of lex/vec_score must be Some. + assert!( + h.retrieval.lexical_score.is_some() || h.retrieval.vector_score.is_some(), + "hybrid hit must carry at least one mode's score" + ); + } + // index_version composite token. + let iv = h.index_version(); + assert!(iv.0.starts_with("hybrid:")); + assert!(iv.0.contains(TEST_LEX_INDEX_VERSION)); + assert!(iv.0.contains(TEST_VEC_INDEX_VERSION)); + + // Lexical-only chunks (c1, c2) MUST appear: they're the only ones + // matching the FTS5 query, and the vector side over-fetches enough + // to include them too. + let ids: Vec<&str> = hits.iter().map(|h| h.chunk_id.0.as_str()).collect(); + assert!(ids.contains(&id32("c1").as_str())); + assert!(ids.contains(&id32("c2").as_str())); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn hybrid_determinism_same_query_twice() { + require_avx_or_panic(); + let env = HybridEnv::new(); + let _ = seed_disjoint_corpus(&env); + let h = build_hybrid(&env); + + let q = SearchQuery { + text: "rust".to_string(), + mode: SearchMode::Hybrid, + k: 4, + filters: SearchFilters::default(), + }; + let a = h.search(&q).unwrap(); + let b = h.search(&q).unwrap(); + assert_eq!(a, b, "identical query must yield byte-identical Vec"); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn hybrid_snapshot_run_1() { + require_avx_or_panic(); + let env = HybridEnv::new(); + let _ = seed_disjoint_corpus(&env); + let h = build_hybrid(&env); + + let q = SearchQuery { + text: "rust".to_string(), + mode: SearchMode::Hybrid, + k: 4, + filters: SearchFilters::default(), + }; + let hits = h.search(&q).unwrap(); + + // Snapshot pins the structural shape: + // - chunk_id ordering + // - which side contributed (lexical_rank / vector_rank + // populated as Some/None) + // - that fusion_score is non-increasing + // - method = Hybrid for every hit + let actual = json!( + hits.iter().map(|h: &SearchHit| json!({ + "chunk_id": h.chunk_id.0, + "rank": h.rank, + "method": h.retrieval.method, + "lexical_rank": h.retrieval.lexical_rank, + "vector_rank": h.retrieval.vector_rank, + "lex_some": h.retrieval.lexical_score.is_some(), + "vec_some": h.retrieval.vector_score.is_some(), + "fusion_score_positive": h.retrieval.fusion_score > 0.0, + })).collect::>() + ); + + let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("search") + .join("hybrid") + .join("run-1.json"); + + if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() { + std::fs::create_dir_all(fixture.parent().unwrap()).unwrap(); + std::fs::write(&fixture, serde_json::to_string_pretty(&actual).unwrap()).unwrap(); + eprintln!("[snapshot] regenerated {}", fixture.display()); + // Fail loudly so that accidentally setting KB_UPDATE_SNAPSHOTS + // in CI surfaces as a test failure rather than a silent + // overwrite + green run. Same fail-loud-instead-of-silent-pass + // philosophy as P3-2's `SNAPSHOT_HASH_BASELINE = 0` and P3-3's + // placeholder fixture guards. + panic!( + "[snapshot] regenerated {}, re-run without KB_UPDATE_SNAPSHOTS to verify pin", + fixture.display() + ); + } + + let expected: serde_json::Value = + serde_json::from_str(&std::fs::read_to_string(&fixture).unwrap_or_else(|_| { + panic!( + "missing snapshot fixture at {}; run with \ + KB_UPDATE_SNAPSHOTS=1 to create", + fixture.display() + ) + })) + .unwrap(); + + // Refuse to silently "pass" against the committed placeholder. The + // placeholder JSON carries a `_comment` field with regeneration + // instructions; production fixtures (a captured list) do not. + if expected.get("_comment").is_some() { + panic!( + "snapshot fixture is a placeholder — regenerate on AVX hardware then commit. \ + Path: {}. To regenerate: \ + `KB_UPDATE_SNAPSHOTS=1 cargo test -p kb-search -- --ignored hybrid_snapshot`.", + fixture.display() + ); + } + + assert_eq!( + actual, expected, + "hybrid snapshot drift; rerun with KB_UPDATE_SNAPSHOTS=1 to regenerate" + ); + + // Independent guard: fusion scores must be non-increasing across + // the result list (rrf is rank-biased, so this is the + // semantically-correct ordering invariant). + for w in hits.windows(2) { + assert!( + w[0].retrieval.fusion_score >= w[1].retrieval.fusion_score, + "fusion scores not in descending order: {} then {}", + w[0].retrieval.fusion_score, + w[1].retrieval.fusion_score + ); + } +}