refactor(rename): kb crates → kebab — Cargo packages, folders, Rust modules
프로젝트 이름 `kb` → `kebab` rename 의 첫 단계. - workspace `Cargo.toml`: members `crates/kb-*` → `crates/kebab-*`, repository URL `altair823/kb` → `altair823/kebab`. - 18 crate 폴더 rename via `git mv` (history 보존). - 각 crate `Cargo.toml`: `name = "kb-*"` → `"kebab-*"`, path deps `../kb-*` → `../kebab-*`. - 모든 `.rs`: `kb_<id>` snake-case 모듈 path 18 개 (`kb_core`, `kb_config`, `kb_app`, `kb_cli`, `kb_eval`, `kb_search`, `kb_chunk`, `kb_normalize`, `kb_source_fs`, `kb_parse_md`, `kb_parse_types`, `kb_store_sqlite`, `kb_store_vector`, `kb_embed`, `kb_embed_local`, `kb_llm`, `kb_llm_local`, `kb_rag`) → `kebab_<id>` 일괄 sed (단어 경계 \\b 사용해 영어 문장 안의 "kb" 약어 미오염). CLI binary 이름 (`[[bin]] name = "kb"`), 환경변수 `KB_*`, XDG paths, tracing target, 그리고 docs sweep 은 다음 commit 에서. ## 검증 - `cargo check --workspace` clean — 모든 crate 빌드 통과 후 commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
216
crates/kebab-search/tests/common/mod.rs
Normal file
216
crates/kebab-search/tests/common/mod.rs
Normal file
@@ -0,0 +1,216 @@
|
||||
//! Shared scaffolding for kebab-search hybrid integration tests.
|
||||
//!
|
||||
//! # Test policy
|
||||
//!
|
||||
//! Integration tests in `hybrid.rs` that touch `LanceVectorStore`
|
||||
//! are marked `#[ignore]` AND call [`require_avx_or_panic`] inside
|
||||
//! the test body so a `--ignored` invocation on a non-AVX host
|
||||
//! fails loudly with a clear message rather than crashing later
|
||||
//! inside Lance's f32 SIMD kernel with `SIGILL`.
|
||||
//!
|
||||
//! See `crates/kebab-store-vector/tests/common/mod.rs` for the
|
||||
//! original P3-3 rationale; this is a copy because that crate's
|
||||
//! test commons are test-only and not part of its public surface.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{
|
||||
ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind,
|
||||
EmbeddingModelId, EmbeddingVersion, IndexVersion, VectorRecord, VectorStore,
|
||||
};
|
||||
use kebab_embed::{Embedder, MockEmbedder};
|
||||
use kebab_search::{LexicalRetriever, VectorRetriever};
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
use kebab_store_vector::LanceVectorStore;
|
||||
use rusqlite::params;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Abort the calling test when the host CPU has no AVX support.
///
/// Every `#[ignore]`-d integration test calls this first, so that
/// `cargo test -- --ignored` on a non-AVX machine produces a readable
/// panic instead of a `SIGILL` raised later inside a Lance SIMD kernel.
///
/// The probe is compiled only for `x86_64`; on every other architecture
/// this function is a no-op.
///
/// # Panics
///
/// Panics on `x86_64` hosts where the `avx` feature is not detected at
/// runtime.
pub fn require_avx_or_panic() {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx = std::is_x86_feature_detected!("avx");
        assert!(
            has_avx,
            "kb-search hybrid integration test requires AVX-capable hardware; \
             host CPU lacks AVX. Run on an AVX-capable machine."
        );
    }
}
|
||||
|
||||
/// Index-version label given to the lexical retriever in hybrid tests.
/// Fixed so the composite `index_version()` token is predictable in
/// snapshots.
pub const TEST_LEX_INDEX_VERSION: &str = "v1.0-lex";

/// Index-version label given to the vector retriever in hybrid tests;
/// the vector-side counterpart of `TEST_LEX_INDEX_VERSION`.
pub const TEST_VEC_INDEX_VERSION: &str = "v1.0-vec";

/// Model identifier the test embedder and the seeded vector records carry.
pub const TEST_MODEL_ID: &str = "mock-e5";

/// Embedding width used by the tests. Deliberately tiny so MockEmbedder
/// stays fast and the Lance table stays compact on disk; production uses
/// 384 (multilingual-e5-small) but the retriever code is dim-agnostic.
pub const TEST_DIMENSIONS: usize = 16;
|
||||
|
||||
pub struct HybridEnv {
|
||||
pub temp: TempDir,
|
||||
pub config: Config,
|
||||
pub sqlite: Arc<SqliteStore>,
|
||||
pub vector_store: Arc<LanceVectorStore>,
|
||||
pub embedder: Arc<MockEmbedder>,
|
||||
}
|
||||
|
||||
impl HybridEnv {
|
||||
pub fn new() -> Self {
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
|
||||
let sqlite = SqliteStore::open(&config).unwrap();
|
||||
sqlite.run_migrations().unwrap();
|
||||
let sqlite = Arc::new(sqlite);
|
||||
let vector_store =
|
||||
Arc::new(LanceVectorStore::new(&config, sqlite.clone()).unwrap());
|
||||
let embedder = Arc::new(MockEmbedder::new(
|
||||
EmbeddingModelId(TEST_MODEL_ID.to_string()),
|
||||
EmbeddingVersion("v1".to_string()),
|
||||
TEST_DIMENSIONS,
|
||||
));
|
||||
Self {
|
||||
temp,
|
||||
config,
|
||||
sqlite,
|
||||
vector_store,
|
||||
embedder,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a `LexicalRetriever` over the shared SQLite store.
|
||||
pub fn lexical_retriever(&self) -> LexicalRetriever {
|
||||
LexicalRetriever::new(
|
||||
Arc::clone(&self.sqlite),
|
||||
IndexVersion(TEST_LEX_INDEX_VERSION.to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Build a `VectorRetriever` over the shared LanceVectorStore +
|
||||
/// MockEmbedder + SQLite store.
|
||||
pub fn vector_retriever(&self) -> VectorRetriever {
|
||||
let store: Arc<dyn VectorStore + Send + Sync> =
|
||||
Arc::clone(&self.vector_store) as Arc<dyn VectorStore + Send + Sync>;
|
||||
let embed: Arc<dyn Embedder> =
|
||||
Arc::clone(&self.embedder) as Arc<dyn Embedder>;
|
||||
VectorRetriever::new(
|
||||
store,
|
||||
embed,
|
||||
Arc::clone(&self.sqlite),
|
||||
IndexVersion(TEST_VEC_INDEX_VERSION.to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Insert (asset, document, document_tags, chunk) rows directly.
|
||||
/// We seed without going through `DocumentStore::put_document`
|
||||
/// to keep this crate's test deps inside the Allowed list (no
|
||||
/// `kb-parse-md` / `kb-normalize` / `kb-chunk`). The `chunks` row
|
||||
/// also fires the V002 FTS5 triggers, so the lexical retriever
|
||||
/// can find the row by `MATCH` without a manual rebuild.
|
||||
pub fn seed_chunk(
|
||||
&self,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
tags: &[&str],
|
||||
) {
|
||||
let asset_id = format!("a{}", &doc_id[..31]);
|
||||
let conn = self.sqlite.read_conn();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, '\"markdown\"', 0,
|
||||
'deadbeefdeadbeefdeadbeefdeadbeef',
|
||||
'reference', ?, '1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file://{workspace_path}"),
|
||||
workspace_path,
|
||||
workspace_path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, parser_version, doc_version, schema_version,
|
||||
metadata_json, provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
|
||||
'{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, workspace_path],
|
||||
)
|
||||
.unwrap();
|
||||
for t in tags {
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO document_tags (doc_id, tag) VALUES (?, ?)",
|
||||
params![doc_id, t],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
let heading_json = serde_json::to_string(heading_path).unwrap();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, NULL,
|
||||
'[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
|
||||
1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id, text, heading_json],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// Embed `text` as a Document and upsert it as the embedding for
|
||||
/// `chunk_id`. Drives the same code path production uses:
|
||||
/// MockEmbedder → VectorRecord → LanceVectorStore::upsert →
|
||||
/// embedding_records committed.
|
||||
pub fn embed_and_upsert(
|
||||
&self,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
) {
|
||||
let inputs = [EmbeddingInput {
|
||||
text,
|
||||
kind: EmbeddingKind::Document,
|
||||
}];
|
||||
let mut vecs = self.embedder.embed(&inputs).unwrap();
|
||||
let vector = vecs.remove(0);
|
||||
let record = VectorRecord {
|
||||
chunk_id: ChunkId(chunk_id.to_string()),
|
||||
embedding_id: EmbeddingId(format!("e{}", &chunk_id[..31])),
|
||||
vector,
|
||||
doc_id: DocumentId(doc_id.to_string()),
|
||||
text: text.to_string(),
|
||||
heading_path: heading_path.iter().map(|s| s.to_string()).collect(),
|
||||
model_id: EmbeddingModelId(TEST_MODEL_ID.to_string()),
|
||||
model_version: EmbeddingVersion("v1".to_string()),
|
||||
dimensions: TEST_DIMENSIONS,
|
||||
};
|
||||
self.vector_store.upsert(&[record]).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalise `prefix` to the 32-hex shape `kebab_core` newtypes expect.
///
/// Inputs shorter than 32 bytes are right-padded with `'0'`; longer inputs
/// are cut to their first 32 bytes. Lengths are measured in bytes, so the
/// helper is meant for ASCII identifiers.
pub fn id32(prefix: &str) -> String {
    const WIDTH: usize = 32;

    let missing = WIDTH.saturating_sub(prefix.len());
    let mut id = String::with_capacity(prefix.len() + missing);
    id.push_str(prefix);
    id.extend(std::iter::repeat('0').take(missing));
    id.truncate(WIDTH);
    id
}
|
||||
42
crates/kebab-search/tests/fixtures/search/hybrid/run-1.json
vendored
Normal file
42
crates/kebab-search/tests/fixtures/search/hybrid/run-1.json
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
[
|
||||
{
|
||||
"chunk_id": "c1000000000000000000000000000000",
|
||||
"fusion_score_positive": true,
|
||||
"lex_some": true,
|
||||
"lexical_rank": 1,
|
||||
"method": "hybrid",
|
||||
"rank": 1,
|
||||
"vec_some": true,
|
||||
"vector_rank": 3
|
||||
},
|
||||
{
|
||||
"chunk_id": "c2000000000000000000000000000000",
|
||||
"fusion_score_positive": true,
|
||||
"lex_some": true,
|
||||
"lexical_rank": 2,
|
||||
"method": "hybrid",
|
||||
"rank": 2,
|
||||
"vec_some": true,
|
||||
"vector_rank": 2
|
||||
},
|
||||
{
|
||||
"chunk_id": "c4000000000000000000000000000000",
|
||||
"fusion_score_positive": true,
|
||||
"lex_some": false,
|
||||
"lexical_rank": null,
|
||||
"method": "hybrid",
|
||||
"rank": 3,
|
||||
"vec_some": true,
|
||||
"vector_rank": 1
|
||||
},
|
||||
{
|
||||
"chunk_id": "c3000000000000000000000000000000",
|
||||
"fusion_score_positive": true,
|
||||
"lex_some": false,
|
||||
"lexical_rank": null,
|
||||
"method": "hybrid",
|
||||
"rank": 4,
|
||||
"vec_some": true,
|
||||
"vector_rank": 4
|
||||
}
|
||||
]
|
||||
60
crates/kebab-search/tests/fixtures/search/lexical/run-1.json
vendored
Normal file
60
crates/kebab-search/tests/fixtures/search/lexical/run-1.json
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
[
|
||||
{
|
||||
"chunk_id": "c3000000000000000000000000000000",
|
||||
"chunker_version": "v1",
|
||||
"citation": {
|
||||
"end": 8,
|
||||
"kind": "line",
|
||||
"path": "notes/snap.md",
|
||||
"section": "Snap",
|
||||
"start": 7
|
||||
},
|
||||
"doc_id": "d0000000000000000000000000000000",
|
||||
"doc_path": "notes/snap.md",
|
||||
"embedding_model": null,
|
||||
"heading_path": [
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"rank": 1,
|
||||
"retrieval": {
|
||||
"fusion_score": 1.4490997273242101e-6,
|
||||
"lexical_rank": 1,
|
||||
"lexical_score": 1.4490997273242101e-6,
|
||||
"method": "lexical",
|
||||
"vector_rank": null,
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha alpha"
|
||||
},
|
||||
{
|
||||
"chunk_id": "c1000000000000000000000000000000",
|
||||
"chunker_version": "v1",
|
||||
"citation": {
|
||||
"end": 2,
|
||||
"kind": "line",
|
||||
"path": "notes/snap.md",
|
||||
"section": "Snap",
|
||||
"start": 1
|
||||
},
|
||||
"doc_id": "d0000000000000000000000000000000",
|
||||
"doc_path": "notes/snap.md",
|
||||
"embedding_model": null,
|
||||
"heading_path": [
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"rank": 2,
|
||||
"retrieval": {
|
||||
"fusion_score": 9.641424867368187e-7,
|
||||
"lexical_rank": 2,
|
||||
"lexical_score": 9.641424867368187e-7,
|
||||
"method": "lexical",
|
||||
"vector_rank": null,
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha bravo charlie"
|
||||
}
|
||||
]
|
||||
213
crates/kebab-search/tests/hybrid.rs
Normal file
213
crates/kebab-search/tests/hybrid.rs
Normal file
@@ -0,0 +1,213 @@
|
||||
//! Hybrid integration tests — touch a real `LanceVectorStore` +
|
||||
//! `SqliteStore` + `MockEmbedder`. These tests are `#[ignore]`-d and
|
||||
//! AVX-gated; see `tests/common/mod.rs` for the policy rationale.
|
||||
//!
|
||||
//! Mock-retriever unit tests live alongside the implementation in
|
||||
//! `crates/kebab-search/src/hybrid.rs` (no Lance, no AVX needed) — the
|
||||
//! tests here exercise the full plumbing with the real Lance store.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{
|
||||
HybridEnv, id32, require_avx_or_panic, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION,
|
||||
};
|
||||
use kebab_core::{
|
||||
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
||||
};
|
||||
use kebab_search::{FusionPolicy, HybridRetriever};
|
||||
use serde_json::json;
|
||||
|
||||
fn build_hybrid(env: &HybridEnv) -> HybridRetriever {
|
||||
let lex: Arc<dyn Retriever> = Arc::new(env.lexical_retriever());
|
||||
let vec: Arc<dyn Retriever> = Arc::new(env.vector_retriever());
|
||||
HybridRetriever::with_policy(lex, vec, FusionPolicy::Rrf { k_rrf: 60 }, 5)
|
||||
}
|
||||
|
||||
/// Seed a tiny corpus that lets us prove hybrid recall ≥ each side
|
||||
/// independently. Two chunks are lexical-only matches ("rust cargo");
|
||||
/// two chunks are vector-only matches (their text doesn't contain
|
||||
/// the query token but their embedding still scores nearby because
|
||||
/// MockEmbedder's hash distributes over all chunks).
|
||||
fn seed_disjoint_corpus(env: &HybridEnv) -> Vec<String> {
|
||||
// The lexical side will only match chunks that contain the query
|
||||
// tokens. The vector side will rank ALL chunks by embedding
|
||||
// similarity to the query — even ones whose text doesn't share
|
||||
// a token with the query.
|
||||
let chunks = [
|
||||
// (chunk_id, doc_id, path, text, headings)
|
||||
(id32("c1"), id32("d1"), "notes/rust1.md", "rust cargo macros", &["A"][..]),
|
||||
(id32("c2"), id32("d2"), "notes/rust2.md", "rust traits and lifetimes", &["B"][..]),
|
||||
(id32("c3"), id32("d3"), "notes/python.md", "python dataclasses tutorial", &["C"][..]),
|
||||
(id32("c4"), id32("d4"), "notes/go.md", "go interfaces and channels", &["D"][..]),
|
||||
];
|
||||
let mut ids = Vec::new();
|
||||
for (cid, did, path, text, headings) in &chunks {
|
||||
env.seed_chunk(cid, did, path, text, headings, &[]);
|
||||
env.embed_and_upsert(cid, did, text, headings);
|
||||
ids.push(cid.clone());
|
||||
}
|
||||
ids
|
||||
}
|
||||
|
||||
/// A hybrid query for "rust" must return hits tagged as `Hybrid`, each
/// carrying at least one per-mode score, must expose a composite index
/// version, and must include both chunks that contain the query token.
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn hybrid_recall_disjoint_returns_union() {
    require_avx_or_panic();
    let env = HybridEnv::new();
    let _ = seed_disjoint_corpus(&env);
    let retriever = build_hybrid(&env);

    let query = SearchQuery {
        text: String::from("rust"),
        mode: SearchMode::Hybrid,
        k: 4,
        filters: SearchFilters::default(),
    };
    let hits = retriever.search(&query).unwrap();

    // The vector side contributes candidates regardless of text overlap,
    // while the lexical side contributes only the chunks containing "rust".
    assert!(!hits.is_empty(), "hybrid must return at least one hit");

    for hit in &hits {
        let detail = &hit.retrieval;
        assert_eq!(detail.method, SearchMode::Hybrid);
        assert!(
            detail.lexical_score.is_some() || detail.vector_score.is_some(),
            "hybrid hit must carry at least one mode's score"
        );
    }

    // The composite token names the hybrid mode and both underlying labels.
    let composite = retriever.index_version();
    assert!(composite.0.starts_with("hybrid:"));
    for label in [TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION] {
        assert!(composite.0.contains(label));
    }

    // c1 and c2 are the only chunks matching the FTS5 query, so they must
    // be present in the fused result.
    for required in [id32("c1"), id32("c2")] {
        assert!(hits.iter().any(|hit| hit.chunk_id.0 == required));
    }
}
|
||||
|
||||
/// Running the same hybrid query twice against an unchanged corpus must
/// produce equal hit lists.
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn hybrid_determinism_same_query_twice() {
    require_avx_or_panic();
    let env = HybridEnv::new();
    let _ = seed_disjoint_corpus(&env);
    let retriever = build_hybrid(&env);

    let query = SearchQuery {
        text: String::from("rust"),
        mode: SearchMode::Hybrid,
        k: 4,
        filters: SearchFilters::default(),
    };

    let first = retriever.search(&query).unwrap();
    let second = retriever.search(&query).unwrap();
    assert_eq!(
        first, second,
        "identical query must yield byte-identical Vec<SearchHit>"
    );
}
|
||||
|
||||
/// Pin the structural shape of a hybrid search result against the
/// committed fixture `tests/fixtures/search/hybrid/run-1.json`.
///
/// Setting the `KB_UPDATE_SNAPSHOTS` environment variable rewrites the
/// fixture and then fails the test on purpose, so an accidental
/// regeneration in CI cannot go green.
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn hybrid_snapshot_run_1() {
    require_avx_or_panic();
    let env = HybridEnv::new();
    let _ = seed_disjoint_corpus(&env);
    let h = build_hybrid(&env);

    let q = SearchQuery {
        text: "rust".to_string(),
        mode: SearchMode::Hybrid,
        k: 4,
        filters: SearchFilters::default(),
    };
    let hits = h.search(&q).unwrap();

    // The snapshot pins the structural shape only:
    //   - chunk_id ordering and rank
    //   - which side contributed (lexical_rank / vector_rank and whether
    //     each per-mode score is Some/None)
    //   - that fusion_score is positive
    //   - method = Hybrid for every hit
    // Raw scores are left out; their ordering is checked separately at the
    // end of this test.
    let actual = json!(
        hits.iter().map(|h: &SearchHit| json!({
            "chunk_id": h.chunk_id.0,
            "rank": h.rank,
            "method": h.retrieval.method,
            "lexical_rank": h.retrieval.lexical_rank,
            "vector_rank": h.retrieval.vector_rank,
            "lex_some": h.retrieval.lexical_score.is_some(),
            "vec_some": h.retrieval.vector_score.is_some(),
            "fusion_score_positive": h.retrieval.fusion_score > 0.0,
        })).collect::<Vec<_>>()
    );

    let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("tests")
        .join("fixtures")
        .join("search")
        .join("hybrid")
        .join("run-1.json");

    if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() {
        std::fs::create_dir_all(fixture.parent().unwrap()).unwrap();
        std::fs::write(&fixture, serde_json::to_string_pretty(&actual).unwrap()).unwrap();
        eprintln!("[snapshot] regenerated {}", fixture.display());
        // Fail loudly so that accidentally setting KB_UPDATE_SNAPSHOTS
        // in CI surfaces as a test failure rather than a silent
        // overwrite + green run. Same fail-loud-instead-of-silent-pass
        // philosophy as P3-2's `SNAPSHOT_HASH_BASELINE = 0` and P3-3's
        // placeholder fixture guards.
        panic!(
            "[snapshot] regenerated {}, re-run without KB_UPDATE_SNAPSHOTS to verify pin",
            fixture.display()
        );
    }

    let raw = std::fs::read_to_string(&fixture).unwrap_or_else(|_| {
        panic!(
            "missing snapshot fixture at {}; run with \
             KB_UPDATE_SNAPSHOTS=1 to create",
            fixture.display()
        )
    });
    let expected: serde_json::Value = serde_json::from_str(&raw).unwrap();

    // Refuse to silently "pass" against the committed placeholder. The
    // placeholder JSON carries a `_comment` field with regeneration
    // instructions; production fixtures (a captured list) do not.
    if expected.get("_comment").is_some() {
        // The package is named `kebab-search` after the crate rename; the
        // old `-p kb-search` spelling no longer matches any package.
        panic!(
            "snapshot fixture is a placeholder — regenerate on AVX hardware then commit. \
             Path: {}. To regenerate: \
             `KB_UPDATE_SNAPSHOTS=1 cargo test -p kebab-search -- --ignored hybrid_snapshot`.",
            fixture.display()
        );
    }

    assert_eq!(
        actual, expected,
        "hybrid snapshot drift; rerun with KB_UPDATE_SNAPSHOTS=1 to regenerate"
    );

    // Independent guard: fusion scores must be non-increasing across
    // the result list (rrf is rank-biased, so this is the
    // semantically-correct ordering invariant).
    for w in hits.windows(2) {
        assert!(
            w[0].retrieval.fusion_score >= w[1].retrieval.fusion_score,
            "fusion scores not in descending order: {} then {}",
            w[0].retrieval.fusion_score,
            w[1].retrieval.fusion_score
        );
    }
}
|
||||
666
crates/kebab-search/tests/lexical.rs
Normal file
666
crates/kebab-search/tests/lexical.rs
Normal file
@@ -0,0 +1,666 @@
|
||||
//! P2-2 integration tests for `LexicalRetriever`.
|
||||
//!
|
||||
//! Strategy: seed the SQLite store via raw inserts with `foreign_keys =
|
||||
//! OFF` (mirroring the P2-1 FTS tests). This avoids dragging
|
||||
//! `kebab-parse-md` / `kebab-normalize` / `kebab-chunk` into kebab-search's dev-deps,
|
||||
//! which would violate the task's "Allowed deps" list.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
|
||||
use kebab_search::LexicalRetriever;
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
use rusqlite::Connection;
|
||||
use tempfile::TempDir;
|
||||
|
||||
// ── Test scaffolding ─────────────────────────────────────────────────────
|
||||
|
||||
struct Env {
|
||||
_temp: TempDir,
|
||||
store: Arc<SqliteStore>,
|
||||
db_path: std::path::PathBuf,
|
||||
}
|
||||
|
||||
impl Env {
|
||||
fn new() -> Self {
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let mut config = Config::defaults();
|
||||
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
|
||||
let store = SqliteStore::open(&config).expect("open store");
|
||||
store.run_migrations().expect("run migrations");
|
||||
let db_path = temp.path().join("kb.sqlite");
|
||||
Self {
|
||||
_temp: temp,
|
||||
store: Arc::new(store),
|
||||
db_path,
|
||||
}
|
||||
}
|
||||
|
||||
/// Side-channel raw connection with FK enforcement off — same
|
||||
/// trick used by P2-1's FTS tests so we can seed `chunks` /
|
||||
/// `documents` directly without the full ingest graph.
|
||||
fn raw_conn(&self) -> Connection {
|
||||
let conn = Connection::open(&self.db_path).expect("open side conn");
|
||||
conn.pragma_update(None, "foreign_keys", "OFF").unwrap();
|
||||
conn
|
||||
}
|
||||
|
||||
fn retriever(&self) -> LexicalRetriever {
|
||||
LexicalRetriever::new(
|
||||
Arc::clone(&self.store),
|
||||
IndexVersion("v1.0".to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever {
|
||||
LexicalRetriever::with_settings(
|
||||
Arc::clone(&self.store),
|
||||
IndexVersion("v1.0".to_string()),
|
||||
snippet_chars,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Minimal documents row. Many columns are NOT NULL and we don't care
|
||||
/// about their exact values for retrieval tests, so we wedge in
|
||||
/// reasonable defaults.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_document(
|
||||
conn: &Connection,
|
||||
doc_id: &str,
|
||||
workspace_path: &str,
|
||||
title: &str,
|
||||
lang: &str,
|
||||
trust_level: &str,
|
||||
tags: &[&str],
|
||||
) {
|
||||
// assets row first — documents.asset_id has a FK with ON DELETE
|
||||
// RESTRICT but FKs are OFF on this connection. Still we insert a
|
||||
// matching row so JOINs pick it up.
|
||||
let asset_id = format!("{:0>32}", &doc_id[..1.min(doc_id.len())]); // 32-hex-ish
|
||||
let asset_id = format!("{:0>32}", asset_id.chars().take(32).collect::<String>());
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, 'file:///x', ?, '\"markdown\"', 0,
|
||||
'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![asset_id, workspace_path],
|
||||
)
|
||||
.expect("insert asset");
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, 'markdown', ?, 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
'2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![doc_id, asset_id, workspace_path, title, lang, trust_level],
|
||||
)
|
||||
.expect("insert document");
|
||||
|
||||
for tag in tags {
|
||||
conn.execute(
|
||||
"INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)",
|
||||
rusqlite::params![doc_id, tag],
|
||||
)
|
||||
.expect("insert tag");
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn insert_chunk(
|
||||
conn: &Connection,
|
||||
chunk_id: &str,
|
||||
doc_id: &str,
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
section_label: Option<&str>,
|
||||
source_spans_json: &str,
|
||||
chunker_version: &str,
|
||||
) {
|
||||
let heading_json = serde_json::to_string(heading_path).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, 0, ?, 'h', '[]', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![
|
||||
chunk_id,
|
||||
doc_id,
|
||||
text,
|
||||
heading_json,
|
||||
section_label,
|
||||
source_spans_json,
|
||||
chunker_version,
|
||||
],
|
||||
)
|
||||
.expect("insert chunk");
|
||||
}
|
||||
|
||||
/// Normalise a short id to the 32-hex shape kebab_core newtypes expect:
/// right-pad with `'0'` up to 32 bytes, then cut anything beyond 32 bytes.
/// Lengths are measured in bytes, so this is meant for ASCII ids.
fn id32(prefix: &str) -> String {
    const WIDTH: usize = 32;

    let missing = WIDTH.saturating_sub(prefix.len());
    let mut id = String::with_capacity(prefix.len() + missing);
    id.push_str(prefix);
    id.extend(std::iter::repeat('0').take(missing));
    id.truncate(WIDTH);
    id
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Searching a freshly migrated, unseeded store succeeds with no hits.
#[test]
fn lexical_empty_corpus_returns_empty_vec() {
    let env = Env::new();
    let retriever = env.retriever();

    let query = SearchQuery {
        text: String::from("rust"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters::default(),
    };

    let hits = retriever.search(&query).expect("search");
    assert!(hits.is_empty(), "empty corpus must yield empty Vec");
}
|
||||
|
||||
/// Blank-looking queries must yield an empty hit list even when the
/// database contains a row.
#[test]
fn lexical_empty_query_returns_empty_vec_without_db_hit() {
    let env = Env::new();

    // Seed one document and chunk; the side connection closes at the end of
    // this scope, before any search runs.
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d"), "notes/a.md", "A", "en", "primary", &[]);
        insert_chunk(
            &conn,
            &id32("c1"),
            &id32("d"),
            "rust cargo macros",
            &["A"],
            None,
            r#"[{"kind":"line","start":1,"end":3}]"#,
            "v1",
        );
    }

    let retriever = env.retriever();
    for blank in ["", " ", "''"] {
        let query = SearchQuery {
            text: blank.to_owned(),
            mode: SearchMode::Lexical,
            k: 5,
            filters: SearchFilters::default(),
        };
        let hits = retriever.search(&query).unwrap();
        assert!(hits.is_empty(), "query {blank:?} must yield empty Vec");
    }
}
|
||||
|
||||
/// One seeded chunk matching the query yields exactly one hit whose
/// metadata reflects the seeded row and whose citation survives a
/// `to_uri` → `parse` → `to_uri` round trip.
#[test]
fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() {
    let env = Env::new();
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]);
        insert_chunk(
            &conn,
            &id32("c1"),
            &id32("d"),
            "Rust borrow checker enforces ownership.",
            &["Notes"],
            Some("Notes"),
            r#"[{"kind":"line","start":4,"end":4}]"#,
            "v1",
        );
    }

    let retriever = env.retriever();
    let query = SearchQuery {
        text: String::from("borrow"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters::default(),
    };
    let hits = retriever.search(&query).expect("search");
    assert_eq!(hits.len(), 1);

    let hit = &hits[0];
    assert_eq!(hit.rank, 1);
    assert_eq!(hit.doc_path.0, "notes/rust.md");
    assert_eq!(hit.heading_path, vec![String::from("Notes")]);
    assert_eq!(hit.section_label.as_deref(), Some("Notes"));

    let detail = &hit.retrieval;
    assert_eq!(detail.method, SearchMode::Lexical);
    assert_eq!(detail.lexical_rank, Some(1));
    assert!(detail.vector_score.is_none());

    // Round-trip the citation through its URI form (line variant). The
    // reparsed citation has section=None because the URI fragment does not
    // carry it, so compare URIs rather than structs.
    let uri = hit.citation.to_uri();
    let reparsed = kebab_core::Citation::parse(&uri).expect("parse uri");
    assert_eq!(reparsed.to_uri(), uri);
    // Sanity: a Line citation matching the seeded source span.
    assert_eq!(uri, "notes/rust.md#L4");
}
|
||||
|
||||
/// With `snippet_chars = 80`, the returned snippet must not exceed 80
/// characters even when the chunk text is far longer.
#[test]
fn lexical_snippet_length_capped_at_snippet_chars() {
    const CAP: usize = 80;

    let env = Env::new();
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d"), "notes/long.md", "Long", "en", "primary", &[]);

        // Long enough that FTS5 could return a snippet above the cap when
        // given a high word budget; the tight cap below plus the
        // retriever's `trim_snippet` backstop must keep it in bounds.
        let text = "alpha beta gamma delta epsilon ".repeat(50);
        insert_chunk(
            &conn,
            &id32("c1"),
            &id32("d"),
            &text,
            &["Long"],
            None,
            r#"[{"kind":"line","start":1,"end":1}]"#,
            "v1",
        );
    }

    // The retriever clamps and trims any snippet to fit the given bound.
    let retriever = env.retriever_with_snippet_chars(CAP);
    let query = SearchQuery {
        text: String::from("alpha"),
        mode: SearchMode::Lexical,
        k: 1,
        filters: SearchFilters::default(),
    };
    let hits = retriever.search(&query).unwrap();
    assert_eq!(hits.len(), 1);

    let snippet = &hits[0].snippet;
    let length = snippet.chars().count();
    assert!(
        length <= CAP,
        "snippet must be ≤ snippet_chars; got {} chars: {:?}",
        length,
        snippet
    );
}
|
||||
|
||||
/// With `tags_any = ["rust"]`, only the document tagged `rust` may appear,
/// even though both seeded chunks contain the query token.
#[test]
fn lexical_filter_tags_any_excludes_untagged_docs() {
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]);
        insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]);
        insert_chunk(
            &conn,
            &id32("c1"),
            &id32("d1"),
            "ownership and borrow checker",
            &["A"],
            None,
            SPAN,
            "v1",
        );
        insert_chunk(
            &conn,
            &id32("c2"),
            &id32("d2"),
            "borrow semantics in python",
            &["B"],
            None,
            SPAN,
            "v1",
        );
    }

    let filters = SearchFilters {
        tags_any: vec![String::from("rust")],
        ..Default::default()
    };
    let query = SearchQuery {
        text: String::from("borrow"),
        mode: SearchMode::Lexical,
        k: 10,
        filters,
    };

    let hits = env.retriever().search(&query).unwrap();
    assert_eq!(hits.len(), 1, "tags_any=[rust] must exclude python doc");
    assert_eq!(hits[0].doc_path.0, "notes/a.md");
}
|
||||
|
||||
/// `lang` and `trust_min` filters must apply together: three documents all
/// match "alpha", but only the one that is both `en` and trusted at or
/// above `secondary` is expected back.
#[test]
fn lexical_filter_lang_and_trust_min_compose() {
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    let conn = env.raw_conn();

    // (doc id, path, title, lang, trust level)
    let documents = [
        ("d1", "ko/a.md", "A", "ko", "primary"),
        ("d2", "en/b.md", "B", "en", "primary"),
        ("d3", "en/c.md", "C", "en", "generated"),
    ];
    for (doc, path, title, lang, trust) in documents {
        insert_document(&conn, &id32(doc), path, title, lang, trust, &[]);
    }

    // (chunk id, owning doc id, body) — every body contains "alpha".
    let chunks = [
        ("c1", "d1", "검색 키워드 alpha"),
        ("c2", "d2", "alpha bravo"),
        ("c3", "d3", "alpha gamma"),
    ];
    for (chunk, doc, body) in chunks {
        insert_chunk(&conn, &id32(chunk), &id32(doc), body, &[], None, SPAN, "v1");
    }
    drop(conn);

    // lang=en + trust_min=secondary → only d2 (primary ≥ secondary).
    let query = SearchQuery {
        text: String::from("alpha"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters {
            lang: Some(Lang(String::from("en"))),
            trust_min: Some(TrustLevel::Secondary),
            ..SearchFilters::default()
        },
    };
    let retriever = env.retriever();
    let hits = retriever.search(&query).unwrap();

    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].doc_path.0, "en/b.md");
}
|
||||
|
||||
/// A `*` in `path_glob` must not match across a `/`: with identical chunk
/// text under `notes/a.md` and `notes/sub/b.md`, the glob `notes/*.md` is
/// expected to return only the top-level file.
#[test]
fn lexical_filter_path_glob_does_not_cross_slash() {
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    let conn = env.raw_conn();

    // (doc id, path, title)
    for (doc, path, title) in [
        ("d1", "notes/a.md", "A"),
        ("d2", "notes/sub/b.md", "B"),
    ] {
        insert_document(&conn, &id32(doc), path, title, "en", "primary", &[]);
    }

    // Both chunks carry the same text so only the path filter can tell
    // them apart.
    for (chunk, doc) in [("c1", "d1"), ("c2", "d2")] {
        insert_chunk(&conn, &id32(chunk), &id32(doc), "shared keyword", &[], None, SPAN, "v1");
    }
    drop(conn);

    let query = SearchQuery {
        text: String::from("keyword"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters {
            path_glob: Some(String::from("notes/*.md")),
            ..SearchFilters::default()
        },
    };
    let retriever = env.retriever();
    let hits = retriever.search(&query).unwrap();

    let paths: Vec<&str> = hits.iter().map(|hit| hit.doc_path.0.as_str()).collect();
    assert_eq!(paths, vec!["notes/a.md"], "* must not match across `/`");
}
|
||||
|
||||
/// The hit's citation must render as `<path>#L<start>-L<end>` using the
/// chunk's first source span, and that URI must survive a
/// `Citation::parse` → `to_uri` round trip unchanged.
#[test]
fn lexical_citation_round_trip_against_first_source_span() {
    // Two spans; the citation uses the first.
    let spans = r#"[{"kind":"line","start":12,"end":34},{"kind":"line","start":60,"end":61}]"#;

    let env = Env::new();
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d"), "notes/m.md", "M", "en", "primary", &[]);
        insert_chunk(&conn, &id32("c1"), &id32("d"), "echo bravo", &[], None, spans, "v1");
    }

    let query = SearchQuery {
        text: String::from("bravo"),
        mode: SearchMode::Lexical,
        k: 1,
        filters: SearchFilters::default(),
    };
    let retriever = env.retriever();
    let hits = retriever.search(&query).unwrap();
    assert_eq!(hits.len(), 1);

    // Render, check the exact URI, then re-parse and re-render.
    let uri = hits[0].citation.to_uri();
    assert_eq!(uri, "notes/m.md#L12-L34");
    let reparsed = kebab_core::Citation::parse(&uri).unwrap();
    assert_eq!(reparsed.to_uri(), uri);
}
|
||||
|
||||
/// Lexical scores must be normalized: the top hit scores in `(0, 1]`, every
/// hit scores in `[0, 1]`, `lexical_score` equals `fusion_score` for each
/// hit, and the chunk with the most query-term occurrences ranks first.
#[test]
fn lexical_top_score_within_unit_interval_three_chunks() {
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    let conn = env.raw_conn();
    insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);

    // Three chunks of varying relevance to the query 'alpha':
    //   c1 repeats the term three times (expected best),
    //   c2 and c3 each mention it once.
    let corpus = [
        ("c1", "alpha alpha alpha keyword"),
        ("c2", "alpha bravo charlie"),
        ("c3", "bravo charlie alpha"),
    ];
    for (chunk, body) in corpus {
        insert_chunk(&conn, &id32(chunk), &id32("d"), body, &[], None, SPAN, "v1");
    }
    drop(conn);

    let query = SearchQuery {
        text: String::from("alpha"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters::default(),
    };
    let retriever = env.retriever();
    let hits = retriever.search(&query).unwrap();
    assert!(!hits.is_empty(), "must surface at least one hit");

    let top = hits[0].retrieval.fusion_score;
    assert!(
        top > 0.0 && top <= 1.0,
        "top normalized score must be in (0, 1]; got {top}"
    );

    // All scores in [0, 1].
    hits.iter().for_each(|hit| {
        let s = hit.retrieval.fusion_score;
        assert!((0.0..=1.0).contains(&s), "hit score {s} out of [0, 1]");
        // lexical_score and fusion_score equal in lexical-only mode.
        assert_eq!(hit.retrieval.lexical_score, Some(s));
    });

    // bm25 should rank c1 (3 occurrences) above c2 / c3.
    assert!(hits[0].chunk_id.0.starts_with("c1"));
}
|
||||
|
||||
/// Running the same query twice through the same retriever over an
/// unchanged database must produce equal hit vectors.
#[test]
fn lexical_determinism_same_query_twice() {
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    {
        let conn = env.raw_conn();
        insert_document(&conn, &id32("d"), "notes/r.md", "R", "en", "primary", &[]);

        // Four chunks that all match the query term.
        let corpus = [
            ("c1", "alpha alpha"),
            ("c2", "alpha bravo"),
            ("c3", "alpha charlie"),
            ("c4", "alpha delta"),
        ];
        for (chunk, body) in corpus {
            insert_chunk(&conn, &id32(chunk), &id32("d"), body, &[], None, SPAN, "v1");
        }
    }

    let retriever = env.retriever();
    let query = SearchQuery {
        text: String::from("alpha"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters::default(),
    };

    let first = retriever.search(&query).unwrap();
    let second = retriever.search(&query).unwrap();
    assert_eq!(first, second, "same DB + same query must yield identical Vec<SearchHit>");
}
|
||||
|
||||
/// When two chunks score identically, results must be ordered by
/// `chunk_id` ascending and that order must repeat across searches.
#[test]
fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() {
    // Two chunks with byte-identical text + length → identical bm25 scores
    // for any `MATCH` against them. The retriever must fall back to
    // `chunk_id` ordering so the result is stable across runs.
    const SPAN: &str = r#"[{"kind":"line","start":1,"end":1}]"#;

    let env = Env::new();
    let conn = env.raw_conn();
    insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]);

    let cid_a = id32("aaaa");
    let cid_b = id32("bbbb");
    assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id");

    [&cid_a, &cid_b].into_iter().for_each(|cid| {
        insert_chunk(&conn, cid, &id32("d"), "alpha bravo charlie", &[], None, SPAN, "v1");
    });
    drop(conn);

    let retriever = env.retriever();
    let query = SearchQuery {
        text: String::from("alpha"),
        mode: SearchMode::Lexical,
        k: 10,
        filters: SearchFilters::default(),
    };
    let first = retriever.search(&query).unwrap();
    let second = retriever.search(&query).unwrap();
    assert_eq!(first.len(), 2, "both chunks should match");

    // bm25 must be equal for byte-identical chunks; the secondary sort
    // by chunk_id pins the order.
    let (score_0, score_1) = (
        first[0].retrieval.fusion_score,
        first[1].retrieval.fusion_score,
    );
    assert!(
        (score_0 - score_1).abs() < 1e-9,
        "byte-identical chunks must score equally; got {} vs {}",
        score_0,
        score_1
    );

    let (id_0, id_1) = (&first[0].chunk_id.0, &first[1].chunk_id.0);
    assert!(
        id_0 < id_1,
        "tiebreaker must order by chunk_id ascending; got {} then {}",
        id_0,
        id_1
    );

    assert_eq!(first, second, "tiebreaker order must be stable across runs");
}
|
||||
|
||||
/// `index_version()` must report exactly the label the retriever was
/// constructed with.
#[test]
fn lexical_index_version_is_returned_unchanged() {
    let env = Env::new();

    let store = Arc::clone(&env.store);
    let version = IndexVersion("custom-label-1".to_string());
    let retriever = LexicalRetriever::new(store, version);

    assert_eq!(retriever.index_version().0, "custom-label-1");
}
|
||||
|
||||
/// Pinned snapshot of the serialized hits for a fixed query over a small,
/// fixed corpus.
///
/// The JSON form of the hits is compared verbatim against
/// `tests/fixtures/search/lexical/run-1.json`. Setting the
/// `KB_UPDATE_SNAPSHOTS` environment variable (to any value) rewrites that
/// file from the current output before the comparison runs.
#[test]
fn lexical_snapshot_run_1() {
    // Update both the code and the fixture in the same commit when
    // intentional changes ship.
    //
    // Stable because rusqlite ships bundled SQLite — a tokenizer/bm25
    // algorithm change in a future SQLite bump will require regenerating
    // run-1.json via `KB_UPDATE_SNAPSHOTS=1`.
    let env = Env::new();
    let conn = env.raw_conn();
    insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]);
    // (chunk id, body, source spans JSON)
    for (cid, body, span) in [
        (
            "c1",
            "alpha bravo charlie",
            r#"[{"kind":"line","start":1,"end":2}]"#,
        ),
        (
            "c2",
            "bravo only here",
            r#"[{"kind":"line","start":4,"end":5}]"#,
        ),
        (
            "c3",
            "alpha alpha",
            r#"[{"kind":"line","start":7,"end":8}]"#,
        ),
    ] {
        insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1");
    }
    drop(conn);

    let r = env.retriever();
    let hits = r
        .search(&SearchQuery {
            text: "alpha".to_string(),
            mode: SearchMode::Lexical,
            k: 10,
            filters: SearchFilters::default(),
        })
        .unwrap();
    let actual = serde_json::to_value(&hits).unwrap();

    let baseline_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("tests/fixtures/search/lexical/run-1.json");
    if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() {
        // Seeding must also work on a checkout where the fixture directory
        // does not exist yet; otherwise the write below fails with a bare
        // "No such file or directory" and no hint about the cause.
        if let Some(dir) = baseline_path.parent() {
            std::fs::create_dir_all(dir).expect("failed to create snapshot fixture directory");
        }
        std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap())
            .expect("failed to write baseline snapshot");
    }
    let baseline_text = std::fs::read_to_string(&baseline_path)
        .expect("baseline snapshot must exist; run with KB_UPDATE_SNAPSHOTS=1 to seed");
    let expected: serde_json::Value = serde_json::from_str(&baseline_text).unwrap();
    assert_eq!(actual, expected, "lexical run-1 snapshot drift");
}
|
||||
Reference in New Issue
Block a user