test(cli): integration tests for --trace + schema breakdowns (fb-37)

Also fixes App::search_with_opts trace branch to use NoopRetriever
for SearchMode::Lexical, removing the embeddings requirement when
the user only wants lexical-mode trace.
This commit is contained in:
th-kim0823
2026-05-10 13:21:33 +09:00
parent 72c227af23
commit f7e2072d66
3 changed files with 166 additions and 35 deletions

View File

@@ -347,26 +347,36 @@ impl App {
// p9-fb-37: when --trace is requested, bypass the LRU cache and
// run through `HybridRetriever::search_with_trace`, which
// dispatches by mode internally. This requires embeddings (same
// as `--mode hybrid`); `require_embeddings()` surfaces the
// existing "switch to --mode lexical" error otherwise.
// dispatches by mode internally. Vector / hybrid modes require
// embeddings (same as `--mode hybrid`); lexical mode skips
// embedder construction via `NoopRetriever` so lexical-only
// workspaces (provider = "none") can use `--trace` without
// surfacing the "switch to --mode lexical" error.
if opts.trace {
let lex = Arc::new(LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
let vec_retr = Arc::new(VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
let vec_retr: Arc<dyn Retriever> = if matches!(query.mode, SearchMode::Lexical) {
// `HybridRetriever::search_with_trace` never invokes the
// vector retriever for `SearchMode::Lexical` (Task 4).
// A no-op stand-in lets us avoid the ~470 MB embedder
// load when the user only asked for lexical trace.
Arc::new(NoopRetriever)
} else {
let (emb, vec_store) = self.require_embeddings()?;
let vec_iv = vector_index_version(emb.as_ref());
let vec_dyn: Arc<dyn VectorStore + Send + Sync> = vec_store;
let emb_dyn: Arc<dyn Embedder> = emb;
Arc::new(VectorRetriever::with_settings(
vec_dyn,
emb_dyn,
self.sqlite.clone(),
vec_iv,
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>
};
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
let (mut traced_hits, trace) = hybrid.search_with_trace(&fetch_query)?;
@@ -800,6 +810,24 @@ fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
}
/// p9-fb-37: stand-in for the vector retriever in the trace path when
/// `query.mode == SearchMode::Lexical`. `HybridRetriever::search_with_trace`'s
/// Lexical branch never calls `vector.search()`, so returning an empty
/// hit list here is safe and lets lexical-only workspaces (embedding
/// `provider = "none"`) use `--trace` without paying the ~470 MB
/// embedder load.
struct NoopRetriever;
impl Retriever for NoopRetriever {
fn search(&self, _q: &kebab_core::SearchQuery) -> anyhow::Result<Vec<kebab_core::SearchHit>> {
Ok(Vec::new())
}
fn index_version(&self) -> kebab_core::IndexVersion {
kebab_core::IndexVersion("noop:trace".into())
}
}
/// Compose a stable `IndexVersion` for the vector retriever. Tracks
/// `(embedding_model, embedding_version, dimensions)` so a model swap
/// flags drift via the existing index_version mismatch warning in
@@ -944,12 +972,11 @@ mod tests_trace {
#[test]
fn search_response_trace_some_when_opts_trace_true_lexical_mode() {
// Lexical mode doesn't require embeddings — the trace path
// builds HybridRetriever which holds both retrievers, but
// for SearchMode::Lexical only the lexical side is invoked.
// require_embeddings will fail if no embedding provider is
// configured. Default Config has provider = "none" so this
// test will fail unless we tolerate that. Skip the assertion
// if the call returns the embedding-disabled error.
// builds HybridRetriever with a `NoopRetriever` stand-in for
// the vector side, since `HybridRetriever::search_with_trace`'s
// Lexical branch never invokes `vector.search()`. Default
// Config has embedding `provider = "none"`, and lexical-mode
// trace must succeed under that config (no embedder load).
let (_dir, app) = open_app_with_temp_dir();
let q = SearchQuery {
text: "x".into(),
@@ -961,20 +988,9 @@ mod tests_trace {
trace: true,
..Default::default()
};
match app.search_with_opts(q, opts) {
Ok(resp) => {
assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
}
Err(e) => {
// Acceptable in test environment without embeddings —
// verify the error is the expected embedding-disabled
// shape, not an unrelated panic.
let msg = format!("{e:#}");
assert!(
msg.contains("embedding") || msg.contains("--mode lexical"),
"unexpected error: {msg}"
);
}
}
let resp = app
.search_with_opts(q, opts)
.expect("lexical-mode trace must succeed without embeddings");
assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
}
}

View File

@@ -0,0 +1,57 @@
//! p9-fb-37: integration tests for `kebab schema --json` extended stats.
mod common;
use serde_json::Value;
use std::fs;
use std::process::Command;
fn run_schema(cfg: &std::path::Path) -> Value {
let bin = env!("CARGO_BIN_EXE_kebab");
let out = Command::new(bin)
.args(["--config", cfg.to_str().unwrap(), "schema", "--json"])
.output()
.expect("run kebab schema");
assert!(
out.status.success(),
"schema failed: stderr={}",
String::from_utf8_lossy(&out.stderr)
);
serde_json::from_slice(&out.stdout).expect("valid JSON")
}
#[test]
fn schema_stats_includes_breakdowns_on_fresh_corpus() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
// Run a no-op ingest to bring up migrations + create the SQLite file.
fs::write(workspace.join("placeholder.md"), "# placeholder\n").unwrap();
common::ingest(&cfg, &workspace);
let v = run_schema(&cfg);
let stats = &v["stats"];
let m = stats["media_breakdown"].as_object().unwrap();
assert_eq!(m.len(), 5, "5 media keys padded");
for k in &["markdown", "pdf", "image", "audio", "other"] {
assert!(m[*k].is_number(), "media[{k}] is integer");
}
assert!(stats["lang_breakdown"].is_object());
assert!(stats["index_bytes"]["sqlite"].is_number());
assert!(stats["index_bytes"]["lancedb"].is_number());
assert!(stats["stale_doc_count"].is_number());
}
#[test]
fn schema_stats_breakdowns_after_ingest() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
fs::write(workspace.join("a.md"), "---\nlang: en\n---\nhello\n").unwrap();
fs::write(workspace.join("b.md"), "---\nlang: ko\n---\n안녕\n").unwrap();
common::ingest(&cfg, &workspace);
let v = run_schema(&cfg);
let stats = &v["stats"];
assert_eq!(stats["media_breakdown"]["markdown"], 2);
assert!(stats["lang_breakdown"].is_object());
assert!(stats["index_bytes"]["sqlite"].as_u64().unwrap() > 0);
}

View File

@@ -0,0 +1,58 @@
//! p9-fb-37: integration tests for `kebab search --trace --json`.
mod common;
use serde_json::Value;
use std::fs;
#[test]
fn search_trace_json_includes_trace_block() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--trace", "--json", "rust"],
);
let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
assert_eq!(v["schema_version"], "search_response.v1");
assert!(v["trace"].is_object(), "trace block present");
assert!(v["trace"]["timing"].is_object());
assert!(v["trace"]["timing"]["total_ms"].is_number());
assert!(v["trace"]["lexical"].is_array());
assert!(v["trace"]["vector"].is_array());
assert!(v["trace"]["rrf_inputs"].is_array());
}
#[test]
fn search_without_trace_omits_trace_field() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--json", "rust"],
);
let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
assert!(v.get("trace").is_none(), "trace field absent without --trace");
}
#[test]
fn search_trace_lexical_mode_vector_list_empty() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--trace", "--json", "rust"],
);
let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
assert_eq!(v["trace"]["vector"].as_array().unwrap().len(), 0);
assert_eq!(v["trace"]["timing"]["vector_ms"], 0);
}