From f7e2072d6693151683e7e67f42a32bf445dccf0a Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sun, 10 May 2026 13:21:33 +0900 Subject: [PATCH] test(cli): integration tests for --trace + schema breakdowns (fb-37) Also fixes App::search_with_opts trace branch to use NoopRetriever for SearchMode::Lexical, removing the embeddings requirement when the user only wants lexical-mode trace. --- crates/kebab-app/src/app.rs | 86 +++++++++++-------- .../kebab-cli/tests/wire_schema_breakdowns.rs | 57 ++++++++++++ crates/kebab-cli/tests/wire_search_trace.rs | 58 +++++++++++++ 3 files changed, 166 insertions(+), 35 deletions(-) create mode 100644 crates/kebab-cli/tests/wire_schema_breakdowns.rs create mode 100644 crates/kebab-cli/tests/wire_search_trace.rs diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 7895459..a3d2c07 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -347,26 +347,36 @@ impl App { // p9-fb-37: when --trace is requested, bypass the LRU cache and // run through `HybridRetriever::search_with_trace`, which - // dispatches by mode internally. This requires embeddings (same - // as `--mode hybrid`); `require_embeddings()` surfaces the - // existing "switch to --mode lexical" error otherwise. + // dispatches by mode internally. Vector / hybrid modes require + // embeddings (same as `--mode hybrid`); lexical mode skips + // embedder construction via `NoopRetriever` so lexical-only + // workspaces (provider = "none") can use `--trace` without + // surfacing the "switch to --mode lexical" error. 
if opts.trace { let lex = Arc::new(LexicalRetriever::with_settings( self.sqlite.clone(), lexical_index_version(&self.config), self.config.search.snippet_chars, )) as Arc; - let (emb, vec_store) = self.require_embeddings()?; - let vec_iv = vector_index_version(emb.as_ref()); - let vec_dyn: Arc = vec_store; - let emb_dyn: Arc = emb; - let vec_retr = Arc::new(VectorRetriever::with_settings( - vec_dyn, - emb_dyn, - self.sqlite.clone(), - vec_iv, - self.config.search.snippet_chars, - )) as Arc; + let vec_retr: Arc = if matches!(query.mode, SearchMode::Lexical) { + // `HybridRetriever::search_with_trace` never invokes the + // vector retriever for `SearchMode::Lexical` (Task 4). + // A no-op stand-in lets us avoid the ~470 MB embedder + // load when the user only asked for lexical trace. + Arc::new(NoopRetriever) + } else { + let (emb, vec_store) = self.require_embeddings()?; + let vec_iv = vector_index_version(emb.as_ref()); + let vec_dyn: Arc = vec_store; + let emb_dyn: Arc = emb; + Arc::new(VectorRetriever::with_settings( + vec_dyn, + emb_dyn, + self.sqlite.clone(), + vec_iv, + self.config.search.snippet_chars, + )) as Arc + }; let hybrid = HybridRetriever::new(&self.config, lex, vec_retr); let (mut traced_hits, trace) = hybrid.search_with_trace(&fetch_query)?; @@ -800,6 +810,24 @@ fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion { IndexVersion(format!("lex:{}", config.chunking.chunker_version)) } +/// p9-fb-37: stand-in for the vector retriever in the trace path when +/// `query.mode == SearchMode::Lexical`. `HybridRetriever::search_with_trace`'s +/// Lexical branch never calls `vector.search()`, so returning an empty +/// hit list here is safe and lets lexical-only workspaces (embedding +/// `provider = "none"`) use `--trace` without paying the ~470 MB +/// embedder load. 
+struct NoopRetriever; + +impl Retriever for NoopRetriever { + fn search(&self, _q: &kebab_core::SearchQuery) -> anyhow::Result> { + Ok(Vec::new()) + } + + fn index_version(&self) -> kebab_core::IndexVersion { + kebab_core::IndexVersion("noop:trace".into()) + } +} + /// Compose a stable `IndexVersion` for the vector retriever. Tracks /// `(embedding_model, embedding_version, dimensions)` so a model swap /// flags drift via the existing index_version mismatch warning in @@ -944,12 +972,11 @@ mod tests_trace { #[test] fn search_response_trace_some_when_opts_trace_true_lexical_mode() { // Lexical mode doesn't require embeddings — the trace path - // builds HybridRetriever which holds both retrievers, but - // for SearchMode::Lexical only the lexical side is invoked. - // require_embeddings will fail if no embedding provider is - // configured. Default Config has provider = "none" so this - // test will fail unless we tolerate that. Skip the assertion - // if the call returns the embedding-disabled error. + // builds HybridRetriever with a `NoopRetriever` stand-in for + // the vector side, since `HybridRetriever::search_with_trace`'s + // Lexical branch never invokes `vector.search()`. Default + // Config has embedding `provider = "none"`, and lexical-mode + // trace must succeed under that config (no embedder load). let (_dir, app) = open_app_with_temp_dir(); let q = SearchQuery { text: "x".into(), @@ -961,20 +988,9 @@ mod tests_trace { trace: true, ..Default::default() }; - match app.search_with_opts(q, opts) { - Ok(resp) => { - assert!(resp.trace.is_some(), "trace populated when opts.trace=true"); - } - Err(e) => { - // Acceptable in test environment without embeddings — - // verify the error is the expected embedding-disabled - // shape, not an unrelated panic. 
- let msg = format!("{e:#}"); - assert!( - msg.contains("embedding") || msg.contains("--mode lexical"), - "unexpected error: {msg}" - ); - } - } + let resp = app + .search_with_opts(q, opts) + .expect("lexical-mode trace must succeed without embeddings"); + assert!(resp.trace.is_some(), "trace populated when opts.trace=true"); } } diff --git a/crates/kebab-cli/tests/wire_schema_breakdowns.rs b/crates/kebab-cli/tests/wire_schema_breakdowns.rs new file mode 100644 index 0000000..5696cd2 --- /dev/null +++ b/crates/kebab-cli/tests/wire_schema_breakdowns.rs @@ -0,0 +1,57 @@ +//! p9-fb-37: integration tests for `kebab schema --json` extended stats. + +mod common; + +use serde_json::Value; +use std::fs; +use std::process::Command; + +fn run_schema(cfg: &std::path::Path) -> Value { + let bin = env!("CARGO_BIN_EXE_kebab"); + let out = Command::new(bin) + .args(["--config", cfg.to_str().unwrap(), "schema", "--json"]) + .output() + .expect("run kebab schema"); + assert!( + out.status.success(), + "schema failed: stderr={}", + String::from_utf8_lossy(&out.stderr) + ); + serde_json::from_slice(&out.stdout).expect("valid JSON") +} + +#[test] +fn schema_stats_includes_breakdowns_on_fresh_corpus() { + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 0); + // Run a no-op ingest to bring up migrations + create the SQLite file. 
+ fs::write(workspace.join("placeholder.md"), "# placeholder\n").unwrap(); + common::ingest(&cfg, &workspace); + + let v = run_schema(&cfg); + let stats = &v["stats"]; + let m = stats["media_breakdown"].as_object().unwrap(); + assert_eq!(m.len(), 5, "5 media keys padded"); + for k in &["markdown", "pdf", "image", "audio", "other"] { + assert!(m[*k].is_number(), "media[{k}] is integer"); + } + assert!(stats["lang_breakdown"].is_object()); + assert!(stats["index_bytes"]["sqlite"].is_number()); + assert!(stats["index_bytes"]["lancedb"].is_number()); + assert!(stats["stale_doc_count"].is_number()); +} + +#[test] +fn schema_stats_breakdowns_after_ingest() { + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 0); + fs::write(workspace.join("a.md"), "---\nlang: en\n---\nhello\n").unwrap(); + fs::write(workspace.join("b.md"), "---\nlang: ko\n---\n안녕\n").unwrap(); + common::ingest(&cfg, &workspace); + + let v = run_schema(&cfg); + let stats = &v["stats"]; + assert_eq!(stats["media_breakdown"]["markdown"], 2); + assert!(stats["lang_breakdown"].is_object()); + assert!(stats["index_bytes"]["sqlite"].as_u64().unwrap() > 0); +} diff --git a/crates/kebab-cli/tests/wire_search_trace.rs b/crates/kebab-cli/tests/wire_search_trace.rs new file mode 100644 index 0000000..4b8daff --- /dev/null +++ b/crates/kebab-cli/tests/wire_search_trace.rs @@ -0,0 +1,58 @@ +//! p9-fb-37: integration tests for `kebab search --trace --json`. 
+ +mod common; + +use serde_json::Value; +use std::fs; + +#[test] +fn search_trace_json_includes_trace_block() { + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 0); + fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); + common::ingest(&cfg, &workspace); + + let (stdout, _stderr) = common::run_search_with_args( + &cfg, + &["--mode", "lexical", "--trace", "--json", "rust"], + ); + let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["schema_version"], "search_response.v1"); + assert!(v["trace"].is_object(), "trace block present"); + assert!(v["trace"]["timing"].is_object()); + assert!(v["trace"]["timing"]["total_ms"].is_number()); + assert!(v["trace"]["lexical"].is_array()); + assert!(v["trace"]["vector"].is_array()); + assert!(v["trace"]["rrf_inputs"].is_array()); +} + +#[test] +fn search_without_trace_omits_trace_field() { + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 0); + fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); + common::ingest(&cfg, &workspace); + + let (stdout, _stderr) = common::run_search_with_args( + &cfg, + &["--mode", "lexical", "--json", "rust"], + ); + let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert!(v.get("trace").is_none(), "trace field absent without --trace"); +} + +#[test] +fn search_trace_lexical_mode_vector_list_empty() { + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 0); + fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); + common::ingest(&cfg, &workspace); + + let (stdout, _stderr) = common::run_search_with_args( + &cfg, + &["--mode", "lexical", "--trace", "--json", "rust"], + ); + let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + 
assert_eq!(v["trace"]["vector"].as_array().unwrap().len(), 0); + assert_eq!(v["trace"]["timing"]["vector_ms"], 0); +}