From f7e2072d6693151683e7e67f42a32bf445dccf0a Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 13:21:33 +0900
Subject: [PATCH] test(cli): integration tests for --trace + schema breakdowns
(fb-37)
Also fixes App::search_with_opts trace branch to use NoopRetriever
for SearchMode::Lexical, removing the embeddings requirement when
the user only wants lexical-mode trace.
---
crates/kebab-app/src/app.rs | 86 +++++++++++--------
.../kebab-cli/tests/wire_schema_breakdowns.rs | 57 ++++++++++++
crates/kebab-cli/tests/wire_search_trace.rs | 58 +++++++++++++
3 files changed, 166 insertions(+), 35 deletions(-)
create mode 100644 crates/kebab-cli/tests/wire_schema_breakdowns.rs
create mode 100644 crates/kebab-cli/tests/wire_search_trace.rs
diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs
index 7895459..a3d2c07 100644
--- a/crates/kebab-app/src/app.rs
+++ b/crates/kebab-app/src/app.rs
@@ -347,26 +347,36 @@ impl App {
// p9-fb-37: when --trace is requested, bypass the LRU cache and
// run through `HybridRetriever::search_with_trace`, which
- // dispatches by mode internally. This requires embeddings (same
- // as `--mode hybrid`); `require_embeddings()` surfaces the
- // existing "switch to --mode lexical" error otherwise.
+ // dispatches by mode internally. Vector / hybrid modes require
+ // embeddings (same as `--mode hybrid`); lexical mode skips
+ // embedder construction via `NoopRetriever` so lexical-only
+ // workspaces (provider = "none") can use `--trace` without
+ // surfacing the "switch to --mode lexical" error.
if opts.trace {
let lex = Arc::new(LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
)) as Arc<dyn Retriever>;
- let (emb, vec_store) = self.require_embeddings()?;
- let vec_iv = vector_index_version(emb.as_ref());
- let vec_dyn: Arc = vec_store;
- let emb_dyn: Arc = emb;
- let vec_retr = Arc::new(VectorRetriever::with_settings(
- vec_dyn,
- emb_dyn,
- self.sqlite.clone(),
- vec_iv,
- self.config.search.snippet_chars,
- )) as Arc<dyn Retriever>;
+ let vec_retr: Arc<dyn Retriever> = if matches!(query.mode, SearchMode::Lexical) {
+ // `HybridRetriever::search_with_trace` never invokes the
+ // vector retriever for `SearchMode::Lexical` (Task 4).
+ // A no-op stand-in lets us avoid the ~470 MB embedder
+ // load when the user only asked for lexical trace.
+ Arc::new(NoopRetriever)
+ } else {
+ let (emb, vec_store) = self.require_embeddings()?;
+ let vec_iv = vector_index_version(emb.as_ref());
+ let vec_dyn: Arc = vec_store;
+ let emb_dyn: Arc = emb;
+ Arc::new(VectorRetriever::with_settings(
+ vec_dyn,
+ emb_dyn,
+ self.sqlite.clone(),
+ vec_iv,
+ self.config.search.snippet_chars,
+ )) as Arc<dyn Retriever>
+ };
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
let (mut traced_hits, trace) = hybrid.search_with_trace(&fetch_query)?;
@@ -800,6 +810,24 @@ fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
}
+/// p9-fb-37: stand-in for the vector retriever in the trace path when
+/// `query.mode == SearchMode::Lexical`. `HybridRetriever::search_with_trace`'s
+/// Lexical branch never calls `vector.search()`, so returning an empty
+/// hit list here is safe and lets lexical-only workspaces (embedding
+/// `provider = "none"`) use `--trace` without paying the ~470 MB
+/// embedder load.
+struct NoopRetriever;
+
+impl Retriever for NoopRetriever {
+ fn search(&self, _q: &kebab_core::SearchQuery) -> anyhow::Result> {
+ Ok(Vec::new())
+ }
+
+ fn index_version(&self) -> kebab_core::IndexVersion {
+ kebab_core::IndexVersion("noop:trace".into())
+ }
+}
+
/// Compose a stable `IndexVersion` for the vector retriever. Tracks
/// `(embedding_model, embedding_version, dimensions)` so a model swap
/// flags drift via the existing index_version mismatch warning in
@@ -944,12 +972,11 @@ mod tests_trace {
#[test]
fn search_response_trace_some_when_opts_trace_true_lexical_mode() {
// Lexical mode doesn't require embeddings — the trace path
- // builds HybridRetriever which holds both retrievers, but
- // for SearchMode::Lexical only the lexical side is invoked.
- // require_embeddings will fail if no embedding provider is
- // configured. Default Config has provider = "none" so this
- // test will fail unless we tolerate that. Skip the assertion
- // if the call returns the embedding-disabled error.
+ // builds HybridRetriever with a `NoopRetriever` stand-in for
+ // the vector side, since `HybridRetriever::search_with_trace`'s
+ // Lexical branch never invokes `vector.search()`. Default
+ // Config has embedding `provider = "none"`, and lexical-mode
+ // trace must succeed under that config (no embedder load).
let (_dir, app) = open_app_with_temp_dir();
let q = SearchQuery {
text: "x".into(),
@@ -961,20 +988,9 @@ mod tests_trace {
trace: true,
..Default::default()
};
- match app.search_with_opts(q, opts) {
- Ok(resp) => {
- assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
- }
- Err(e) => {
- // Acceptable in test environment without embeddings —
- // verify the error is the expected embedding-disabled
- // shape, not an unrelated panic.
- let msg = format!("{e:#}");
- assert!(
- msg.contains("embedding") || msg.contains("--mode lexical"),
- "unexpected error: {msg}"
- );
- }
- }
+ let resp = app
+ .search_with_opts(q, opts)
+ .expect("lexical-mode trace must succeed without embeddings");
+ assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
}
}
diff --git a/crates/kebab-cli/tests/wire_schema_breakdowns.rs b/crates/kebab-cli/tests/wire_schema_breakdowns.rs
new file mode 100644
index 0000000..5696cd2
--- /dev/null
+++ b/crates/kebab-cli/tests/wire_schema_breakdowns.rs
@@ -0,0 +1,57 @@
+//! p9-fb-37: integration tests for `kebab schema --json` extended stats.
+
+mod common;
+
+use serde_json::Value;
+use std::fs;
+use std::process::Command;
+
+fn run_schema(cfg: &std::path::Path) -> Value {
+ let bin = env!("CARGO_BIN_EXE_kebab");
+ let out = Command::new(bin)
+ .args(["--config", cfg.to_str().unwrap(), "schema", "--json"])
+ .output()
+ .expect("run kebab schema");
+ assert!(
+ out.status.success(),
+ "schema failed: stderr={}",
+ String::from_utf8_lossy(&out.stderr)
+ );
+ serde_json::from_slice(&out.stdout).expect("valid JSON")
+}
+
+#[test]
+fn schema_stats_includes_breakdowns_on_fresh_corpus() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
+ // Run a no-op ingest to bring up migrations + create the SQLite file.
+ fs::write(workspace.join("placeholder.md"), "# placeholder\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let v = run_schema(&cfg);
+ let stats = &v["stats"];
+ let m = stats["media_breakdown"].as_object().unwrap();
+ assert_eq!(m.len(), 5, "5 media keys padded");
+ for k in &["markdown", "pdf", "image", "audio", "other"] {
+ assert!(m[*k].is_number(), "media[{k}] is integer");
+ }
+ assert!(stats["lang_breakdown"].is_object());
+ assert!(stats["index_bytes"]["sqlite"].is_number());
+ assert!(stats["index_bytes"]["lancedb"].is_number());
+ assert!(stats["stale_doc_count"].is_number());
+}
+
+#[test]
+fn schema_stats_breakdowns_after_ingest() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
+ fs::write(workspace.join("a.md"), "---\nlang: en\n---\nhello\n").unwrap();
+ fs::write(workspace.join("b.md"), "---\nlang: ko\n---\n안녕\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let v = run_schema(&cfg);
+ let stats = &v["stats"];
+ assert_eq!(stats["media_breakdown"]["markdown"], 2);
+ assert!(stats["lang_breakdown"].is_object());
+ assert!(stats["index_bytes"]["sqlite"].as_u64().unwrap() > 0);
+}
diff --git a/crates/kebab-cli/tests/wire_search_trace.rs b/crates/kebab-cli/tests/wire_search_trace.rs
new file mode 100644
index 0000000..4b8daff
--- /dev/null
+++ b/crates/kebab-cli/tests/wire_search_trace.rs
@@ -0,0 +1,58 @@
+//! p9-fb-37: integration tests for `kebab search --trace --json`.
+
+mod common;
+
+use serde_json::Value;
+use std::fs;
+
+#[test]
+fn search_trace_json_includes_trace_block() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
+ fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let (stdout, _stderr) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--trace", "--json", "rust"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
+ assert_eq!(v["schema_version"], "search_response.v1");
+ assert!(v["trace"].is_object(), "trace block present");
+ assert!(v["trace"]["timing"].is_object());
+ assert!(v["trace"]["timing"]["total_ms"].is_number());
+ assert!(v["trace"]["lexical"].is_array());
+ assert!(v["trace"]["vector"].is_array());
+ assert!(v["trace"]["rrf_inputs"].is_array());
+}
+
+#[test]
+fn search_without_trace_omits_trace_field() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
+ fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let (stdout, _stderr) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--json", "rust"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
+ assert!(v.get("trace").is_none(), "trace field absent without --trace");
+}
+
+#[test]
+fn search_trace_lexical_mode_vector_list_empty() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 0);
+ fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let (stdout, _stderr) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--trace", "--json", "rust"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
+ assert_eq!(v["trace"]["vector"].as_array().unwrap().len(), 0);
+ assert_eq!(v["trace"]["timing"]["vector_ms"], 0);
+}