feat(p10-1a-1): CLI filter flags + SchemaStats breakdowns + regression tests
Task 13: add wire regression tests proving markdown SearchHit omits repo/code_lang when None, and all 5 original Citation variants serialize byte-identically without spurious Code-variant keys. Task 15: add --repo (repeatable) and --code-lang (repeatable, comma-separated) flags to `kebab search`; propagate both into SearchFilters instead of the previous vec![] stub. Add #[allow(clippy::large_enum_variant)] — Cmd is short-lived, boxing buys nothing. Task 16: add code_lang_breakdown and repo_breakdown BTreeMap fields to Stats (schema.v1); derive Default on Stats; populate both as empty in collect_stats (1A-2 fills them when code chunks land). Add unit test asserting both keys are present in the serialized object. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -45,7 +45,7 @@ pub struct Models {
|
||||
pub corpus_revision: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
pub doc_count: u64,
|
||||
pub chunk_count: u64,
|
||||
@@ -63,6 +63,14 @@ pub struct Stats {
|
||||
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
|
||||
#[serde(default)]
|
||||
pub stale_doc_count: u64,
|
||||
/// p10-1A-1: code language breakdown (chunk counts by canonical lowercase
|
||||
/// language identifier). Empty until 1A-2 produces code chunks.
|
||||
#[serde(default)]
|
||||
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// p10-1A-1: repo breakdown (chunk counts by `metadata.repo` value).
|
||||
/// Empty until 1A-2 produces code chunks.
|
||||
#[serde(default)]
|
||||
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
}
|
||||
|
||||
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
@@ -158,6 +166,9 @@ fn collect_stats(
|
||||
lang_breakdown: counts.lang_breakdown,
|
||||
index_bytes,
|
||||
stale_doc_count: counts.stale_doc_count,
|
||||
// p10-1A-1: populated by 1A-2 code ingest; empty until then.
|
||||
code_lang_breakdown: std::collections::BTreeMap::new(),
|
||||
repo_breakdown: std::collections::BTreeMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -182,6 +193,32 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
|
||||
mod tests_stats_ext {
|
||||
use super::*;
|
||||
|
||||
/// p10-1A-1: Stats must serialize `code_lang_breakdown` and
|
||||
/// `repo_breakdown` so downstream consumers (MCP skill, Claude Code)
|
||||
/// can branch on their presence.
|
||||
#[test]
|
||||
fn stats_includes_code_lang_and_repo_breakdown_fields() {
|
||||
let stats = Stats::default();
|
||||
let v = serde_json::to_value(&stats).unwrap();
|
||||
assert!(
|
||||
v.get("code_lang_breakdown").is_some(),
|
||||
"Stats JSON must include code_lang_breakdown: {v}"
|
||||
);
|
||||
assert!(
|
||||
v.get("repo_breakdown").is_some(),
|
||||
"Stats JSON must include repo_breakdown: {v}"
|
||||
);
|
||||
// Empty BTreeMap serializes as `{}` — confirm it's an object, not null.
|
||||
assert!(
|
||||
v["code_lang_breakdown"].is_object(),
|
||||
"code_lang_breakdown must be an object: {v}"
|
||||
);
|
||||
assert!(
|
||||
v["repo_breakdown"].is_object(),
|
||||
"repo_breakdown must be an object: {v}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
|
||||
@@ -46,6 +46,11 @@ struct Cli {
|
||||
command: Cmd,
|
||||
}
|
||||
|
||||
// p10-1A-1: adding `repo` and `code_lang` Vec<String> fields pushed `Cmd`
|
||||
// over clippy's large_enum_variant threshold. The enum is short-lived
|
||||
// (parsed once at startup, never cloned in a hot path) — boxing would add
|
||||
// noise with no real benefit.
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Cmd {
|
||||
/// Initialise XDG dirs + workspace + `config.toml`.
|
||||
@@ -165,6 +170,18 @@ enum Cmd {
|
||||
#[arg(long)]
|
||||
doc_id: Option<String>,
|
||||
|
||||
/// p10-1A-1: filter by repo name (`metadata.repo`). Repeatable;
|
||||
/// multi-value = OR. Empty = no filter (all repos returned).
|
||||
#[arg(long = "repo", value_name = "NAME", num_args = 1)]
|
||||
repo: Vec<String>,
|
||||
|
||||
/// p10-1A-1: filter by code language identifier (lowercase
|
||||
/// canonical). Repeatable or comma-separated.
|
||||
/// Examples: `rust`, `python`, `typescript`.
|
||||
/// Unknown values produce empty hits.
|
||||
#[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')]
|
||||
code_lang: Vec<String>,
|
||||
|
||||
/// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
|
||||
/// lists + per-stage timing in the response. Bypasses cache
|
||||
/// (debug intent — fresh run guaranteed). Requires embeddings
|
||||
@@ -688,6 +705,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
media,
|
||||
ingested_after,
|
||||
doc_id,
|
||||
repo,
|
||||
code_lang,
|
||||
trace,
|
||||
bulk,
|
||||
} => {
|
||||
@@ -819,7 +838,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
None => None,
|
||||
};
|
||||
|
||||
// p9-fb-36: build SearchFilters from the 7 new flags.
|
||||
// p9-fb-36 + p10-1A-1: build SearchFilters from CLI flags.
|
||||
let filters = kebab_core::SearchFilters {
|
||||
tags_any: tag.clone(),
|
||||
lang: lang.as_ref().map(|s| kebab_core::Lang(s.clone())),
|
||||
@@ -828,8 +847,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
media: media_norm,
|
||||
ingested_after: ingested_after_parsed,
|
||||
doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())),
|
||||
repo: vec![],
|
||||
code_lang: vec![],
|
||||
repo: repo.clone(),
|
||||
code_lang: code_lang.clone(),
|
||||
};
|
||||
|
||||
let q = kebab_core::SearchQuery {
|
||||
|
||||
100
crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs
Normal file
100
crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs
Normal file
@@ -0,0 +1,100 @@
|
||||
//! p10-1A-1 Task 13: regression — the 5 original Citation variants
|
||||
//! (Line, Page, Region, Caption, Time) serialize byte-identically to
|
||||
//! pre-Task-1 form. No spurious `code`, `line_start`, or `symbol` keys
|
||||
//! must leak into these variants.
|
||||
|
||||
use kebab_core::{Citation, WorkspacePath};
|
||||
|
||||
#[test]
|
||||
fn line_variant_serialization_unchanged() {
|
||||
let c = Citation::Line {
|
||||
path: WorkspacePath::new("a.md".into()).unwrap(),
|
||||
start: 1,
|
||||
end: 2,
|
||||
section: Some("§14".into()),
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "line");
|
||||
assert_eq!(v["start"], 1);
|
||||
assert_eq!(v["end"], 2);
|
||||
assert_eq!(v["section"], "§14");
|
||||
// Must not bleed Code-variant keys.
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
assert!(v.get("code").is_none(), "code must be absent: {v}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn line_variant_null_section_omitted() {
|
||||
let c = Citation::Line {
|
||||
path: WorkspacePath::new("b.md".into()).unwrap(),
|
||||
start: 5,
|
||||
end: 10,
|
||||
section: None,
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "line");
|
||||
// `section` with None should be omitted (skip_serializing_if = is_none).
|
||||
assert!(v.get("section").is_none() || v["section"].is_null());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn page_variant_serialization_unchanged() {
|
||||
let c = Citation::Page {
|
||||
path: WorkspacePath::new("a.pdf".into()).unwrap(),
|
||||
page: 13,
|
||||
section: None,
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "page");
|
||||
assert_eq!(v["page"], 13);
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn region_variant_serialization_unchanged() {
|
||||
let c = Citation::Region {
|
||||
path: WorkspacePath::new("img.png".into()).unwrap(),
|
||||
x: 10,
|
||||
y: 20,
|
||||
w: 100,
|
||||
h: 200,
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "region");
|
||||
assert_eq!(v["x"], 10);
|
||||
assert_eq!(v["y"], 20);
|
||||
assert_eq!(v["w"], 100);
|
||||
assert_eq!(v["h"], 200);
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn caption_variant_serialization_unchanged() {
|
||||
let c = Citation::Caption {
|
||||
path: WorkspacePath::new("a.png".into()).unwrap(),
|
||||
model: "qwen2.5-vl:7b".into(),
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "caption");
|
||||
assert_eq!(v["model"], "qwen2.5-vl:7b");
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn time_variant_serialization_unchanged() {
|
||||
let c = Citation::Time {
|
||||
path: WorkspacePath::new("audio.mp3".into()).unwrap(),
|
||||
start_ms: 1000,
|
||||
end_ms: 5000,
|
||||
speaker: Some("Alice".into()),
|
||||
};
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "time");
|
||||
assert_eq!(v["start_ms"], 1000);
|
||||
assert_eq!(v["end_ms"], 5000);
|
||||
assert_eq!(v["speaker"], "Alice");
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
}
|
||||
72
crates/kebab-cli/tests/wire_search_filters_code.rs
Normal file
72
crates/kebab-cli/tests/wire_search_filters_code.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
//! p10-1A-1 Task 15: CLI accepts --repo and --code-lang flags.
|
||||
//!
|
||||
//! These tests verify that clap parses the new flags without error.
|
||||
//! They drive `kebab search --help` (which exercises flag parsing
|
||||
//! via clap's help generation path, exiting 0) or use a minimal
|
||||
//! config + `--json` round-trip to verify the flags reach the wire.
|
||||
|
||||
use std::process::Command;
|
||||
|
||||
fn kebab() -> Command {
|
||||
Command::new(env!("CARGO_BIN_EXE_kebab"))
|
||||
}
|
||||
|
||||
/// `kebab search --help` must exit 0 and mention `--repo`.
|
||||
#[test]
|
||||
fn cli_search_help_mentions_repo_flag() {
|
||||
let out = kebab()
|
||||
.args(["search", "--help"])
|
||||
.output()
|
||||
.expect("failed to run kebab");
|
||||
// clap help exits 0.
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"kebab search --help exited non-zero: {:?}",
|
||||
out.status
|
||||
);
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
assert!(
|
||||
stdout.contains("--repo"),
|
||||
"--repo flag must appear in search help output:\n{stdout}"
|
||||
);
|
||||
}
|
||||
|
||||
/// `kebab search --help` must exit 0 and mention `--code-lang`.
|
||||
#[test]
|
||||
fn cli_search_help_mentions_code_lang_flag() {
|
||||
let out = kebab()
|
||||
.args(["search", "--help"])
|
||||
.output()
|
||||
.expect("failed to run kebab");
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"kebab search --help exited non-zero: {:?}",
|
||||
out.status
|
||||
);
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
assert!(
|
||||
stdout.contains("--code-lang"),
|
||||
"--code-lang flag must appear in search help output:\n{stdout}"
|
||||
);
|
||||
}
|
||||
|
||||
/// `kebab search --help` must exit 0 and mention `--media`.
|
||||
/// Confirms `--media code` value pathway is available (media is
|
||||
/// a free-form Vec<String> that already accepted arbitrary values).
|
||||
#[test]
|
||||
fn cli_search_help_mentions_media_flag() {
|
||||
let out = kebab()
|
||||
.args(["search", "--help"])
|
||||
.output()
|
||||
.expect("failed to run kebab");
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"kebab search --help exited non-zero: {:?}",
|
||||
out.status
|
||||
);
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
assert!(
|
||||
stdout.contains("--media"),
|
||||
"--media flag must appear in search help output:\n{stdout}"
|
||||
);
|
||||
}
|
||||
47
crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs
Normal file
47
crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
//! p10-1A-1 Task 13: regression — markdown SearchHit omits `repo` and
|
||||
//! `code_lang` from JSON when both are `None`.
|
||||
//!
|
||||
//! Proves that adding optional fields to SearchHit does not silently
|
||||
//! inject spurious keys into the existing markdown corpus wire shape.
|
||||
|
||||
use kebab_core::{
|
||||
Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
|
||||
SearchHit, WorkspacePath,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn markdown_hit_omits_repo_and_code_lang() {
|
||||
let hit = SearchHit {
|
||||
rank: 1,
|
||||
chunk_id: ChunkId("c1".into()),
|
||||
doc_id: DocumentId("d1".into()),
|
||||
doc_path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
|
||||
heading_path: vec!["A".into(), "B".into()],
|
||||
section_label: Some("B".into()),
|
||||
snippet: "hi".into(),
|
||||
citation: Citation::Line {
|
||||
path: WorkspacePath::new("notes/foo.md".into()).unwrap(),
|
||||
start: 1,
|
||||
end: 2,
|
||||
section: None,
|
||||
},
|
||||
retrieval: RetrievalDetail::default(),
|
||||
index_version: IndexVersion("v1".into()),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion("md-heading-v1".into()),
|
||||
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
|
||||
stale: false,
|
||||
score_kind: ScoreKind::Rrf,
|
||||
repo: None,
|
||||
code_lang: None,
|
||||
};
|
||||
let s = serde_json::to_string(&hit).unwrap();
|
||||
assert!(
|
||||
!s.contains("\"repo\""),
|
||||
"repo should be absent from markdown hit JSON: {s}"
|
||||
);
|
||||
assert!(
|
||||
!s.contains("\"code_lang\""),
|
||||
"code_lang should be absent from markdown hit JSON: {s}"
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user