diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index 793f4d5..866c714 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -45,7 +45,7 @@ pub struct Models { pub corpus_revision: u64, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct Stats { pub doc_count: u64, pub chunk_count: u64, @@ -63,6 +63,14 @@ pub struct Stats { /// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold. #[serde(default)] pub stale_doc_count: u64, + /// p10-1A-1: code language breakdown (chunk counts by canonical lowercase + /// language identifier). Empty until 1A-2 produces code chunks. + #[serde(default)] + pub code_lang_breakdown: std::collections::BTreeMap, + /// p10-1A-1: repo breakdown (chunk counts by `metadata.repo` value). + /// Empty until 1A-2 produces code chunks. + #[serde(default)] + pub repo_breakdown: std::collections::BTreeMap, } const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -158,6 +166,9 @@ fn collect_stats( lang_breakdown: counts.lang_breakdown, index_bytes, stale_doc_count: counts.stale_doc_count, + // p10-1A-1: populated by 1A-2 code ingest; empty until then. + code_lang_breakdown: std::collections::BTreeMap::new(), + repo_breakdown: std::collections::BTreeMap::new(), }) } @@ -182,6 +193,32 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode mod tests_stats_ext { use super::*; + /// p10-1A-1: Stats must serialize `code_lang_breakdown` and + /// `repo_breakdown` so downstream consumers (MCP skill, Claude Code) + /// can branch on their presence. 
+ #[test] + fn stats_includes_code_lang_and_repo_breakdown_fields() { + let stats = Stats::default(); + let v = serde_json::to_value(&stats).unwrap(); + assert!( + v.get("code_lang_breakdown").is_some(), + "Stats JSON must include code_lang_breakdown: {v}" + ); + assert!( + v.get("repo_breakdown").is_some(), + "Stats JSON must include repo_breakdown: {v}" + ); + // Empty BTreeMap serializes as `{}` — confirm it's an object, not null. + assert!( + v["code_lang_breakdown"].is_object(), + "code_lang_breakdown must be an object: {v}" + ); + assert!( + v["repo_breakdown"].is_object(), + "repo_breakdown must be an object: {v}" + ); + } + #[test] fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 3ca6e63..ad06916 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -46,6 +46,11 @@ struct Cli { command: Cmd, } +// p10-1A-1: adding `repo` and `code_lang` Vec fields pushed `Cmd` +// over clippy's large_enum_variant threshold. The enum is short-lived +// (parsed once at startup, never cloned in a hot path) — boxing would add +// noise with no real benefit. +#[allow(clippy::large_enum_variant)] #[derive(Subcommand, Debug)] enum Cmd { /// Initialise XDG dirs + workspace + `config.toml`. @@ -165,6 +170,18 @@ enum Cmd { #[arg(long)] doc_id: Option, + /// p10-1A-1: filter by repo name (`metadata.repo`). Repeatable; + /// multi-value = OR. Empty = no filter (all repos returned). + #[arg(long = "repo", value_name = "NAME", num_args = 1)] + repo: Vec, + + /// p10-1A-1: filter by code language identifier (lowercase + /// canonical). Repeatable or comma-separated. + /// Examples: `rust`, `python`, `typescript`. + /// Unknown values produce empty hits. 
+ #[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')] + code_lang: Vec, + /// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate /// lists + per-stage timing in the response. Bypasses cache /// (debug intent — fresh run guaranteed). Requires embeddings @@ -688,6 +705,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { media, ingested_after, doc_id, + repo, + code_lang, trace, bulk, } => { @@ -819,7 +838,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> { None => None, }; - // p9-fb-36: build SearchFilters from the 7 new flags. + // p9-fb-36 + p10-1A-1: build SearchFilters from CLI flags. let filters = kebab_core::SearchFilters { tags_any: tag.clone(), lang: lang.as_ref().map(|s| kebab_core::Lang(s.clone())), @@ -828,8 +847,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { media: media_norm, ingested_after: ingested_after_parsed, doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())), - repo: vec![], - code_lang: vec![], + repo: repo.clone(), + code_lang: code_lang.clone(), }; let q = kebab_core::SearchQuery { diff --git a/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs b/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs new file mode 100644 index 0000000..6242024 --- /dev/null +++ b/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs @@ -0,0 +1,100 @@ +//! p10-1A-1 Task 13: regression — the 5 original Citation variants +//! (Line, Page, Region, Caption, Time) serialize byte-identically to +//! their pre-Task-1 form. Spurious `code`, `line_start`, or `symbol` keys +//! must not leak into these variants. 
+ +use kebab_core::{Citation, WorkspacePath}; + +#[test] +fn line_variant_serialization_unchanged() { + let c = Citation::Line { + path: WorkspacePath::new("a.md".into()).unwrap(), + start: 1, + end: 2, + section: Some("§14".into()), + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "line"); + assert_eq!(v["start"], 1); + assert_eq!(v["end"], 2); + assert_eq!(v["section"], "§14"); + // Must not bleed Code-variant keys. + assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); + assert!(v.get("code").is_none(), "code must be absent: {v}"); +} + +#[test] +fn line_variant_null_section_omitted() { + let c = Citation::Line { + path: WorkspacePath::new("b.md".into()).unwrap(), + start: 5, + end: 10, + section: None, + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "line"); + // `section: None` is expected to be omitted (skip_serializing_if); the check below also accepts an explicit `null`. + assert!(v.get("section").is_none() || v["section"].is_null()); +} + +#[test] +fn page_variant_serialization_unchanged() { + let c = Citation::Page { + path: WorkspacePath::new("a.pdf".into()).unwrap(), + page: 13, + section: None, + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "page"); + assert_eq!(v["page"], 13); + assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); +} + +#[test] +fn region_variant_serialization_unchanged() { + let c = Citation::Region { + path: WorkspacePath::new("img.png".into()).unwrap(), + x: 10, + y: 20, + w: 100, + h: 200, + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "region"); + assert_eq!(v["x"], 10); + assert_eq!(v["y"], 20); + assert_eq!(v["w"], 100); + assert_eq!(v["h"], 200); + assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); +} + +#[test] +fn caption_variant_serialization_unchanged() { + let c = 
Citation::Caption { + path: WorkspacePath::new("a.png".into()).unwrap(), + model: "qwen2.5-vl:7b".into(), + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "caption"); + assert_eq!(v["model"], "qwen2.5-vl:7b"); + assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); +} + +#[test] +fn time_variant_serialization_unchanged() { + let c = Citation::Time { + path: WorkspacePath::new("audio.mp3".into()).unwrap(), + start_ms: 1000, + end_ms: 5000, + speaker: Some("Alice".into()), + }; + let v = serde_json::to_value(&c).unwrap(); + assert_eq!(v["kind"], "time"); + assert_eq!(v["start_ms"], 1000); + assert_eq!(v["end_ms"], 5000); + assert_eq!(v["speaker"], "Alice"); + assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); +} diff --git a/crates/kebab-cli/tests/wire_search_filters_code.rs b/crates/kebab-cli/tests/wire_search_filters_code.rs new file mode 100644 index 0000000..3480210 --- /dev/null +++ b/crates/kebab-cli/tests/wire_search_filters_code.rs @@ -0,0 +1,72 @@ +//! p10-1A-1 Task 15: CLI accepts --repo and --code-lang flags. +//! +//! These tests verify that clap registers the new flags. +//! They drive `kebab search --help` (which exits 0) and assert +//! that each flag name appears in the help text. They do not parse +//! flag values or check that the filters reach the wire. + +use std::process::Command; + +fn kebab() -> Command { + Command::new(env!("CARGO_BIN_EXE_kebab")) +} + +/// `kebab search --help` must exit 0 and mention `--repo`. +#[test] +fn cli_search_help_mentions_repo_flag() { + let out = kebab() + .args(["search", "--help"]) + .output() + .expect("failed to run kebab"); + // clap help exits 0. 
+ assert!( + out.status.success(), + "kebab search --help exited non-zero: {:?}", + out.status + ); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains("--repo"), + "--repo flag must appear in search help output:\n{stdout}" + ); +} + +/// `kebab search --help` must exit 0 and mention `--code-lang`. +#[test] +fn cli_search_help_mentions_code_lang_flag() { + let out = kebab() + .args(["search", "--help"]) + .output() + .expect("failed to run kebab"); + assert!( + out.status.success(), + "kebab search --help exited non-zero: {:?}", + out.status + ); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains("--code-lang"), + "--code-lang flag must appear in search help output:\n{stdout}" + ); +} + +/// `kebab search --help` must exit 0 and mention `--media`. +/// Only the flag's presence is checked; `--media code` relies on media +/// being a free-form Vec that already accepted arbitrary values. +#[test] +fn cli_search_help_mentions_media_flag() { + let out = kebab() + .args(["search", "--help"]) + .output() + .expect("failed to run kebab"); + assert!( + out.status.success(), + "kebab search --help exited non-zero: {:?}", + out.status + ); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains("--media"), + "--media flag must appear in search help output:\n{stdout}" + ); +} diff --git a/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs b/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs new file mode 100644 index 0000000..c3d7d24 --- /dev/null +++ b/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs @@ -0,0 +1,47 @@ +//! p10-1A-1 Task 13: regression — markdown SearchHit omits `repo` and +//! `code_lang` from JSON when both are `None`. +//! +//! Proves that adding optional fields to SearchHit does not silently +//! inject spurious keys into the existing markdown corpus wire shape. 
+ +use kebab_core::{ + Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind, + SearchHit, WorkspacePath, +}; + +#[test] +fn markdown_hit_omits_repo_and_code_lang() { + let hit = SearchHit { + rank: 1, + chunk_id: ChunkId("c1".into()), + doc_id: DocumentId("d1".into()), + doc_path: WorkspacePath::new("notes/foo.md".into()).unwrap(), + heading_path: vec!["A".into(), "B".into()], + section_label: Some("B".into()), + snippet: "hi".into(), + citation: Citation::Line { + path: WorkspacePath::new("notes/foo.md".into()).unwrap(), + start: 1, + end: 2, + section: None, + }, + retrieval: RetrievalDetail::default(), + index_version: IndexVersion("v1".into()), + embedding_model: None, + chunker_version: ChunkerVersion("md-heading-v1".into()), + indexed_at: time::OffsetDateTime::UNIX_EPOCH, + stale: false, + score_kind: ScoreKind::Rrf, + repo: None, + code_lang: None, + }; + let s = serde_json::to_string(&hit).unwrap(); + assert!( + !s.contains("\"repo\""), + "repo should be absent from markdown hit JSON: {s}" + ); + assert!( + !s.contains("\"code_lang\""), + "code_lang should be absent from markdown hit JSON: {s}" + ); +}