//! `kebab schema` — introspection report. See spec
//! `docs/superpowers/specs/2026-05-07-p9-fb-27-introspection-and-error-wire-design.md`.

use serde::{Deserialize, Serialize};

use kebab_config::Config;

// NOTE(review): several field types below (`Vec`, `Option`, `BTreeMap`) appear
// without generic arguments in the reviewed text; confirm the intended
// element/key/value types before relying on these declarations.

/// Top-level introspection report, assembled by `schema_with_config`.
///
/// `schema_version` carries [`SCHEMA_V1_ID`] and `kebab_version` carries the
/// crate version captured at compile time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaV1 {
    pub schema_version: String,
    pub kebab_version: String,
    pub wire: WireBlock,
    pub capabilities: Capabilities,
    pub models: Models,
    pub stats: Stats,
}

/// Wire schema ids reported by this binary (filled from `WIRE_SCHEMAS`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WireBlock {
    pub schemas: Vec,
}

/// Boolean feature flags advertised in the report. The values emitted at
/// runtime are the ones hard-coded in `capabilities_snapshot`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Capabilities {
    pub json_mode: bool,
    pub ingest_progress: bool,
    pub ingest_cancellation: bool,
    pub rag_multi_turn: bool,
    pub search_cache: bool,
    pub incremental_ingest: bool,
    pub streaming_ask: bool,
    pub http_daemon: bool,
    pub mcp_server: bool,
    pub single_file_ingest: bool,
    pub bulk_search: bool,
}

/// Version identifiers for the pipeline components, filled by
/// `collect_models` from the config, crate constants and the store.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Models {
    pub parser_version: String,
    pub chunker_version: String,
    /// v0.20.1+ (Bug #13). Every parser version active in the corpus.
    /// Empty corpus → empty Vec. Backward compat: the `parser_version`
    /// field is retained.
    #[serde(default)]
    pub active_parsers: Vec,
    /// v0.20.1+ (Bug #13). Every chunker version active in the corpus.
    /// Empty corpus → empty Vec.
    #[serde(default)]
    pub active_chunkers: Vec,
    pub embedding_version: String,
    pub prompt_template_version: String,
    pub index_version: String,
    pub corpus_revision: u64,
}

/// Corpus statistics, filled by `collect_stats`.
///
/// Fields marked `#[serde(default)]` fall back to their `Default` value when
/// absent from the input being deserialized.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Stats {
    pub doc_count: u64,
    pub chunk_count: u64,
    pub asset_count: u64,
    pub last_ingest_at: Option,
    /// p9-fb-37: per-media-kind doc count (5 keys, zero-padded).
    #[serde(default)]
    pub media_breakdown: std::collections::BTreeMap,
    /// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
    #[serde(default)]
    pub lang_breakdown: std::collections::BTreeMap,
    /// p9-fb-37: on-disk byte sums.
    #[serde(default)]
    pub index_bytes: kebab_core::IndexBytes,
    /// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
    #[serde(default)]
    pub stale_doc_count: u64,
    /// p10-1A-1: code language breakdown (**doc** counts by canonical
    /// lowercase language identifier). Empty until 1A-2 produces code
    /// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
    /// previous "chunk counts" wording was a longstanding mis-label —
    /// implementation has always been `COUNT(*) FROM documents
    /// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
    /// chunk-level companion.
    #[serde(default)]
    pub code_lang_breakdown: std::collections::BTreeMap,
    /// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
    /// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
    /// doc-count wording corrected (mirror of code_lang_breakdown).
    #[serde(default)]
    pub repo_breakdown: std::collections::BTreeMap,
    /// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
    /// chunk counts instead of doc counts. Indexing-pressure metric —
    /// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
    /// here in a way `code_lang_breakdown` (doc count) hides.
    #[serde(default)]
    pub code_lang_chunk_breakdown: std::collections::BTreeMap,
}

/// Crate version, captured at compile time from `CARGO_PKG_VERSION`.
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");

/// Wire schema id for [`SchemaV1`]. Single source of truth — `kebab-cli`
/// re-uses this via `kebab_app::schema::SCHEMA_V1_ID` when wrapping.
pub const SCHEMA_V1_ID: &str = "schema.v1";

// Authoritative list of wire schemas this binary emits. Keep in sync with
// `docs/wire-schema/v1/*.schema.json` and `kebab-cli::wire::wire_*` helpers.
const WIRE_SCHEMAS: &[&str] = &[ "answer.v1", "search_hit.v1", "search_response.v1", "doc_summary.v1", "chunk_inspection.v1", "doctor.v1", "config_migration.v1", "ingest_report.v1", "ingest_progress.v1", "reset_report.v1", "citation.v1", "schema.v1", "error.v1", "bulk_search_item.v1", "bulk_search_response.v1", // v0.20.x r2 Enhancement 3: OCR statistics + failures introspection. "ocr_stats.v1", "ocr_failures.v1", ]; /// Build a [`SchemaV1`] introspection report for the given config. /// /// Opens the SQLite store read-only via [`kebab_store_sqlite::SqliteStore::open_existing`] /// so the caller (kebab-cli) does not need write access to the data dir. /// Returns a [`kebab_store_sqlite::NotIndexed`] error (wrapped in `anyhow`) /// if the database file does not exist — the CLI translates that to an /// `error.v1` / `"not_indexed"` wire record. #[doc(hidden)] pub fn schema_with_config(cfg: &Config) -> anyhow::Result { let store = open_store_for_stats(cfg)?; let stats = collect_stats(cfg, &store)?; let models = collect_models(cfg, &store); Ok(SchemaV1 { schema_version: SCHEMA_V1_ID.to_string(), kebab_version: KEBAB_VERSION.to_string(), wire: WireBlock { schemas: WIRE_SCHEMAS.iter().map(|s| (*s).to_string()).collect(), }, capabilities: capabilities_snapshot(), models, stats, }) } fn capabilities_snapshot() -> Capabilities { Capabilities { json_mode: true, ingest_progress: true, ingest_cancellation: true, rag_multi_turn: true, search_cache: true, incremental_ingest: true, streaming_ask: true, http_daemon: false, mcp_server: true, single_file_ingest: true, bulk_search: true, } } fn open_store_for_stats(cfg: &Config) -> anyhow::Result { // Mirror the data_dir resolution used in SqliteStore::open: // kebab_config::expand_path(&cfg.storage.data_dir, "") resolves tilde // and env vars. The SQLITE_FILE name ("kebab.sqlite") is the canonical // file name defined in kebab-store-sqlite. 
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, ""); let db_path = data_dir.join("kebab.sqlite"); kebab_store_sqlite::SqliteStore::open_existing(&db_path) } fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result { let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?; let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, ""); let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir) .map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?; Ok(Stats { doc_count: counts.doc_count, chunk_count: counts.chunk_count, asset_count: counts.asset_count, last_ingest_at: counts.last_ingest_at, media_breakdown: counts.media_breakdown, lang_breakdown: counts.lang_breakdown, index_bytes, stale_doc_count: counts.stale_doc_count, // p10-1A-2: populated by the store query added in this task. code_lang_breakdown: store.code_lang_breakdown()?, // p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a // placeholder — mirror of code_lang_breakdown for the repo field. repo_breakdown: store.repo_breakdown()?, // v0.17.0 PR-C: chunk-level companion (closes HOTFIXES // 2026-05-22 "code_lang_breakdown chunk granularity" LOW). code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?, }) } fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models { let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default(); let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default(); Models { // markdown parser only — pdf-page-v1 (P7) / image extractors (P6) // maintain their own versions; surface those when SchemaV1.models // becomes a multi-medium map (P+). parser_version: kebab_parse_md::PARSER_VERSION.to_string(), chunker_version: cfg.ingest.chunking.chunker_version.clone(), active_parsers, active_chunkers, // EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan. 
embedding_version: cfg.models.embedding.model.clone(), prompt_template_version: cfg.rag.prompt_template_version.clone(), index_version: kebab_store_vector::INDEX_VERSION_STR.to_string(), // corpus_revision returns u64 directly (no Result) — matches // existing impl; treat 0 as the default for a fresh/unrevised store. corpus_revision: store.corpus_revision(), } } #[cfg(test)] mod tests_stats_ext { use super::*; /// p10-1A-1: Stats must serialize `code_lang_breakdown` and /// `repo_breakdown` so downstream consumers (MCP skill, Claude Code) /// can branch on their presence. #[test] fn stats_includes_code_lang_and_repo_breakdown_fields() { let stats = Stats::default(); let v = serde_json::to_value(&stats).unwrap(); assert!( v.get("code_lang_breakdown").is_some(), "Stats JSON must include code_lang_breakdown: {v}" ); assert!( v.get("repo_breakdown").is_some(), "Stats JSON must include repo_breakdown: {v}" ); // v0.17.0 PR-C: chunk-level companion field. assert!( v.get("code_lang_chunk_breakdown").is_some(), "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}" ); // Empty BTreeMap serializes as `{}` — confirm it's an object, not null. assert!( v["code_lang_breakdown"].is_object(), "code_lang_breakdown must be an object: {v}" ); assert!( v["repo_breakdown"].is_object(), "repo_breakdown must be an object: {v}" ); assert!( v["code_lang_chunk_breakdown"].is_object(), "code_lang_chunk_breakdown must be an object: {v}" ); } #[test] fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() { let dir = tempfile::tempdir().unwrap(); let mut cfg = kebab_config::Config::defaults(); cfg.storage.data_dir = dir.path().to_string_lossy().into_owned(); // Bring up migrations so the sqlite file is created. let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap(); store.run_migrations().unwrap(); drop(store); let s = schema_with_config(&cfg).unwrap(); // 5 keys padded. 
assert_eq!(s.stats.media_breakdown.len(), 5); assert_eq!(s.stats.media_breakdown.get("markdown"), Some(&0)); assert_eq!(s.stats.media_breakdown.get("pdf"), Some(&0)); // lang map empty on empty corpus. assert!(s.stats.lang_breakdown.is_empty()); // sqlite bytes positive after migrations, lancedb 0. assert!(s.stats.index_bytes.sqlite > 0); assert_eq!(s.stats.index_bytes.lancedb, 0); assert_eq!(s.stats.stale_doc_count, 0); } } #[cfg(test)] mod tests_capabilities { use super::*; #[test] fn capabilities_streaming_ask_matches_cli_surface() { // Bug #9: kebab ask --stream 가 answer_event.v1 ndjson 191 event 정상 emit → // capabilities.streaming_ask 가 true 여야 함. let caps = capabilities_snapshot(); assert!(caps.streaming_ask, "streaming_ask must be true (Bug #9)"); } #[test] fn capabilities_single_file_ingest_matches_cli_surface() { // Bug #9: kebab ingest-file + kebab ingest-stdin --title 양쪽 모두 // ingest_report.v1 정상 emit → capabilities.single_file_ingest 가 true 여야 함. let caps = capabilities_snapshot(); assert!( caps.single_file_ingest, "single_file_ingest must be true (Bug #9)" ); } }