부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부. kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
307 lines
12 KiB
Rust
307 lines
12 KiB
Rust
//! `kebab schema` — introspection report. See spec
|
|
//! `docs/superpowers/specs/2026-05-07-p9-fb-27-introspection-and-error-wire-design.md`.
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use kebab_config::Config;
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct SchemaV1 {
|
|
pub schema_version: String,
|
|
pub kebab_version: String,
|
|
pub wire: WireBlock,
|
|
pub capabilities: Capabilities,
|
|
pub models: Models,
|
|
pub stats: Stats,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct WireBlock {
|
|
pub schemas: Vec<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct Capabilities {
|
|
pub json_mode: bool,
|
|
pub ingest_progress: bool,
|
|
pub ingest_cancellation: bool,
|
|
pub rag_multi_turn: bool,
|
|
pub search_cache: bool,
|
|
pub incremental_ingest: bool,
|
|
pub streaming_ask: bool,
|
|
pub http_daemon: bool,
|
|
pub mcp_server: bool,
|
|
pub single_file_ingest: bool,
|
|
pub bulk_search: bool,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct Models {
|
|
pub parser_version: String,
|
|
pub chunker_version: String,
|
|
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
|
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
|
#[serde(default)]
|
|
pub active_parsers: Vec<String>,
|
|
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
|
/// 빈 corpus → empty Vec.
|
|
#[serde(default)]
|
|
pub active_chunkers: Vec<String>,
|
|
pub embedding_version: String,
|
|
pub prompt_template_version: String,
|
|
pub index_version: String,
|
|
pub corpus_revision: u64,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
pub struct Stats {
|
|
pub doc_count: u64,
|
|
pub chunk_count: u64,
|
|
pub asset_count: u64,
|
|
pub last_ingest_at: Option<String>,
|
|
/// p9-fb-37: per-media-kind doc count (5 keys, zero-padded).
|
|
#[serde(default)]
|
|
pub media_breakdown: std::collections::BTreeMap<String, u64>,
|
|
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
|
|
#[serde(default)]
|
|
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
|
|
/// p9-fb-37: on-disk byte sums.
|
|
#[serde(default)]
|
|
pub index_bytes: kebab_core::IndexBytes,
|
|
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
|
|
#[serde(default)]
|
|
pub stale_doc_count: u64,
|
|
/// p10-1A-1: code language breakdown (**doc** counts by canonical
|
|
/// lowercase language identifier). Empty until 1A-2 produces code
|
|
/// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
|
|
/// previous "chunk counts" wording was a longstanding mis-label —
|
|
/// implementation has always been `COUNT(*) FROM documents
|
|
/// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
|
|
/// chunk-level companion.
|
|
#[serde(default)]
|
|
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
|
|
/// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
|
|
/// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
|
|
/// doc-count wording corrected (mirror of code_lang_breakdown).
|
|
#[serde(default)]
|
|
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
|
|
/// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
|
|
/// chunk counts instead of doc counts. Indexing-pressure metric —
|
|
/// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
|
|
/// here in a way `code_lang_breakdown` (doc count) hides.
|
|
#[serde(default)]
|
|
pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
|
|
}
|
|
|
|
/// Version of this crate, captured at compile time from Cargo's
/// `CARGO_PKG_VERSION` environment variable.
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
|
|
|
/// Wire schema id for [`SchemaV1`].
///
/// This is the single source of truth for the id: `kebab-cli` re-uses it via
/// `kebab_app::schema::SCHEMA_V1_ID` when wrapping the report.
pub const SCHEMA_V1_ID: &str = "schema.v1";
|
|
|
|
// Authoritative list of wire schemas this binary emits. Keep in sync with
|
|
// `docs/wire-schema/v1/*.schema.json` and `kebab-cli::wire::wire_*` helpers.
|
|
const WIRE_SCHEMAS: &[&str] = &[
|
|
"answer.v1",
|
|
"search_hit.v1",
|
|
"search_response.v1",
|
|
"doc_summary.v1",
|
|
"chunk_inspection.v1",
|
|
"doctor.v1",
|
|
"config_migration.v1",
|
|
"ingest_report.v1",
|
|
"ingest_progress.v1",
|
|
"reset_report.v1",
|
|
"citation.v1",
|
|
"schema.v1",
|
|
"error.v1",
|
|
"bulk_search_item.v1",
|
|
"bulk_search_response.v1",
|
|
// v0.20.x r2 Enhancement 3: OCR statistics + failures introspection.
|
|
"ocr_stats.v1",
|
|
"ocr_failures.v1",
|
|
];
|
|
|
|
/// Build a [`SchemaV1`] introspection report for the given config.
|
|
///
|
|
/// Opens the SQLite store read-only via [`kebab_store_sqlite::SqliteStore::open_existing`]
|
|
/// so the caller (kebab-cli) does not need write access to the data dir.
|
|
/// Returns a [`kebab_store_sqlite::NotIndexed`] error (wrapped in `anyhow`)
|
|
/// if the database file does not exist — the CLI translates that to an
|
|
/// `error.v1` / `"not_indexed"` wire record.
|
|
#[doc(hidden)]
|
|
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
|
|
let store = open_store_for_stats(cfg)?;
|
|
let stats = collect_stats(cfg, &store)?;
|
|
let models = collect_models(cfg, &store);
|
|
Ok(SchemaV1 {
|
|
schema_version: SCHEMA_V1_ID.to_string(),
|
|
kebab_version: KEBAB_VERSION.to_string(),
|
|
wire: WireBlock {
|
|
schemas: WIRE_SCHEMAS.iter().map(|s| (*s).to_string()).collect(),
|
|
},
|
|
capabilities: capabilities_snapshot(),
|
|
models,
|
|
stats,
|
|
})
|
|
}
|
|
|
|
fn capabilities_snapshot() -> Capabilities {
|
|
Capabilities {
|
|
json_mode: true,
|
|
ingest_progress: true,
|
|
ingest_cancellation: true,
|
|
rag_multi_turn: true,
|
|
search_cache: true,
|
|
incremental_ingest: true,
|
|
streaming_ask: true,
|
|
http_daemon: false,
|
|
mcp_server: true,
|
|
single_file_ingest: true,
|
|
bulk_search: true,
|
|
}
|
|
}
|
|
|
|
fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::SqliteStore> {
|
|
// Mirror the data_dir resolution used in SqliteStore::open:
|
|
// kebab_config::expand_path(&cfg.storage.data_dir, "") resolves tilde
|
|
// and env vars. The SQLITE_FILE name ("kebab.sqlite") is the canonical
|
|
// file name defined in kebab-store-sqlite.
|
|
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
|
let db_path = data_dir.join("kebab.sqlite");
|
|
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
|
}
|
|
|
|
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
|
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
|
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
|
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
|
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
|
Ok(Stats {
|
|
doc_count: counts.doc_count,
|
|
chunk_count: counts.chunk_count,
|
|
asset_count: counts.asset_count,
|
|
last_ingest_at: counts.last_ingest_at,
|
|
media_breakdown: counts.media_breakdown,
|
|
lang_breakdown: counts.lang_breakdown,
|
|
index_bytes,
|
|
stale_doc_count: counts.stale_doc_count,
|
|
// p10-1A-2: populated by the store query added in this task.
|
|
code_lang_breakdown: store.code_lang_breakdown()?,
|
|
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
|
|
// placeholder — mirror of code_lang_breakdown for the repo field.
|
|
repo_breakdown: store.repo_breakdown()?,
|
|
// v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
|
|
// 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
|
|
code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
|
|
})
|
|
}
|
|
|
|
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
|
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
|
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
|
Models {
|
|
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
|
// maintain their own versions; surface those when SchemaV1.models
|
|
// becomes a multi-medium map (P+).
|
|
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
|
chunker_version: cfg.ingest.chunking.chunker_version.clone(),
|
|
active_parsers,
|
|
active_chunkers,
|
|
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
|
embedding_version: cfg.models.embedding.model.clone(),
|
|
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
|
index_version: kebab_store_vector::INDEX_VERSION_STR.to_string(),
|
|
// corpus_revision returns u64 directly (no Result) — matches
|
|
// existing impl; treat 0 as the default for a fresh/unrevised store.
|
|
corpus_revision: store.corpus_revision(),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests_stats_ext {
    use super::*;

    /// p10-1A-1: Stats must serialize `code_lang_breakdown` and
    /// `repo_breakdown` so downstream consumers (MCP skill, Claude Code)
    /// can branch on their presence.
    #[test]
    fn stats_includes_code_lang_and_repo_breakdown_fields() {
        let v = serde_json::to_value(&Stats::default()).unwrap();

        // Presence: each breakdown key must exist in the serialized form.
        assert!(
            v.get("code_lang_breakdown").is_some(),
            "Stats JSON must include code_lang_breakdown: {v}"
        );
        assert!(
            v.get("repo_breakdown").is_some(),
            "Stats JSON must include repo_breakdown: {v}"
        );
        // v0.17.0 PR-C: chunk-level companion field.
        assert!(
            v.get("code_lang_chunk_breakdown").is_some(),
            "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
        );

        // Shape: an empty BTreeMap serializes as `{}` — an object, not null.
        assert!(
            v["code_lang_breakdown"].is_object(),
            "code_lang_breakdown must be an object: {v}"
        );
        assert!(
            v["repo_breakdown"].is_object(),
            "repo_breakdown must be an object: {v}"
        );
        assert!(
            v["code_lang_chunk_breakdown"].is_object(),
            "code_lang_chunk_breakdown must be an object: {v}"
        );
    }

    /// A freshly migrated, empty corpus still yields the padded media
    /// breakdown, an empty language map and a non-zero SQLite size.
    #[test]
    fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
        let tmp = tempfile::tempdir().unwrap();
        let mut cfg = kebab_config::Config::defaults();
        cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();

        // Bring up migrations so the sqlite file is created; the store is
        // released at the end of this scope, before the report is built.
        {
            let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
            store.run_migrations().unwrap();
        }

        let report = schema_with_config(&cfg).unwrap();
        let stats = &report.stats;

        // 5 keys padded.
        assert_eq!(stats.media_breakdown.len(), 5);
        assert_eq!(stats.media_breakdown.get("markdown"), Some(&0));
        assert_eq!(stats.media_breakdown.get("pdf"), Some(&0));
        // lang map empty on empty corpus.
        assert!(stats.lang_breakdown.is_empty());
        // sqlite bytes positive after migrations, lancedb 0.
        assert!(stats.index_bytes.sqlite > 0);
        assert_eq!(stats.index_bytes.lancedb, 0);
        assert_eq!(stats.stale_doc_count, 0);
    }
}
|
|
|
|
#[cfg(test)]
mod tests_capabilities {
    use super::*;

    #[test]
    fn capabilities_streaming_ask_matches_cli_surface() {
        // Bug #9: `kebab ask --stream` correctly emits answer_event.v1 ndjson
        // (191 events), so capabilities.streaming_ask must be true.
        let Capabilities { streaming_ask, .. } = capabilities_snapshot();
        assert!(streaming_ask, "streaming_ask must be true (Bug #9)");
    }

    #[test]
    fn capabilities_single_file_ingest_matches_cli_surface() {
        // Bug #9: both `kebab ingest-file <path>` and
        // `kebab ingest-stdin --title <T>` correctly emit ingest_report.v1,
        // so capabilities.single_file_ingest must be true.
        let Capabilities {
            single_file_ingest, ..
        } = capabilities_snapshot();
        assert!(
            single_file_ingest,
            "single_file_ingest must be true (Bug #9)"
        );
    }
}
|