feat(kebab-app): schema_with_config facade (fb-27)

New `SchemaV1` struct + `schema_with_config(&Config)` builder. Surfaces
wire schemas list, capabilities (current + future placeholders), model
versions (parser/chunker/embedding/prompt_template/index/corpus_revision),
and stats (doc/chunk/asset counts + last ingest). kebab-store-sqlite
gains `count_summary()` to back the stats block.

Deviations from plan:
- `cfg.models.embedding.id` → `cfg.models.embedding.model` (actual field name)
- No `Config::expand_path` method → free fn `kebab_config::expand_path(&cfg.storage.data_dir, "")`
- `PARSER_VERSION` added to `kebab-parse-md/src/lib.rs` (was absent; synced with `KEBAB_PARSE_MD_VERSION` literal in kebab-app)
- `INDEX_VERSION_STR` added to `kebab-store-vector/src/store.rs` + re-exported from `lib.rs` (was a private `const`)
- `corpus_revision()` returns `u64` directly (not `Result<u64>`) — no `?` in collect_models
- `SchemaV1` carries `schema_version: "schema.v1"` field (wire schema convention)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
th-kim0823
2026-05-07 11:46:37 +09:00
parent 1c4d554bf4
commit 39b4433549
7 changed files with 238 additions and 2 deletions

View File

@@ -60,10 +60,12 @@ pub mod error_signal;
pub mod ingest_progress;
pub mod logging;
pub mod reset;
pub mod schema;
pub use app::App;
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
pub use reset::{ResetReport, ResetScope};
pub use schema::{Capabilities, Models, SchemaV1, Stats, WireBlock, schema_with_config};
/// p9-fb-25: sentinel for files without an extension in
/// `IngestReport.skipped_by_extension` keys + `IngestItem.warnings`

View File

@@ -0,0 +1,142 @@
//! `kebab schema` — introspection report. See spec
//! `docs/superpowers/specs/2026-05-07-p9-fb-27-introspection-and-error-wire-design.md`.
use serde::{Deserialize, Serialize};
use kebab_config::Config;
/// Top-level introspection report returned by [`schema_with_config`].
///
/// Serializable and deserializable through serde. The nested blocks are
/// [`WireBlock`], [`Capabilities`], [`Models`] and [`Stats`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaV1 {
/// Wire-schema tag of this record; `schema_with_config` sets it to
/// `"schema.v1"`.
pub schema_version: String,
/// Crate version captured from `CARGO_PKG_VERSION` at compile time.
pub kebab_version: String,
/// Names of the wire schemas listed in `WIRE_SCHEMAS`.
pub wire: WireBlock,
/// Feature flags as reported by `capabilities_snapshot`.
pub capabilities: Capabilities,
/// Version labels gathered by `collect_models`.
pub models: Models,
/// Store counts gathered by `collect_stats`.
pub stats: Stats,
}
/// List of wire-schema names advertised in a [`SchemaV1`] report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WireBlock {
/// Schema identifiers such as `"answer.v1"`; `schema_with_config` fills
/// this from `WIRE_SCHEMAS`, keeping the declared order.
pub schemas: Vec<String>,
}
/// Boolean feature flags included in a [`SchemaV1`] report.
///
/// The values are hard-coded in `capabilities_snapshot`. At present
/// `streaming_ask`, `http_daemon`, `mcp_server` and `single_file_ingest`
/// are reported as `false` and every other flag as `true`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Capabilities {
pub json_mode: bool,
pub ingest_progress: bool,
pub ingest_cancellation: bool,
pub rag_multi_turn: bool,
pub search_cache: bool,
pub incremental_ingest: bool,
pub streaming_ask: bool,
pub http_daemon: bool,
pub mcp_server: bool,
pub single_file_ingest: bool,
}
/// Version labels reported in a [`SchemaV1`] report.
///
/// Populated by `collect_models` from compile-time constants, the active
/// config, and the SQLite store.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Models {
/// Copy of `kebab_parse_md::PARSER_VERSION`.
pub parser_version: String,
/// Copy of `cfg.chunking.chunker_version`.
pub chunker_version: String,
/// Copy of `cfg.models.embedding.model`.
pub embedding_version: String,
/// Copy of `cfg.rag.prompt_template_version`.
pub prompt_template_version: String,
/// Copy of `kebab_store_vector::INDEX_VERSION_STR`.
pub index_version: String,
/// Value returned by `SqliteStore::corpus_revision`.
pub corpus_revision: u64,
}
/// Store counts reported in a [`SchemaV1`] report.
///
/// `collect_stats` copies every field from the `CountSummary` returned by
/// `SqliteStore::count_summary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Stats {
/// Copied from `CountSummary.doc_count`.
pub doc_count: u64,
/// Copied from `CountSummary.chunk_count`.
pub chunk_count: u64,
/// Copied from `CountSummary.asset_count`.
pub asset_count: u64,
/// Copied from `CountSummary.last_ingest_at`; `None` when the store
/// reports no timestamp.
pub last_ingest_at: Option<String>,
}
/// Version string of this crate, taken from Cargo's `CARGO_PKG_VERSION`
/// environment variable at compile time.
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
/// Wire-schema names that `schema_with_config` reports in
/// `SchemaV1.wire.schemas`, in the order listed here.
///
/// NOTE(review): the list is a hand-maintained literal; presumably a new
/// wire schema has to be added here to be advertised — confirm.
const WIRE_SCHEMAS: &[&str] = &[
"answer.v1",
"search_hit.v1",
"doc_summary.v1",
"chunk_inspection.v1",
"doctor.v1",
"ingest_report.v1",
"ingest_progress.v1",
"reset_report.v1",
"citation.v1",
"schema.v1",
"error.v1",
];
/// Assemble the [`SchemaV1`] introspection report for `cfg`.
///
/// The SQLite store is opened through
/// [`kebab_store_sqlite::SqliteStore::open_existing`], which opens it
/// read-only so that the caller (kebab-cli) needs no write access to the
/// data dir. Stats are read first, then the model/version labels.
///
/// # Errors
///
/// Fails with [`kebab_store_sqlite::NotIndexed`] (wrapped in `anyhow`) when
/// the database file does not exist; the CLI maps that to an `error.v1` /
/// `"not_indexed"` wire record. Errors raised while gathering the stats are
/// propagated as well.
#[doc(hidden)]
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
    let store = open_store_for_stats(cfg)?;
    let stats = collect_stats(&store)?;
    let models = collect_models(cfg, &store);

    // Owned copies of the static schema names, in declaration order.
    let wire = WireBlock {
        schemas: WIRE_SCHEMAS.iter().copied().map(String::from).collect(),
    };

    Ok(SchemaV1 {
        schema_version: String::from("schema.v1"),
        kebab_version: String::from(KEBAB_VERSION),
        wire,
        capabilities: capabilities_snapshot(),
        models,
        stats,
    })
}
/// Report the fixed set of feature flags for this build.
///
/// Every value is a constant: flags bound to `AVAILABLE` are `true`, flags
/// bound to `NOT_AVAILABLE` are `false`.
fn capabilities_snapshot() -> Capabilities {
    const AVAILABLE: bool = true;
    const NOT_AVAILABLE: bool = false;

    Capabilities {
        json_mode: AVAILABLE,
        ingest_progress: AVAILABLE,
        ingest_cancellation: AVAILABLE,
        rag_multi_turn: AVAILABLE,
        search_cache: AVAILABLE,
        incremental_ingest: AVAILABLE,
        streaming_ask: NOT_AVAILABLE,
        http_daemon: NOT_AVAILABLE,
        mcp_server: NOT_AVAILABLE,
        single_file_ingest: NOT_AVAILABLE,
    }
}
/// Open the already-existing SQLite database under the configured data dir.
///
/// The directory is resolved with
/// `kebab_config::expand_path(&cfg.storage.data_dir, "")` (tilde and env-var
/// expansion), mirroring the resolution done in `SqliteStore::open`. Any
/// error from `SqliteStore::open_existing` is returned unchanged.
fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::SqliteStore> {
    // "kebab.sqlite" is the canonical database file name defined in
    // kebab-store-sqlite (its SQLITE_FILE).
    let db_path = kebab_config::expand_path(&cfg.storage.data_dir, "").join("kebab.sqlite");
    kebab_store_sqlite::SqliteStore::open_existing(&db_path)
}
/// Convert the store's `count_summary()` result into the wire-level
/// [`Stats`] block, field for field.
///
/// Any error from `count_summary()` is passed through untouched.
fn collect_stats(store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
    store.count_summary().map(|summary| Stats {
        doc_count: summary.doc_count,
        chunk_count: summary.chunk_count,
        asset_count: summary.asset_count,
        last_ingest_at: summary.last_ingest_at,
    })
}
/// Gather the version labels for the `models` block of the report.
///
/// Parser and index versions come from crate constants, chunker / embedding
/// / prompt-template versions from `cfg`, and the corpus revision from the
/// store.
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
    let parser_version = String::from(kebab_parse_md::PARSER_VERSION);
    let chunker_version = cfg.chunking.chunker_version.clone();
    // The embedding config names this field `.model` (the plan called it `.id`).
    let embedding_version = cfg.models.embedding.model.clone();
    let prompt_template_version = cfg.rag.prompt_template_version.clone();
    let index_version = String::from(kebab_store_vector::INDEX_VERSION_STR);
    // `corpus_revision()` yields a plain `u64` (no `Result`); 0 is treated
    // as the default for a fresh/unrevised store.
    let corpus_revision = store.corpus_revision();

    Models {
        parser_version,
        chunker_version,
        embedding_version,
        prompt_template_version,
        index_version,
        corpus_revision,
    }
}

View File

@@ -19,3 +19,10 @@ pub mod frontmatter;
pub use blocks::parse_blocks;
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
/// Parser-version label for Markdown files ingested through this crate.
/// Exposed as a public constant so `kebab-app::schema_with_config` can embed
/// it in `SchemaV1.models.parser_version`.
///
/// Must be kept in sync by hand with `KEBAB_PARSE_MD_VERSION` in
/// `kebab-app/src/lib.rs`.
/// NOTE(review): that constant appears to carry its own copy of the literal;
/// consider having it reference this one so the two cannot drift — confirm.
pub const PARSER_VERSION: &str = "md-frontmatter-v2";

View File

@@ -34,4 +34,4 @@ pub use error::StoreError;
pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow};
pub use fts::rebuild_chunks_fts;
pub use jobs::IngestRunRow;
pub use store::{NotIndexed, SqliteStore};
pub use store::{CountSummary, NotIndexed, SqliteStore};

View File

@@ -591,6 +591,61 @@ pub(crate) fn upsert_asset_row(
Ok(())
}
/// p9-fb-27: aggregate counts for `SchemaV1.stats` block.
///
/// Returned by [`SqliteStore::count_summary`] and consumed by
/// `kebab-app::schema_with_config` to populate the `stats` sub-object of the
/// `schema.v1` wire record.
#[derive(Debug, Clone)]
pub struct CountSummary {
/// Number of rows in the `documents` table.
pub doc_count: u64,
/// Number of rows in the `chunks` table.
pub chunk_count: u64,
/// Number of rows in the `assets` table.
pub asset_count: u64,
/// Largest `documents.updated_at` value, i.e. the timestamp of the
/// most-recently updated document row, or `None` when the store is empty
/// (the `MAX` aggregate is NULL).
/// NOTE(review): presumably an ISO-8601 string — confirm against the code
/// that writes `updated_at`.
pub last_ingest_at: Option<String>,
}
impl SqliteStore {
    /// Return aggregate counts from the three primary tables plus the
    /// most-recent `documents.updated_at` timestamp.
    ///
    /// All four values are fetched with a single `SELECT` made of scalar
    /// subqueries. Issuing them as separate statements would let a commit
    /// land in between and produce counts that do not belong to the same
    /// database state; one statement reads one snapshot.
    ///
    /// Uses `read_conn()` (no mutations) — mirrors the pattern used by
    /// [`Self::corpus_revision`].
    ///
    /// # Errors
    ///
    /// Returns the underlying SQLite error, with the context
    /// `"count summary"`, if the query fails or a column cannot be converted.
    pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
        let conn = self.read_conn();
        // An aggregate query without GROUP BY always yields exactly one row,
        // so no "no rows" handling is needed; `MAX` over an empty table is
        // NULL, which maps to `None` for `last_ingest_at`.
        let summary = conn
            .query_row(
                "SELECT \
                 (SELECT COUNT(*) FROM documents), \
                 (SELECT COUNT(*) FROM chunks), \
                 (SELECT COUNT(*) FROM assets), \
                 (SELECT MAX(updated_at) FROM documents)",
                [],
                |row| {
                    Ok(CountSummary {
                        doc_count: row.get(0)?,
                        chunk_count: row.get(1)?,
                        asset_count: row.get(2)?,
                        last_ingest_at: row.get(3)?,
                    })
                },
            )
            .context("count summary")?;
        Ok(summary)
    }
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.
/// Note: WAL is persistent (the journal-mode setting is sticky in the DB
/// header) but `foreign_keys`, `synchronous`, and `temp_store` are
@@ -605,3 +660,27 @@ fn apply_pragmas(conn: &Connection) -> Result<()> {
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a migrated store inside a fresh temporary directory.
    ///
    /// The `TempDir` is handed back so the caller keeps the directory alive
    /// for as long as the store is in use.
    fn open_fresh_store() -> (tempfile::TempDir, SqliteStore) {
        let tmp = tempfile::tempdir().unwrap();
        let cfg = {
            let mut c = kebab_config::Config::defaults();
            c.storage.data_dir = tmp.path().to_string_lossy().into_owned();
            c
        };
        let fresh = SqliteStore::open(&cfg).unwrap();
        fresh.run_migrations().unwrap();
        (tmp, fresh)
    }

    /// A store with migrations applied but no data reports zero rows and no
    /// last-ingest timestamp.
    #[test]
    fn count_summary_zero_on_fresh_store() {
        let (_dir, store) = open_fresh_store();
        let summary = store.count_summary().unwrap();
        assert_eq!(
            (summary.doc_count, summary.chunk_count, summary.asset_count),
            (0, 0, 0)
        );
        assert_eq!(summary.last_ingest_at, None);
    }
}

View File

@@ -28,4 +28,4 @@ mod arrow_batch;
mod paths;
mod store;
pub use store::LanceVectorStore;
pub use store::{INDEX_VERSION_STR, LanceVectorStore};

View File

@@ -44,6 +44,12 @@ const INDEX_KIND: &str = "flat";
/// `v1` so re-runs produce stable IDs.
const INDEX_VERSION: &str = "v1";
/// Public alias of the private `INDEX_VERSION` constant.
///
/// Carries the identical string value. It is exposed as `pub const` so that
/// `kebab-app::schema_with_config` can embed it in
/// `SchemaV1.models.index_version` without reaching into a private constant.
pub const INDEX_VERSION_STR: &str = INDEX_VERSION;
/// Lance VectorStore.
///
/// Holds a single `lancedb::Connection` opened against