feat(kebab-app): schema_with_config facade (fb-27)
New `SchemaV1` struct + `schema_with_config(&Config)` builder. Surfaces the wire schemas list, capabilities (current + future placeholders), model versions (parser/chunker/embedding/prompt_template/index/corpus_revision), and stats (doc/chunk/asset counts + last ingest). kebab-store-sqlite gains `count_summary()` to back the stats block.

Deviations from plan:
- `cfg.models.embedding.id` → `cfg.models.embedding.model` (actual field name)
- No `Config::expand_path` method → free fn `kebab_config::expand_path(&cfg.storage.data_dir, "")`
- `PARSER_VERSION` added to `kebab-parse-md/src/lib.rs` (was absent; synced with `KEBAB_PARSE_MD_VERSION` literal in kebab-app)
- `INDEX_VERSION_STR` added to `kebab-store-vector/src/store.rs` + re-exported from `lib.rs` (was a private `const`)
- `corpus_revision()` returns `u64` directly (not `Result<u64>`) — no `?` in collect_models
- `SchemaV1` carries `schema_version: "schema.v1"` field (wire schema convention)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -60,10 +60,12 @@ pub mod error_signal;
|
||||
pub mod ingest_progress;
|
||||
pub mod logging;
|
||||
pub mod reset;
|
||||
pub mod schema;
|
||||
|
||||
pub use app::App;
|
||||
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
|
||||
pub use reset::{ResetReport, ResetScope};
|
||||
pub use schema::{Capabilities, Models, SchemaV1, Stats, WireBlock, schema_with_config};
|
||||
|
||||
/// p9-fb-25: sentinel for files without an extension in
|
||||
/// `IngestReport.skipped_by_extension` keys + `IngestItem.warnings`
|
||||
|
||||
142
crates/kebab-app/src/schema.rs
Normal file
142
crates/kebab-app/src/schema.rs
Normal file
@@ -0,0 +1,142 @@
|
||||
//! `kebab schema` — introspection report. See spec
|
||||
//! `docs/superpowers/specs/2026-05-07-p9-fb-27-introspection-and-error-wire-design.md`.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_config::Config;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SchemaV1 {
|
||||
pub schema_version: String,
|
||||
pub kebab_version: String,
|
||||
pub wire: WireBlock,
|
||||
pub capabilities: Capabilities,
|
||||
pub models: Models,
|
||||
pub stats: Stats,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WireBlock {
|
||||
pub schemas: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Capabilities {
|
||||
pub json_mode: bool,
|
||||
pub ingest_progress: bool,
|
||||
pub ingest_cancellation: bool,
|
||||
pub rag_multi_turn: bool,
|
||||
pub search_cache: bool,
|
||||
pub incremental_ingest: bool,
|
||||
pub streaming_ask: bool,
|
||||
pub http_daemon: bool,
|
||||
pub mcp_server: bool,
|
||||
pub single_file_ingest: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
pub corpus_revision: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
pub doc_count: u64,
|
||||
pub chunk_count: u64,
|
||||
pub asset_count: u64,
|
||||
pub last_ingest_at: Option<String>,
|
||||
}
|
||||
|
||||
// Package version of the crate this file is compiled into, captured by
// Cargo at build time; reported as `SchemaV1.kebab_version`.
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
/// Wire-schema identifiers reported in `SchemaV1.wire.schemas`, in the order
/// they are emitted.
const WIRE_SCHEMAS: &[&str] = &[
    // Query / inspection records.
    "answer.v1",
    "search_hit.v1",
    "doc_summary.v1",
    "chunk_inspection.v1",
    "doctor.v1",
    // Ingest / maintenance records.
    "ingest_report.v1",
    "ingest_progress.v1",
    "reset_report.v1",
    "citation.v1",
    // Introspection and error records.
    "schema.v1",
    "error.v1",
];
|
||||
|
||||
/// Build a [`SchemaV1`] introspection report for the given config.
|
||||
///
|
||||
/// Opens the SQLite store read-only via [`kebab_store_sqlite::SqliteStore::open_existing`]
|
||||
/// so the caller (kebab-cli) does not need write access to the data dir.
|
||||
/// Returns a [`kebab_store_sqlite::NotIndexed`] error (wrapped in `anyhow`)
|
||||
/// if the database file does not exist — the CLI translates that to an
|
||||
/// `error.v1` / `"not_indexed"` wire record.
|
||||
#[doc(hidden)]
|
||||
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
|
||||
let store = open_store_for_stats(cfg)?;
|
||||
let stats = collect_stats(&store)?;
|
||||
let models = collect_models(cfg, &store);
|
||||
Ok(SchemaV1 {
|
||||
schema_version: "schema.v1".to_string(),
|
||||
kebab_version: KEBAB_VERSION.to_string(),
|
||||
wire: WireBlock {
|
||||
schemas: WIRE_SCHEMAS.iter().map(|s| (*s).to_string()).collect(),
|
||||
},
|
||||
capabilities: capabilities_snapshot(),
|
||||
models,
|
||||
stats,
|
||||
})
|
||||
}
|
||||
|
||||
fn capabilities_snapshot() -> Capabilities {
|
||||
Capabilities {
|
||||
json_mode: true,
|
||||
ingest_progress: true,
|
||||
ingest_cancellation: true,
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
http_daemon: false,
|
||||
mcp_server: false,
|
||||
single_file_ingest: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::SqliteStore> {
|
||||
// Mirror the data_dir resolution used in SqliteStore::open:
|
||||
// kebab_config::expand_path(&cfg.storage.data_dir, "") resolves tilde
|
||||
// and env vars. The SQLITE_FILE name ("kebab.sqlite") is the canonical
|
||||
// file name defined in kebab-store-sqlite.
|
||||
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let db_path = data_dir.join("kebab.sqlite");
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary()?;
|
||||
Ok(Stats {
|
||||
doc_count: counts.doc_count,
|
||||
chunk_count: counts.chunk_count,
|
||||
asset_count: counts.asset_count,
|
||||
last_ingest_at: counts.last_ingest_at,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
Models {
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
index_version: kebab_store_vector::INDEX_VERSION_STR.to_string(),
|
||||
// corpus_revision returns u64 directly (no Result) — matches
|
||||
// existing impl; treat 0 as the default for a fresh/unrevised store.
|
||||
corpus_revision: store.corpus_revision(),
|
||||
}
|
||||
}
|
||||
@@ -19,3 +19,10 @@ pub mod frontmatter;
|
||||
|
||||
pub use blocks::parse_blocks;
|
||||
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
|
||||
|
||||
/// Parser-version label for Markdown files ingested through this crate.
///
/// Public so that `kebab-app::schema_with_config` can report it as
/// `SchemaV1.models.parser_version`.
///
/// The same value also exists as `KEBAB_PARSE_MD_VERSION` in
/// `kebab-app/src/lib.rs`; the two must be changed together.
pub const PARSER_VERSION: &str = "md-frontmatter-v2";
|
||||
|
||||
@@ -34,4 +34,4 @@ pub use error::StoreError;
|
||||
pub use eval::{EvalQueryResultRecord, EvalRunRecord, EvalRunRow};
|
||||
pub use fts::rebuild_chunks_fts;
|
||||
pub use jobs::IngestRunRow;
|
||||
pub use store::{NotIndexed, SqliteStore};
|
||||
pub use store::{CountSummary, NotIndexed, SqliteStore};
|
||||
|
||||
@@ -591,6 +591,61 @@ pub(crate) fn upsert_asset_row(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// p9-fb-27: aggregate row counts backing the `stats` sub-object of the
/// `schema.v1` wire record.
///
/// Produced by [`SqliteStore::count_summary`]; `kebab-app::schema_with_config`
/// copies the fields into `SchemaV1.stats`.
#[derive(Debug, Clone)]
pub struct CountSummary {
    /// Number of rows in `documents`.
    pub doc_count: u64,
    /// Number of rows in `chunks`.
    pub chunk_count: u64,
    /// Number of rows in `assets`.
    pub asset_count: u64,
    /// ISO-8601 timestamp of the most-recently updated document row, or
    /// `None` when the store is empty.
    pub last_ingest_at: Option<String>,
}
|
||||
|
||||
impl SqliteStore {
|
||||
/// Return aggregate counts from the three primary tables plus the
|
||||
/// most-recent `documents.updated_at` timestamp.
|
||||
///
|
||||
/// Uses `read_conn()` (no mutations) — mirrors the pattern used by
|
||||
/// [`Self::corpus_revision`].
|
||||
pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
|
||||
let conn = self.read_conn();
|
||||
|
||||
let doc_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
|
||||
.context("count documents")?;
|
||||
|
||||
let chunk_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
|
||||
.context("count chunks")?;
|
||||
|
||||
let asset_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM assets", [], |r| r.get(0))
|
||||
.context("count assets")?;
|
||||
|
||||
let last_ingest_at: Option<String> = conn
|
||||
.query_row(
|
||||
"SELECT MAX(updated_at) FROM documents",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.optional()
|
||||
.context("max updated_at")?
|
||||
.flatten();
|
||||
|
||||
Ok(CountSummary {
|
||||
doc_count,
|
||||
chunk_count,
|
||||
asset_count,
|
||||
last_ingest_at,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the design §5 / task-spec pragmas. Called once per connection.
|
||||
/// Note: WAL is persistent (the journal-mode setting is sticky in the DB
|
||||
/// header) but `foreign_keys`, `synchronous`, and `temp_store` are
|
||||
@@ -605,3 +660,27 @@ fn apply_pragmas(conn: &Connection) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Create a migrated store inside a fresh temporary directory.
    ///
    /// The `TempDir` is returned alongside the store so the directory stays
    /// alive for as long as the caller holds it.
    fn open_fresh_store() -> (tempfile::TempDir, SqliteStore) {
        let tmp = tempfile::tempdir().unwrap();

        let mut cfg = kebab_config::Config::defaults();
        cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();

        let store = SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();

        (tmp, store)
    }

    /// A freshly migrated store has no rows and therefore no timestamp.
    #[test]
    fn count_summary_zero_on_fresh_store() {
        let (_dir, store) = open_fresh_store();

        let summary = store.count_summary().unwrap();

        assert_eq!(summary.doc_count, 0);
        assert_eq!(summary.chunk_count, 0);
        assert_eq!(summary.asset_count, 0);
        assert!(summary.last_ingest_at.is_none());
    }
}
|
||||
|
||||
|
||||
@@ -28,4 +28,4 @@ mod arrow_batch;
|
||||
mod paths;
|
||||
mod store;
|
||||
|
||||
pub use store::LanceVectorStore;
|
||||
pub use store::{INDEX_VERSION_STR, LanceVectorStore};
|
||||
|
||||
@@ -44,6 +44,12 @@ const INDEX_KIND: &str = "flat";
|
||||
/// `v1` so re-runs produce stable IDs.
|
||||
const INDEX_VERSION: &str = "v1";
|
||||
|
||||
/// Public view of [`INDEX_VERSION`] for `kebab-app::schema_with_config`.
|
||||
/// The value is the same string — exposed as `pub const` so the schema
|
||||
/// facade can embed it in `SchemaV1.models.index_version` without
|
||||
/// reaching into a private constant.
|
||||
pub const INDEX_VERSION_STR: &str = INDEX_VERSION;
|
||||
|
||||
/// Lance VectorStore.
|
||||
///
|
||||
/// Holds a single `lancedb::Connection` opened against
|
||||
|
||||
Reference in New Issue
Block a user