From 231d80e82d32aae2254539e7bc7ec557e5a675de Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sun, 10 May 2026 12:30:19 +0900 Subject: [PATCH] feat(stats): media/lang/bytes/stale fields on schema.v1.stats (fb-37) Extends CountSummary with media_breakdown, lang_breakdown, stale_doc_count fields populated via stats_ext::breakdowns(). Adds count_summary_with_threshold for callers that need real stale counts. Mirrors all new fields onto the wire-bound Stats struct in kebab-app::schema with #[serde(default)] for backwards-compat. Also fixes search_budget_integration.rs for the trace field added to SearchOpts in Task 1. Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-app/src/schema.rs | 57 ++++++++++++++++++- .../tests/search_budget_integration.rs | 4 ++ crates/kebab-store-sqlite/src/store.rs | 48 ++++++++++++---- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index 603b212..46841fb 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -50,6 +50,18 @@ pub struct Stats { pub chunk_count: u64, pub asset_count: u64, pub last_ingest_at: Option, + /// p9-fb-37: per-media-kind doc count (5 keys, zero-padded). + #[serde(default)] + pub media_breakdown: std::collections::BTreeMap, + /// p9-fb-37: per-language doc count, NULL keyed as `"null"`. + #[serde(default)] + pub lang_breakdown: std::collections::BTreeMap, + /// p9-fb-37: on-disk byte sums. + #[serde(default)] + pub index_bytes: kebab_core::IndexBytes, + /// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold. 
+ #[serde(default)] + pub stale_doc_count: u64, } const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -85,7 +97,7 @@ const WIRE_SCHEMAS: &[&str] = &[ #[doc(hidden)] pub fn schema_with_config(cfg: &Config) -> anyhow::Result { let store = open_store_for_stats(cfg)?; - let stats = collect_stats(&store)?; + let stats = collect_stats(cfg, &store)?; let models = collect_models(cfg, &store); Ok(SchemaV1 { schema_version: SCHEMA_V1_ID.to_string(), @@ -124,13 +136,24 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result anyhow::Result { - let counts = store.count_summary()?; +fn collect_stats( + cfg: &Config, + store: &kebab_store_sqlite::SqliteStore, +) -> anyhow::Result { + let counts = store + .count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?; + let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, ""); + let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir) + .map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?; Ok(Stats { doc_count: counts.doc_count, chunk_count: counts.chunk_count, asset_count: counts.asset_count, last_ingest_at: counts.last_ingest_at, + media_breakdown: counts.media_breakdown, + lang_breakdown: counts.lang_breakdown, + index_bytes, + stale_doc_count: counts.stale_doc_count, }) } @@ -150,3 +173,31 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode corpus_revision: store.corpus_revision(), } } + +#[cfg(test)] +mod tests_stats_ext { + use super::*; + + #[test] + fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() { + let dir = tempfile::tempdir().unwrap(); + let mut cfg = kebab_config::Config::defaults(); + cfg.storage.data_dir = dir.path().to_string_lossy().into_owned(); + // Bring up migrations so the sqlite file is created. + let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + drop(store); + + let s = schema_with_config(&cfg).unwrap(); + // 5 keys padded. 
+ assert_eq!(s.stats.media_breakdown.len(), 5); + assert_eq!(s.stats.media_breakdown.get("markdown"), Some(&0)); + assert_eq!(s.stats.media_breakdown.get("pdf"), Some(&0)); + // lang map empty on empty corpus. + assert!(s.stats.lang_breakdown.is_empty()); + // sqlite bytes positive after migrations, lancedb 0. + assert!(s.stats.index_bytes.sqlite > 0); + assert_eq!(s.stats.index_bytes.lancedb, 0); + assert_eq!(s.stats.stale_doc_count, 0); + } +} diff --git a/crates/kebab-app/tests/search_budget_integration.rs b/crates/kebab-app/tests/search_budget_integration.rs index 42ad346..c309b69 100644 --- a/crates/kebab-app/tests/search_budget_integration.rs +++ b/crates/kebab-app/tests/search_budget_integration.rs @@ -47,6 +47,7 @@ fn budget_truncates_snippets_when_below_threshold() { max_tokens: Some(50), snippet_chars: None, cursor: None, + trace: false, }, ) .unwrap(); @@ -78,6 +79,7 @@ fn cursor_paginates_to_next_page() { max_tokens: None, snippet_chars: None, cursor: Some(cursor), + trace: false, }, ) .unwrap(); @@ -114,6 +116,7 @@ fn cursor_rejected_after_corpus_revision_bump() { max_tokens: None, snippet_chars: None, cursor: Some(c), + trace: false, }, ); let err = result.unwrap_err(); @@ -147,6 +150,7 @@ fn max_tokens_zero_returns_one_hit_truncated() { max_tokens: Some(0), snippet_chars: None, cursor: None, + trace: false, }, ) .unwrap(); diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 13691b3..57e16da 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -604,6 +604,12 @@ pub struct CountSummary { /// ISO-8601 timestamp of the most-recently updated document row, or /// `None` when the store is empty. pub last_ingest_at: Option, + /// p9-fb-37: per-media-kind doc count (5 keys, zero-padded). + pub media_breakdown: std::collections::BTreeMap, + /// p9-fb-37: per-language doc count, NULL keyed as `"null"`. 
+ pub lang_breakdown: std::collections::BTreeMap, + /// p9-fb-37: docs whose `updated_at < now - threshold_days`. 0 when threshold=0. + pub stale_doc_count: u64, } impl SqliteStore { @@ -611,39 +617,58 @@ impl SqliteStore { /// most-recent `documents.updated_at` timestamp. /// /// Uses `read_conn()` (no mutations) — mirrors the pattern used by - /// [`Self::corpus_revision`]. - pub fn count_summary(&self) -> anyhow::Result { + /// [`Self::corpus_revision`]. Shared helper: counts and breakdowns in a single pass with the given threshold. + fn count_summary_inner(&self, threshold_days: u64) -> anyhow::Result { + use anyhow::Context; + use rusqlite::OptionalExtension; + let conn = self.read_conn(); let doc_count: u64 = conn .query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0)) .context("count documents")?; - let chunk_count: u64 = conn .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0)) .context("count chunks")?; - let asset_count: u64 = conn .query_row("SELECT COUNT(*) FROM assets", [], |r| r.get(0)) .context("count assets")?; - let last_ingest_at: Option = conn - .query_row( - "SELECT MAX(updated_at) FROM documents", - [], - |r| r.get(0), - ) + .query_row("SELECT MAX(updated_at) FROM documents", [], |r| r.get(0)) .optional() .context("max updated_at")? .flatten(); + let bd = crate::stats_ext::breakdowns(&conn, threshold_days).context("breakdowns")?; + Ok(CountSummary { doc_count, chunk_count, asset_count, last_ingest_at, + media_breakdown: bd.media, + lang_breakdown: bd.lang, + stale_doc_count: bd.stale_doc_count, }) } + + /// Counts and breakdowns with the staleness check disabled (`threshold_days = 0`). + pub fn count_summary(&self) -> anyhow::Result { + // p9-fb-37: default uses threshold_days=0 (matches fb-32 disable + // semantics). Callers that need real stale_doc_count call + // count_summary_with_threshold. + self.count_summary_inner(0) + } + + /// p9-fb-37: variant that honors `config.search.stale_threshold_days`. + /// Callers who need a meaningful `stale_doc_count` (e.g.
`kebab schema`) + /// pass the configured threshold; the older `count_summary` always reports 0 for that field. + pub fn count_summary_with_threshold( + &self, + threshold_days: u64, + ) -> anyhow::Result { + self.count_summary_inner(threshold_days) + } } /// Apply the design §5 / task-spec pragmas. Called once per connection. @@ -681,6 +706,9 @@ mod tests { assert_eq!(s.chunk_count, 0); assert_eq!(s.asset_count, 0); assert!(s.last_ingest_at.is_none()); + assert_eq!(s.media_breakdown.len(), 5); + assert!(s.lang_breakdown.is_empty()); + assert_eq!(s.stale_doc_count, 0); } }