From 231d80e82d32aae2254539e7bc7ec557e5a675de Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 12:30:19 +0900
Subject: [PATCH] feat(stats): media/lang/bytes/stale fields on schema.v1.stats
(fb-37)
Extends CountSummary with media_breakdown, lang_breakdown, stale_doc_count
fields populated via stats_ext::breakdowns(). Adds count_summary_with_threshold
for callers that need real stale counts. Mirrors the new fields onto the
wire-bound Stats struct in kebab-app::schema, together with index_bytes
(computed by stats_ext::index_bytes()), each marked #[serde(default)] for
backwards-compat. Also fixes search_budget_integration.rs for the trace field
added to SearchOpts in Task 1.
Co-Authored-By: Claude Sonnet 4.6
---
crates/kebab-app/src/schema.rs | 57 ++++++++++++++++++-
.../tests/search_budget_integration.rs | 4 ++
crates/kebab-store-sqlite/src/store.rs | 48 ++++++++++++----
3 files changed, 96 insertions(+), 13 deletions(-)
diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs
index 603b212..46841fb 100644
--- a/crates/kebab-app/src/schema.rs
+++ b/crates/kebab-app/src/schema.rs
@@ -50,6 +50,18 @@ pub struct Stats {
pub chunk_count: u64,
pub asset_count: u64,
pub last_ingest_at: Option,
+ /// p9-fb-37: per-media-kind doc count (always 5 keys; kinds with no docs map to 0).
+ #[serde(default)]
+ pub media_breakdown: std::collections::BTreeMap,
+ /// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
+ #[serde(default)]
+ pub lang_breakdown: std::collections::BTreeMap,
+ /// p9-fb-37: on-disk byte sums.
+ #[serde(default)]
+ pub index_bytes: kebab_core::IndexBytes,
+ /// p9-fb-37: docs whose `updated_at` is older than the configured staleness threshold.
+ #[serde(default)]
+ pub stale_doc_count: u64,
}
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
@@ -85,7 +97,7 @@ const WIRE_SCHEMAS: &[&str] = &[
#[doc(hidden)]
pub fn schema_with_config(cfg: &Config) -> anyhow::Result {
let store = open_store_for_stats(cfg)?;
- let stats = collect_stats(&store)?;
+ let stats = collect_stats(cfg, &store)?;
let models = collect_models(cfg, &store);
Ok(SchemaV1 {
schema_version: SCHEMA_V1_ID.to_string(),
@@ -124,13 +136,24 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result anyhow::Result {
- let counts = store.count_summary()?;
+fn collect_stats(
+ cfg: &Config,
+ store: &kebab_store_sqlite::SqliteStore,
+) -> anyhow::Result {
+ let counts = store
+ .count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?;
+ let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
+ let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
+ .map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
Ok(Stats {
doc_count: counts.doc_count,
chunk_count: counts.chunk_count,
asset_count: counts.asset_count,
last_ingest_at: counts.last_ingest_at,
+ media_breakdown: counts.media_breakdown,
+ lang_breakdown: counts.lang_breakdown,
+ index_bytes,
+ stale_doc_count: counts.stale_doc_count,
})
}
@@ -150,3 +173,31 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
corpus_revision: store.corpus_revision(),
}
}
+
+#[cfg(test)]
+mod tests_stats_ext {
+ use super::*;
+
+ #[test]
+ fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
+ let dir = tempfile::tempdir().unwrap();
+ let mut cfg = kebab_config::Config::defaults();
+ cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
+ // Bring up migrations so the sqlite file is created.
+ let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
+ store.run_migrations().unwrap();
+ drop(store);
+
+ let s = schema_with_config(&cfg).unwrap();
+ // 5 keys padded.
+ assert_eq!(s.stats.media_breakdown.len(), 5);
+ assert_eq!(s.stats.media_breakdown.get("markdown"), Some(&0));
+ assert_eq!(s.stats.media_breakdown.get("pdf"), Some(&0));
+ // lang map empty on empty corpus.
+ assert!(s.stats.lang_breakdown.is_empty());
+ // sqlite bytes positive after migrations, lancedb 0.
+ assert!(s.stats.index_bytes.sqlite > 0);
+ assert_eq!(s.stats.index_bytes.lancedb, 0);
+ assert_eq!(s.stats.stale_doc_count, 0);
+ }
+}
diff --git a/crates/kebab-app/tests/search_budget_integration.rs b/crates/kebab-app/tests/search_budget_integration.rs
index 42ad346..c309b69 100644
--- a/crates/kebab-app/tests/search_budget_integration.rs
+++ b/crates/kebab-app/tests/search_budget_integration.rs
@@ -47,6 +47,7 @@ fn budget_truncates_snippets_when_below_threshold() {
max_tokens: Some(50),
snippet_chars: None,
cursor: None,
+ trace: false,
},
)
.unwrap();
@@ -78,6 +79,7 @@ fn cursor_paginates_to_next_page() {
max_tokens: None,
snippet_chars: None,
cursor: Some(cursor),
+ trace: false,
},
)
.unwrap();
@@ -114,6 +116,7 @@ fn cursor_rejected_after_corpus_revision_bump() {
max_tokens: None,
snippet_chars: None,
cursor: Some(c),
+ trace: false,
},
);
let err = result.unwrap_err();
@@ -147,6 +150,7 @@ fn max_tokens_zero_returns_one_hit_truncated() {
max_tokens: Some(0),
snippet_chars: None,
cursor: None,
+ trace: false,
},
)
.unwrap();
diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs
index 13691b3..57e16da 100644
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -604,6 +604,12 @@ pub struct CountSummary {
/// ISO-8601 timestamp of the most-recently updated document row, or
/// `None` when the store is empty.
pub last_ingest_at: Option,
+ /// p9-fb-37: per-media-kind doc count (always 5 keys; kinds with no docs map to 0).
+ pub media_breakdown: std::collections::BTreeMap,
+ /// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
+ pub lang_breakdown: std::collections::BTreeMap,
+ /// p9-fb-37: docs whose `updated_at < now - threshold_days`. 0 when threshold=0.
+ pub stale_doc_count: u64,
}
impl SqliteStore {
@@ -611,39 +617,58 @@ impl SqliteStore {
/// most-recent `documents.updated_at` timestamp.
///
/// Uses `read_conn()` (no mutations) — mirrors the pattern used by
- /// [`Self::corpus_revision`].
- pub fn count_summary(&self) -> anyhow::Result {
+ /// [`Self::corpus_revision`]. Shared helper; forwards `threshold_days` to `stats_ext::breakdowns`.
+ fn count_summary_inner(&self, threshold_days: u64) -> anyhow::Result {
+ use anyhow::Context;
+ use rusqlite::OptionalExtension;
+
let conn = self.read_conn();
let doc_count: u64 = conn
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
.context("count documents")?;
-
let chunk_count: u64 = conn
.query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
.context("count chunks")?;
-
let asset_count: u64 = conn
.query_row("SELECT COUNT(*) FROM assets", [], |r| r.get(0))
.context("count assets")?;
-
let last_ingest_at: Option = conn
- .query_row(
- "SELECT MAX(updated_at) FROM documents",
- [],
- |r| r.get(0),
- )
+ .query_row("SELECT MAX(updated_at) FROM documents", [], |r| r.get(0))
.optional()
.context("max updated_at")?
.flatten();
+ let bd = crate::stats_ext::breakdowns(&conn, threshold_days).context("breakdowns")?;
+
Ok(CountSummary {
doc_count,
chunk_count,
asset_count,
last_ingest_at,
+ media_breakdown: bd.media,
+ lang_breakdown: bd.lang,
+ stale_doc_count: bd.stale_doc_count,
})
}
+
+ /// Count summary with staleness disabled (`threshold_days = 0`), so `stale_doc_count` is 0.
+ pub fn count_summary(&self) -> anyhow::Result {
+ // p9-fb-37: default uses threshold_days=0 (matches fb-32 disable
+ // semantics). Callers that need real stale_doc_count call
+ // count_summary_with_threshold.
+ self.count_summary_inner(0)
+ }
+
+ /// p9-fb-37: variant that honors `config.search.stale_threshold_days`.
+ /// Callers who need a meaningful `stale_doc_count` (e.g. `kebab schema`)
+ /// pass the configured threshold; the older `count_summary` returns 0.
+ pub fn count_summary_with_threshold(
+ &self,
+ threshold_days: u64,
+ ) -> anyhow::Result {
+ self.count_summary_inner(threshold_days)
+ }
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.
@@ -681,6 +706,9 @@ mod tests {
assert_eq!(s.chunk_count, 0);
assert_eq!(s.asset_count, 0);
assert!(s.last_ingest_at.is_none());
+ assert_eq!(s.media_breakdown.len(), 5);
+ assert!(s.lang_breakdown.is_empty());
+ assert_eq!(s.stale_doc_count, 0);
}
}