feat(stats): media/lang/bytes/stale fields on schema.v1.stats (fb-37)

Extends CountSummary with media_breakdown, lang_breakdown, stale_doc_count
fields populated via stats_ext::breakdowns(). Adds count_summary_with_threshold
for callers that need real stale counts. Mirrors all new fields onto the
wire-bound Stats struct in kebab-app::schema with #[serde(default)] for
backwards-compat. Also fixes search_budget_integration.rs for the trace field
added to SearchOpts in Task 1.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
th-kim0823
2026-05-10 12:30:19 +09:00
parent 69c6e23432
commit 231d80e82d
3 changed files with 96 additions and 13 deletions

View File

@@ -50,6 +50,18 @@ pub struct Stats {
pub chunk_count: u64,
pub asset_count: u64,
pub last_ingest_at: Option<String>,
/// p9-fb-37: per-media-kind doc count (5 keys, zero-padded).
#[serde(default)]
pub media_breakdown: std::collections::BTreeMap<String, u64>,
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
#[serde(default)]
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
/// p9-fb-37: on-disk byte sums.
#[serde(default)]
pub index_bytes: kebab_core::IndexBytes,
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
#[serde(default)]
pub stale_doc_count: u64,
}
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
@@ -85,7 +97,7 @@ const WIRE_SCHEMAS: &[&str] = &[
#[doc(hidden)]
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
let store = open_store_for_stats(cfg)?;
let stats = collect_stats(&store)?;
let stats = collect_stats(cfg, &store)?;
let models = collect_models(cfg, &store);
Ok(SchemaV1 {
schema_version: SCHEMA_V1_ID.to_string(),
@@ -124,13 +136,24 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
}
fn collect_stats(store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
let counts = store.count_summary()?;
fn collect_stats(
cfg: &Config,
store: &kebab_store_sqlite::SqliteStore,
) -> anyhow::Result<Stats> {
let counts = store
.count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?;
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
Ok(Stats {
doc_count: counts.doc_count,
chunk_count: counts.chunk_count,
asset_count: counts.asset_count,
last_ingest_at: counts.last_ingest_at,
media_breakdown: counts.media_breakdown,
lang_breakdown: counts.lang_breakdown,
index_bytes,
stale_doc_count: counts.stale_doc_count,
})
}
@@ -150,3 +173,31 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
corpus_revision: store.corpus_revision(),
}
}
#[cfg(test)]
mod tests_stats_ext {
use super::*;
#[test]
fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
let dir = tempfile::tempdir().unwrap();
let mut cfg = kebab_config::Config::defaults();
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
// Bring up migrations so the sqlite file is created.
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
store.run_migrations().unwrap();
drop(store);
let s = schema_with_config(&cfg).unwrap();
// 5 keys padded.
assert_eq!(s.stats.media_breakdown.len(), 5);
assert_eq!(s.stats.media_breakdown.get("markdown"), Some(&0));
assert_eq!(s.stats.media_breakdown.get("pdf"), Some(&0));
// lang map empty on empty corpus.
assert!(s.stats.lang_breakdown.is_empty());
// sqlite bytes positive after migrations, lancedb 0.
assert!(s.stats.index_bytes.sqlite > 0);
assert_eq!(s.stats.index_bytes.lancedb, 0);
assert_eq!(s.stats.stale_doc_count, 0);
}
}

View File

@@ -47,6 +47,7 @@ fn budget_truncates_snippets_when_below_threshold() {
max_tokens: Some(50),
snippet_chars: None,
cursor: None,
trace: false,
},
)
.unwrap();
@@ -78,6 +79,7 @@ fn cursor_paginates_to_next_page() {
max_tokens: None,
snippet_chars: None,
cursor: Some(cursor),
trace: false,
},
)
.unwrap();
@@ -114,6 +116,7 @@ fn cursor_rejected_after_corpus_revision_bump() {
max_tokens: None,
snippet_chars: None,
cursor: Some(c),
trace: false,
},
);
let err = result.unwrap_err();
@@ -147,6 +150,7 @@ fn max_tokens_zero_returns_one_hit_truncated() {
max_tokens: Some(0),
snippet_chars: None,
cursor: None,
trace: false,
},
)
.unwrap();

View File

@@ -604,6 +604,12 @@ pub struct CountSummary {
/// ISO-8601 timestamp of the most-recently updated document row, or
/// `None` when the store is empty.
pub last_ingest_at: Option<String>,
/// p9-fb-37: per-media-kind doc count (5 keys, zero-padded).
pub media_breakdown: std::collections::BTreeMap<String, u64>,
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
/// p9-fb-37: docs whose `updated_at < now - threshold_days`. 0 when threshold=0.
pub stale_doc_count: u64,
}
impl SqliteStore {
@@ -611,39 +617,58 @@ impl SqliteStore {
/// most-recent `documents.updated_at` timestamp.
///
/// Uses `read_conn()` (no mutations) — mirrors the pattern used by
/// [`Self::corpus_revision`].
pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
/// Shared helper: counts and breakdowns in a single pass with given threshold.
fn count_summary_inner(&self, threshold_days: u64) -> anyhow::Result<CountSummary> {
use anyhow::Context;
use rusqlite::OptionalExtension;
let conn = self.read_conn();
let doc_count: u64 = conn
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
.context("count documents")?;
let chunk_count: u64 = conn
.query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
.context("count chunks")?;
let asset_count: u64 = conn
.query_row("SELECT COUNT(*) FROM assets", [], |r| r.get(0))
.context("count assets")?;
let last_ingest_at: Option<String> = conn
.query_row(
"SELECT MAX(updated_at) FROM documents",
[],
|r| r.get(0),
)
.query_row("SELECT MAX(updated_at) FROM documents", [], |r| r.get(0))
.optional()
.context("max updated_at")?
.flatten();
let bd = crate::stats_ext::breakdowns(&conn, threshold_days).context("breakdowns")?;
Ok(CountSummary {
doc_count,
chunk_count,
asset_count,
last_ingest_at,
media_breakdown: bd.media,
lang_breakdown: bd.lang,
stale_doc_count: bd.stale_doc_count,
})
}
/// [`Self::corpus_revision`].
pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
// p9-fb-37: default uses threshold_days=0 (matches fb-32 disable
// semantics). Callers that need real stale_doc_count call
// count_summary_with_threshold.
self.count_summary_inner(0)
}
/// p9-fb-37: variant that honors `config.search.stale_threshold_days`.
/// Callers who need a meaningful `stale_doc_count` (e.g. `kebab schema`)
/// pass the configured threshold; the older `count_summary` returns 0.
pub fn count_summary_with_threshold(
&self,
threshold_days: u64,
) -> anyhow::Result<CountSummary> {
self.count_summary_inner(threshold_days)
}
}
/// Apply the design §5 / task-spec pragmas. Called once per connection.
@@ -681,6 +706,9 @@ mod tests {
assert_eq!(s.chunk_count, 0);
assert_eq!(s.asset_count, 0);
assert!(s.last_ingest_at.is_none());
assert_eq!(s.media_breakdown.len(), 5);
assert!(s.lang_breakdown.is_empty());
assert_eq!(s.stale_doc_count, 0);
}
}