feat(stats): media/lang/bytes/stale fields on schema.v1.stats (fb-37)
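Extends CountSummary with media_breakdown, lang_breakdown, and stale_doc_count fields, populated via stats_ext::breakdowns(). Adds count_summary_with_threshold for callers that need a real stale count; the existing count_summary keeps passing a threshold of 0 and therefore reports stale_doc_count = 0. Mirrors the new fields onto the wire-bound Stats struct in kebab-app::schema and adds index_bytes there as well (computed separately via stats_ext::index_bytes() from the data directory); every new Stats field is marked #[serde(default)] for backwards-compat. Also fixes search_budget_integration.rs for the trace field added to SearchOpts in Task 1. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>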
This commit is contained in:
@@ -50,6 +50,18 @@ pub struct Stats {
|
||||
pub chunk_count: u64,
|
||||
pub asset_count: u64,
|
||||
pub last_ingest_at: Option<String>,
|
||||
/// p9-fb-37: per-media-kind doc count; always 5 keys, kinds with no docs are present with count 0.
|
||||
#[serde(default)]
|
||||
pub media_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
|
||||
#[serde(default)]
|
||||
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: on-disk byte sums.
|
||||
#[serde(default)]
|
||||
pub index_bytes: kebab_core::IndexBytes,
|
||||
/// p9-fb-37: docs whose `updated_at` is older than the staleness threshold (`updated_at < now - threshold_days`).
|
||||
#[serde(default)]
|
||||
pub stale_doc_count: u64,
|
||||
}
|
||||
|
||||
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
@@ -85,7 +97,7 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
#[doc(hidden)]
|
||||
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
|
||||
let store = open_store_for_stats(cfg)?;
|
||||
let stats = collect_stats(&store)?;
|
||||
let stats = collect_stats(cfg, &store)?;
|
||||
let models = collect_models(cfg, &store);
|
||||
Ok(SchemaV1 {
|
||||
schema_version: SCHEMA_V1_ID.to_string(),
|
||||
@@ -124,13 +136,24 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary()?;
|
||||
/// Builds the wire-bound [`Stats`] for the schema report.
///
/// Combines the store's count summary (computed with the configured
/// `search.stale_threshold_days`) with the on-disk index byte sums
/// reported by `stats_ext::index_bytes` for the expanded data directory.
///
/// # Errors
///
/// Propagates any error from `count_summary_with_threshold`; an
/// `index_bytes` failure is re-wrapped with an `index_bytes: ` prefix.
fn collect_stats(
|
||||
cfg: &Config,
|
||||
store: &kebab_store_sqlite::SqliteStore,
|
||||
) -> anyhow::Result<Stats> {
|
||||
// NOTE(review): `as u64` silently wraps if `stale_threshold_days` is a
// signed type holding a negative value — confirm the field's type.
let counts = store
|
||||
.count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?;
|
||||
// NOTE(review): the meaning of the empty second argument to `expand_path`
// is not visible here — presumably a base path; confirm.
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
||||
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
||||
// Counts and breakdowns come from the store; `index_bytes` is the only
// field computed outside of `CountSummary`.
Ok(Stats {
|
||||
doc_count: counts.doc_count,
|
||||
chunk_count: counts.chunk_count,
|
||||
asset_count: counts.asset_count,
|
||||
last_ingest_at: counts.last_ingest_at,
|
||||
media_breakdown: counts.media_breakdown,
|
||||
lang_breakdown: counts.lang_breakdown,
|
||||
index_bytes,
|
||||
stale_doc_count: counts.stale_doc_count,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -150,3 +173,31 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
|
||||
corpus_revision: store.corpus_revision(),
|
||||
}
|
||||
}
|
||||
|
||||
// Tests for the p9-fb-37 stats fields as exposed through `schema_with_config`.
#[cfg(test)]
|
||||
mod tests_stats_ext {
|
||||
use super::*;
|
||||
|
||||
/// On a freshly migrated store with no documents, the schema stats carry
/// the new breakdown, byte-size, and stale-count fields with their
/// empty-corpus values.
#[test]
|
||||
fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
|
||||
// Point the default config at a throwaway data directory.
let dir = tempfile::tempdir().unwrap();
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
|
||||
// Run migrations so the sqlite file exists on disk, then drop the store
// before `schema_with_config` opens it again.
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
// The media map has 5 keys even with no docs; `markdown` and `pdf` are
// checked to be present with count 0.
|
||||
assert_eq!(s.stats.media_breakdown.len(), 5);
|
||||
assert_eq!(s.stats.media_breakdown.get("markdown"), Some(&0));
|
||||
assert_eq!(s.stats.media_breakdown.get("pdf"), Some(&0));
|
||||
// The language map is not padded: it is empty on an empty corpus.
|
||||
assert!(s.stats.lang_breakdown.is_empty());
|
||||
// The sqlite byte count is positive after migrations; the lancedb byte count is 0.
|
||||
assert!(s.stats.index_bytes.sqlite > 0);
|
||||
assert_eq!(s.stats.index_bytes.lancedb, 0);
|
||||
assert_eq!(s.stats.stale_doc_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ fn budget_truncates_snippets_when_below_threshold() {
|
||||
max_tokens: Some(50),
|
||||
snippet_chars: None,
|
||||
cursor: None,
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
@@ -78,6 +79,7 @@ fn cursor_paginates_to_next_page() {
|
||||
max_tokens: None,
|
||||
snippet_chars: None,
|
||||
cursor: Some(cursor),
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
@@ -114,6 +116,7 @@ fn cursor_rejected_after_corpus_revision_bump() {
|
||||
max_tokens: None,
|
||||
snippet_chars: None,
|
||||
cursor: Some(c),
|
||||
trace: false,
|
||||
},
|
||||
);
|
||||
let err = result.unwrap_err();
|
||||
@@ -147,6 +150,7 @@ fn max_tokens_zero_returns_one_hit_truncated() {
|
||||
max_tokens: Some(0),
|
||||
snippet_chars: None,
|
||||
cursor: None,
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -604,6 +604,12 @@ pub struct CountSummary {
|
||||
/// ISO-8601 timestamp of the most-recently updated document row, or
|
||||
/// `None` when the store is empty.
|
||||
pub last_ingest_at: Option<String>,
|
||||
/// p9-fb-37: per-media-kind doc count; always 5 keys, kinds with no docs are present with count 0.
|
||||
pub media_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
|
||||
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: docs whose `updated_at < now - threshold_days`. 0 when threshold=0.
|
||||
pub stale_doc_count: u64,
|
||||
}
|
||||
|
||||
impl SqliteStore {
|
||||
@@ -611,39 +617,58 @@ impl SqliteStore {
|
||||
/// most-recent `documents.updated_at` timestamp.
|
||||
///
|
||||
/// Uses `read_conn()` (no mutations) — mirrors the pattern used by
|
||||
/// [`Self::corpus_revision`].
|
||||
pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
|
||||
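/// Shared helper behind [`Self::count_summary`] and [`Self::count_summary_with_threshold`]: gathers the row counts, the latest `updated_at`, and the breakdowns for the given `threshold_days`.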
|
||||
fn count_summary_inner(&self, threshold_days: u64) -> anyhow::Result<CountSummary> {
|
||||
use anyhow::Context;
|
||||
use rusqlite::OptionalExtension;
|
||||
|
||||
let conn = self.read_conn();
|
||||
|
||||
let doc_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM documents", [], |r| r.get(0))
|
||||
.context("count documents")?;
|
||||
|
||||
let chunk_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
|
||||
.context("count chunks")?;
|
||||
|
||||
let asset_count: u64 = conn
|
||||
.query_row("SELECT COUNT(*) FROM assets", [], |r| r.get(0))
|
||||
.context("count assets")?;
|
||||
|
||||
let last_ingest_at: Option<String> = conn
|
||||
.query_row(
|
||||
"SELECT MAX(updated_at) FROM documents",
|
||||
[],
|
||||
|r| r.get(0),
|
||||
)
|
||||
.query_row("SELECT MAX(updated_at) FROM documents", [], |r| r.get(0))
|
||||
.optional()
|
||||
.context("max updated_at")?
|
||||
.flatten();
|
||||
|
||||
let bd = crate::stats_ext::breakdowns(&conn, threshold_days).context("breakdowns")?;
|
||||
|
||||
Ok(CountSummary {
|
||||
doc_count,
|
||||
chunk_count,
|
||||
asset_count,
|
||||
last_ingest_at,
|
||||
media_breakdown: bd.media,
|
||||
lang_breakdown: bd.lang,
|
||||
stale_doc_count: bd.stale_doc_count,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the document, chunk, and asset counts, the latest ingest
/// timestamp, and the media/language breakdowns for the store.
///
/// Delegates to the shared helper with `threshold_days = 0`, so the
/// returned `stale_doc_count` is 0. Read-only: the helper goes through
/// `read_conn()`, the same pattern as [`Self::corpus_revision`].
|
||||
pub fn count_summary(&self) -> anyhow::Result<CountSummary> {
|
||||
// p9-fb-37: default uses threshold_days=0 (matches fb-32 disable
|
||||
// semantics). Callers that need real stale_doc_count call
|
||||
// count_summary_with_threshold.
|
||||
self.count_summary_inner(0)
|
||||
}
|
||||
|
||||
/// p9-fb-37: variant of [`Self::count_summary`] that takes an explicit
|
||||
/// staleness threshold. Callers who need a meaningful `stale_doc_count`
|
||||
/// (e.g. `kebab schema`) pass `config.search.stale_threshold_days`.
///
/// `threshold_days` is forwarded unchanged to the shared helper; passing 0
/// gives the same result as [`Self::count_summary`], i.e. a stale count of 0.
|
||||
pub fn count_summary_with_threshold(
|
||||
&self,
|
||||
threshold_days: u64,
|
||||
) -> anyhow::Result<CountSummary> {
|
||||
self.count_summary_inner(threshold_days)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the design §5 / task-spec pragmas. Called once per connection.
|
||||
@@ -681,6 +706,9 @@ mod tests {
|
||||
assert_eq!(s.chunk_count, 0);
|
||||
assert_eq!(s.asset_count, 0);
|
||||
assert!(s.last_ingest_at.is_none());
|
||||
assert_eq!(s.media_breakdown.len(), 5);
|
||||
assert!(s.lang_breakdown.is_empty());
|
||||
assert_eq!(s.stale_doc_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user