From dfef65f196f4eb7170741dd7b9c9469f0d7d6a9e Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sat, 9 May 2026 01:30:10 +0900
Subject: [PATCH] feat(app): staleness module + post-process search hits
(fb-32)
compute_stale: strict > boundary, threshold=0 disables, future
timestamps treated as fresh (clock skew safety). App::search
re-stamps on cache hit so config threshold changes take effect
without flushing the cache.
Also unblocks the workspace build by plugging placeholder
indexed_at/stale into the two AnswerCitation construction
sites in kebab-rag/pipeline.rs (the score-gate refusal path
forwards from SearchHit; the LLM-citation path uses
UNIX_EPOCH/false until Task 7 wires the real values through
pack_context).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-app/src/app.rs | 35 ++++++--
crates/kebab-app/src/lib.rs | 2 +
crates/kebab-app/src/staleness.rs | 74 ++++++++++++++++
crates/kebab-app/tests/common/mod.rs | 23 +++++
.../tests/search_stale_integration.rs | 87 +++++++++++++++++++
crates/kebab-rag/src/pipeline.rs | 9 ++
6 files changed, 224 insertions(+), 6 deletions(-)
create mode 100644 crates/kebab-app/src/staleness.rs
create mode 100644 crates/kebab-app/tests/search_stale_integration.rs
diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs
index e538efe..34ae3f0 100644
--- a/crates/kebab-app/src/app.rs
+++ b/crates/kebab-app/src/app.rs
@@ -190,7 +190,21 @@ impl App {
corpus_revision = key.corpus_revision,
"search served from LRU cache"
);
- return Ok(hits.clone());
+ // p9-fb-32: re-stamp staleness on every cache hit. The cache
+ // entry was stamped at insert time against an older `now`
+ // and an older threshold; if either has shifted (config
+ // reload, time passing) the cached `stale: false` may now
+ // be wrong. Re-stamping is cheap (per-hit comparison) and
+ // avoids invalidating the cache on threshold changes.
+ let mut hits = hits.clone();
+ drop(guard);
+ let now = time::OffsetDateTime::now_utc();
+ crate::staleness::mark_stale_in_place(
+ &mut hits,
+ now,
+ self.config.search.stale_threshold_days,
+ );
+ return Ok(hits);
}
// Drop the lock before the (potentially slow) retriever call
// so other in-flight searches can use the cache concurrently.
@@ -205,14 +219,14 @@ impl App {
/// Used by `--no-cache` CLI invocations and by `search` itself
/// on cache miss. Identical behavior to the pre-fb-19 `search`.
pub fn search_uncached(&self, query: SearchQuery) -> Result<Vec<SearchHit>> {
- match query.mode {
+ let mut hits = match query.mode {
SearchMode::Lexical => {
let lex = LexicalRetriever::with_settings(
self.sqlite.clone(),
lexical_index_version(&self.config),
self.config.search.snippet_chars,
);
- lex.search(&query)
+ lex.search(&query)?
}
SearchMode::Vector => {
let (emb, vec_store) = self.require_embeddings()?;
@@ -226,7 +240,7 @@ impl App {
vec_iv,
self.config.search.snippet_chars,
);
- retr.search(&query)
+ retr.search(&query)?
}
SearchMode::Hybrid => {
let lex = Arc::new(LexicalRetriever::with_settings(
@@ -246,9 +260,18 @@ impl App {
self.config.search.snippet_chars,
)) as Arc;
let hybrid = HybridRetriever::new(&self.config, lex, vec_retr);
- hybrid.search(&query)
+ hybrid.search(&query)?
}
- }
+ };
+ // p9-fb-32: stamp staleness against the freshest possible `now`
+ // and the current threshold. Cheap (per-hit comparison).
+ let now = time::OffsetDateTime::now_utc();
+ crate::staleness::mark_stale_in_place(
+ &mut hits,
+ now,
+ self.config.search.stale_threshold_days,
+ );
+ Ok(hits)
}
/// Run a RAG `ask` against the configured retriever + LLM. Reuses
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index fa5b242..602fdaf 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -63,12 +63,14 @@ pub mod ingest_progress;
pub mod logging;
pub mod reset;
pub mod schema;
+mod staleness;
pub use app::App;
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
pub use reset::{ResetReport, ResetScope};
pub use error_wire::{ERROR_V1_ID, ErrorV1, classify};
pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config};
+pub use staleness::{compute_stale, mark_stale_in_place};
/// p9-fb-25: sentinel for files without an extension in
/// `IngestReport.skipped_by_extension` keys + `IngestItem.warnings`
diff --git a/crates/kebab-app/src/staleness.rs b/crates/kebab-app/src/staleness.rs
new file mode 100644
index 0000000..b60fa74
--- /dev/null
+++ b/crates/kebab-app/src/staleness.rs
@@ -0,0 +1,74 @@
+//! p9-fb-32 staleness helpers.
+
+use time::{Duration, OffsetDateTime};
+
+use kebab_core::SearchHit;
+
+/// Reports whether an index timestamp has aged past the staleness threshold.
+/// A `threshold_days` of `0` turns the check off, so the result is always `false`.
+/// The age must strictly exceed the threshold; exactly `threshold_days` old is fresh.
+pub fn compute_stale(
+    indexed_at: OffsetDateTime,
+    now: OffsetDateTime,
+    threshold_days: u32,
+) -> bool {
+    // `0` is the "disabled" setting; a future `indexed_at` gives a negative age (fresh).
+    match threshold_days {
+        0 => false,
+        days => (now - indexed_at) > Duration::days(i64::from(days)),
+    }
+}
+
+/// Re-stamps the `stale` flag of every hit in place, via `compute_stale`.
+pub fn mark_stale_in_place(
+    hits: &mut [SearchHit],
+    now: OffsetDateTime,
+    threshold_days: u32,
+) {
+    hits.iter_mut().for_each(|hit| {
+        hit.stale = compute_stale(hit.indexed_at, now, threshold_days);
+    });
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use time::macros::datetime;
+
+    /// Fixed reference instant, so every case below is deterministic.
+    const NOW: OffsetDateTime = datetime!(2026-05-09 12:00:00 UTC);
+    const THRESHOLD_DAYS: u32 = 30;
+
+    #[test]
+    fn threshold_zero_always_fresh() {
+        let indexed_long_ago = datetime!(2020-01-01 00:00:00 UTC);
+        assert!(!compute_stale(indexed_long_ago, NOW, 0));
+    }
+
+    #[test]
+    fn just_under_threshold_is_fresh() {
+        // One minute short of the 30-day limit (29d 23h 59m old).
+        let age = Duration::days(30) - Duration::minutes(1);
+        assert!(!compute_stale(NOW - age, NOW, THRESHOLD_DAYS));
+    }
+
+    #[test]
+    fn exactly_threshold_is_fresh() {
+        // The comparison is strict, so an age equal to the limit stays fresh.
+        let age = Duration::days(30);
+        assert!(!compute_stale(NOW - age, NOW, THRESHOLD_DAYS));
+    }
+
+    #[test]
+    fn one_minute_past_threshold_is_stale() {
+        let age = Duration::days(30) + Duration::minutes(1);
+        assert!(compute_stale(NOW - age, NOW, THRESHOLD_DAYS));
+    }
+
+    #[test]
+    fn future_indexed_at_is_fresh() {
+        // Clock-skew safety: an index time ahead of `now` must read as fresh.
+        let skewed = NOW + Duration::hours(1);
+        assert!(!compute_stale(skewed, NOW, THRESHOLD_DAYS));
+    }
+}
diff --git a/crates/kebab-app/tests/common/mod.rs b/crates/kebab-app/tests/common/mod.rs
index 9195457..ce2a28f 100644
--- a/crates/kebab-app/tests/common/mod.rs
+++ b/crates/kebab-app/tests/common/mod.rs
@@ -94,6 +94,29 @@ pub fn lexical_query(text: &str) -> kebab_core::SearchQuery {
}
}
+/// p9-fb-32: test helper that sets `documents.updated_at` for the row matching
+/// `workspace_path` to the current UTC time minus `days_ago` days, written as
+/// an RFC3339 string. Staleness integration tests use it to age a document
+/// without faking the system clock. Ingest the document first: the helper
+/// panics unless the `UPDATE` touches exactly one row.
+pub fn backdate_document_updated_at(env: &TestEnv, workspace_path: &str, days_ago: i64) {
+    let target_instant = time::OffsetDateTime::now_utc() - time::Duration::days(days_ago);
+    let stamp = target_instant
+        .format(&time::format_description::well_known::Rfc3339)
+        .expect("format backdated updated_at");
+    let sqlite_file = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
+    let db = rusqlite::Connection::open(&sqlite_file).expect("open kebab.sqlite");
+    let sql = "UPDATE documents SET updated_at = ?1 WHERE workspace_path = ?2";
+    let updated = db
+        .execute(sql, rusqlite::params![stamp, workspace_path])
+        .expect("UPDATE documents.updated_at");
+    // Guard against a missing (or duplicated) row so the test fails loudly.
+    assert_eq!(
+        updated, 1,
+        "backdate_document_updated_at: expected to update exactly 1 row for {workspace_path}, got {updated}"
+    );
+}
+
fn copy_fixture_workspace(dest: &Path) {
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("tests")
diff --git a/crates/kebab-app/tests/search_stale_integration.rs b/crates/kebab-app/tests/search_stale_integration.rs
new file mode 100644
index 0000000..dff1639
--- /dev/null
+++ b/crates/kebab-app/tests/search_stale_integration.rs
@@ -0,0 +1,87 @@
+//! p9-fb-32: `App::search` end-to-end staleness wiring.
+//!
+//! `compute_stale` itself is unit-tested in `kebab_app::staleness`; this
+//! file proves the post-process actually fires through the full
+//! retriever stack and that the cache-hit re-stamp respects the
+//! configured threshold.
+//!
+//! All three tests run lexical-only (no AVX, no fastembed download).
+
+mod common;
+
+use common::TestEnv;
+/// Builds the lexical query for the term "ownership" that every test in this file runs.
+fn lexical_query_owner() -> kebab_core::SearchQuery {
+ common::lexical_query("ownership")
+}
+
+/// A fresh ingest under the default 30-day threshold must yield no stale hit:
+/// `documents.updated_at` is stamped at ingest time, so its distance to
+/// `now_utc()` is sub-second.
+#[test]
+fn fresh_doc_is_not_stale_with_default_threshold() {
+    let env = TestEnv::lexical_only();
+    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+
+    let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
+    let hits = app.search(lexical_query_owner()).unwrap();
+    assert!(!hits.is_empty(), "expected ≥1 hit for 'ownership'");
+    assert!(
+        hits.iter().all(|h| !h.stale),
+        "freshly-ingested doc must not be stale at default 30d threshold: {:?}",
+        hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
+    );
+}
+
+/// Setting `stale_threshold_days = 0` switches staleness off, even when
+/// `documents.updated_at` is very old: the row is backdated by 365 days and
+/// every hit must still come back with `stale: false`.
+#[test]
+fn threshold_zero_disables_staleness() {
+    let mut env = TestEnv::lexical_only();
+    env.config.search.stale_threshold_days = 0;
+
+    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    common::backdate_document_updated_at(&env, "intro.md", 365);
+
+    let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
+    let hits = app.search(lexical_query_owner()).unwrap();
+    assert!(!hits.is_empty(), "expected ≥1 hit");
+    assert!(
+        hits.iter().all(|h| !h.stale),
+        "threshold=0 disables staleness even for year-old docs: {:?}",
+        hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
+    );
+}
+
+/// At a 30-day threshold, a `documents.updated_at` backdated by 60 days must
+/// surface as stale. Only `intro.md` is backdated, so the hits are filtered to
+/// that document and every one of them is required to be stale.
+#[test]
+fn old_doc_marked_stale() {
+    let mut env = TestEnv::lexical_only();
+    env.config.search.stale_threshold_days = 30;
+
+    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
+    common::backdate_document_updated_at(&env, "intro.md", 60);
+
+    let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
+    let hits = app.search(lexical_query_owner()).unwrap();
+    assert!(!hits.is_empty(), "expected ≥1 hit");
+    let intro_hits: Vec<&kebab_core::SearchHit> = hits
+        .iter()
+        .filter(|h| h.doc_path.0.ends_with("intro.md"))
+        .collect();
+    assert!(
+        !intro_hits.is_empty(),
+        "expected ≥1 hit on intro.md (the backdated doc)"
+    );
+    assert!(
+        intro_hits.iter().all(|h| h.stale),
+        "60-day-old intro.md must be stale at 30d threshold: {:?}",
+        intro_hits
+            .iter()
+            .map(|h| (h.doc_path.0.clone(), h.stale))
+            .collect::<Vec<_>>()
+    );
+}
diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs
index 8c39c44..5f93ec3 100644
--- a/crates/kebab-rag/src/pipeline.rs
+++ b/crates/kebab-rag/src/pipeline.rs
@@ -343,6 +343,10 @@ impl RagPipeline {
// `AnswerCitation.marker` strips the `#`.
marker: Some(format!("[{n}]")),
citation: c.clone(),
+ // p9-fb-32: placeholder — Task 7 owns wiring real
+ // indexed_at / stale here from the underlying SearchHit.
+ indexed_at: time::OffsetDateTime::UNIX_EPOCH,
+ stale: false,
})
.collect();
@@ -560,6 +564,11 @@ impl RagPipeline {
.map(|h| AnswerCitation {
marker: None,
citation: h.citation.clone(),
+ // p9-fb-32: forward staleness from the underlying
+ // `SearchHit` directly — this is the score-gate refusal
+ // path which doesn't go through `pack_context`.
+ indexed_at: h.indexed_at,
+ stale: h.stale,
})
.collect();
let chunks_returned = u32::try_from(hits.len()).unwrap_or(u32::MAX);