From dfef65f196f4eb7170741dd7b9c9469f0d7d6a9e Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sat, 9 May 2026 01:30:10 +0900 Subject: [PATCH] feat(app): staleness module + post-process search hits (fb-32) compute_stale: strict > boundary, threshold=0 disables, future timestamps treated as fresh (clock skew safety). App::search re-stamps on cache hit so config threshold changes take effect without flushing the cache. Also unblocks the workspace build by plugging placeholder indexed_at/stale into the two AnswerCitation construction sites in kebab-rag/pipeline.rs (the score-gate refusal path forwards from SearchHit; the LLM-citation path uses UNIX_EPOCH/false until Task 7 wires the real values through pack_context). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/app.rs | 35 ++++++-- crates/kebab-app/src/lib.rs | 2 + crates/kebab-app/src/staleness.rs | 74 ++++++++++++++++ crates/kebab-app/tests/common/mod.rs | 23 +++++ .../tests/search_stale_integration.rs | 87 +++++++++++++++++++ crates/kebab-rag/src/pipeline.rs | 9 ++ 6 files changed, 224 insertions(+), 6 deletions(-) create mode 100644 crates/kebab-app/src/staleness.rs create mode 100644 crates/kebab-app/tests/search_stale_integration.rs diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index e538efe..34ae3f0 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -190,7 +190,21 @@ impl App { corpus_revision = key.corpus_revision, "search served from LRU cache" ); - return Ok(hits.clone()); + // p9-fb-32: re-stamp staleness on every cache hit. The cache + // entry was stamped at insert time against an older `now` + // and an older threshold; if either has shifted (config + // reload, time passing) the cached `stale: false` may now + // be wrong. Re-stamping is cheap (per-hit comparison) and + // avoids invalidating the cache on threshold changes. 
+ let mut hits = hits.clone(); + drop(guard); + let now = time::OffsetDateTime::now_utc(); + crate::staleness::mark_stale_in_place( + &mut hits, + now, + self.config.search.stale_threshold_days, + ); + return Ok(hits); } // Drop the lock before the (potentially slow) retriever call // so other in-flight searches can use the cache concurrently. @@ -205,14 +219,14 @@ impl App { /// Used by `--no-cache` CLI invocations and by `search` itself /// on cache miss. Identical behavior to the pre-fb-19 `search`. pub fn search_uncached(&self, query: SearchQuery) -> Result> { - match query.mode { + let mut hits = match query.mode { SearchMode::Lexical => { let lex = LexicalRetriever::with_settings( self.sqlite.clone(), lexical_index_version(&self.config), self.config.search.snippet_chars, ); - lex.search(&query) + lex.search(&query)? } SearchMode::Vector => { let (emb, vec_store) = self.require_embeddings()?; @@ -226,7 +240,7 @@ impl App { vec_iv, self.config.search.snippet_chars, ); - retr.search(&query) + retr.search(&query)? } SearchMode::Hybrid => { let lex = Arc::new(LexicalRetriever::with_settings( @@ -246,9 +260,18 @@ impl App { self.config.search.snippet_chars, )) as Arc; let hybrid = HybridRetriever::new(&self.config, lex, vec_retr); - hybrid.search(&query) + hybrid.search(&query)? } - } + }; + // p9-fb-32: stamp staleness against the freshest possible `now` + // and the current threshold. Cheap (per-hit comparison). + let now = time::OffsetDateTime::now_utc(); + crate::staleness::mark_stale_in_place( + &mut hits, + now, + self.config.search.stale_threshold_days, + ); + Ok(hits) } /// Run a RAG `ask` against the configured retriever + LLM. 
Reuses diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index fa5b242..602fdaf 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -63,12 +63,14 @@ pub mod ingest_progress; pub mod logging; pub mod reset; pub mod schema; +mod staleness; pub use app::App; pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown}; pub use reset::{ResetReport, ResetScope}; pub use error_wire::{ERROR_V1_ID, ErrorV1, classify}; pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config}; +pub use staleness::{compute_stale, mark_stale_in_place}; /// p9-fb-25: sentinel for files without an extension in /// `IngestReport.skipped_by_extension` keys + `IngestItem.warnings` diff --git a/crates/kebab-app/src/staleness.rs b/crates/kebab-app/src/staleness.rs new file mode 100644 index 0000000..b60fa74 --- /dev/null +++ b/crates/kebab-app/src/staleness.rs @@ -0,0 +1,74 @@ +//! p9-fb-32 staleness helpers. + +use time::{Duration, OffsetDateTime}; + +use kebab_core::SearchHit; + +/// Returns `true` iff `now - indexed_at > threshold_days * 24h`. +/// `threshold_days = 0` always returns `false` (feature disabled). +/// Strict `>` so that exactly `threshold_days` old returns `false`. +pub fn compute_stale( + indexed_at: OffsetDateTime, + now: OffsetDateTime, + threshold_days: u32, +) -> bool { + if threshold_days == 0 { + return false; + } + let threshold = Duration::days(i64::from(threshold_days)); + (now - indexed_at) > threshold +} + +/// Sets `stale` on each hit in place using `compute_stale`. 
+pub fn mark_stale_in_place( + hits: &mut [SearchHit], + now: OffsetDateTime, + threshold_days: u32, +) { + for h in hits { + h.stale = compute_stale(h.indexed_at, now, threshold_days); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use time::macros::datetime; + + fn now() -> OffsetDateTime { + datetime!(2026-05-09 12:00:00 UTC) + } + + #[test] + fn threshold_zero_always_fresh() { + let very_old = datetime!(2020-01-01 00:00:00 UTC); + assert!(!compute_stale(very_old, now(), 0)); + } + + #[test] + fn just_under_threshold_is_fresh() { + // 29 days, 23h, 59m old — under 30d. + let indexed = now() - Duration::days(29) - Duration::hours(23) - Duration::minutes(59); + assert!(!compute_stale(indexed, now(), 30)); + } + + #[test] + fn exactly_threshold_is_fresh() { + // strict `>` boundary: exactly 30d old is still fresh. + let indexed = now() - Duration::days(30); + assert!(!compute_stale(indexed, now(), 30)); + } + + #[test] + fn one_minute_past_threshold_is_stale() { + let indexed = now() - Duration::days(30) - Duration::minutes(1); + assert!(compute_stale(indexed, now(), 30)); + } + + #[test] + fn future_indexed_at_is_fresh() { + // clock skew safety: future timestamps must not be stale. + let future = now() + Duration::hours(1); + assert!(!compute_stale(future, now(), 30)); + } +} diff --git a/crates/kebab-app/tests/common/mod.rs b/crates/kebab-app/tests/common/mod.rs index 9195457..ce2a28f 100644 --- a/crates/kebab-app/tests/common/mod.rs +++ b/crates/kebab-app/tests/common/mod.rs @@ -94,6 +94,29 @@ pub fn lexical_query(text: &str) -> kebab_core::SearchQuery { } } +/// p9-fb-32: rewrite `documents.updated_at` for one workspace path +/// to `now - days_ago` (RFC3339 UTC). Used by staleness integration +/// tests to simulate aged-out docs without faking system time. Caller +/// is responsible for ingesting the doc *before* calling this — the +/// row must already exist. 
+pub fn backdate_document_updated_at(env: &TestEnv, workspace_path: &str, days_ago: i64) { + let backdated = (time::OffsetDateTime::now_utc() - time::Duration::days(days_ago)) + .format(&time::format_description::well_known::Rfc3339) + .expect("format backdated updated_at"); + let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite"); + let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite"); + let updated = conn + .execute( + "UPDATE documents SET updated_at = ?1 WHERE workspace_path = ?2", + rusqlite::params![backdated, workspace_path], + ) + .expect("UPDATE documents.updated_at"); + assert_eq!( + updated, 1, + "backdate_document_updated_at: expected to update exactly 1 row for {workspace_path}, got {updated}" + ); +} + fn copy_fixture_workspace(dest: &Path) { let src = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("tests") diff --git a/crates/kebab-app/tests/search_stale_integration.rs b/crates/kebab-app/tests/search_stale_integration.rs new file mode 100644 index 0000000..dff1639 --- /dev/null +++ b/crates/kebab-app/tests/search_stale_integration.rs @@ -0,0 +1,87 @@ +//! p9-fb-32: `App::search` end-to-end staleness wiring. +//! +//! `compute_stale` itself is unit-tested in `kebab_app::staleness`; this +//! file proves the post-process actually fires through the full +//! retriever stack and that the cache-hit re-stamp respects the +//! configured threshold. +//! +//! All three tests run lexical-only (no AVX, no fastembed download). + +mod common; + +use common::TestEnv; + +fn lexical_query_owner() -> kebab_core::SearchQuery { + common::lexical_query("ownership") +} + +/// Fresh ingest at default 30-day threshold → no hit can be stale. +/// `documents.updated_at` is stamped at ingest time (now), so the +/// distance to `now_utc()` is sub-second. 
+#[test] +fn fresh_doc_is_not_stale_with_default_threshold() { + let env = TestEnv::lexical_only(); + kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + + let app = kebab_app::App::open_with_config(env.config.clone()).unwrap(); + let hits = app.search(lexical_query_owner()).unwrap(); + assert!(!hits.is_empty(), "expected ≥1 hit for 'ownership'"); + assert!( + hits.iter().all(|h| !h.stale), + "freshly-ingested doc must not be stale at default 30d threshold: {:?}", + hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::>() + ); +} + +/// `stale_threshold_days = 0` disables the feature even for very old +/// `documents.updated_at`. Backdate the row to a year ago, expect +/// `stale: false` on every hit. +#[test] +fn threshold_zero_disables_staleness() { + let mut env = TestEnv::lexical_only(); + env.config.search.stale_threshold_days = 0; + + kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + common::backdate_document_updated_at(&env, "intro.md", 365); + + let app = kebab_app::App::open_with_config(env.config.clone()).unwrap(); + let hits = app.search(lexical_query_owner()).unwrap(); + assert!(!hits.is_empty(), "expected ≥1 hit"); + assert!( + hits.iter().all(|h| !h.stale), + "threshold=0 disables staleness even for year-old docs: {:?}", + hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::>() + ); +} + +/// At a 30-day threshold, a 60-day-old `documents.updated_at` must +/// surface as stale on the matching hit. (Other hits — fresh fixtures +/// not backdated — stay fresh, so we filter to the `intro.md` hits and assert `all` of those are stale.) 
+#[test] +fn old_doc_marked_stale() { + let mut env = TestEnv::lexical_only(); + env.config.search.stale_threshold_days = 30; + + kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + common::backdate_document_updated_at(&env, "intro.md", 60); + + let app = kebab_app::App::open_with_config(env.config.clone()).unwrap(); + let hits = app.search(lexical_query_owner()).unwrap(); + assert!(!hits.is_empty(), "expected ≥1 hit"); + let intro_hits: Vec<&kebab_core::SearchHit> = hits + .iter() + .filter(|h| h.doc_path.0.ends_with("intro.md")) + .collect(); + assert!( + !intro_hits.is_empty(), + "expected ≥1 hit on intro.md (the backdated doc)" + ); + assert!( + intro_hits.iter().all(|h| h.stale), + "60-day-old intro.md must be stale at 30d threshold: {:?}", + intro_hits + .iter() + .map(|h| (h.doc_path.0.clone(), h.stale)) + .collect::>() + ); +} diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs index 8c39c44..5f93ec3 100644 --- a/crates/kebab-rag/src/pipeline.rs +++ b/crates/kebab-rag/src/pipeline.rs @@ -343,6 +343,10 @@ impl RagPipeline { // `AnswerCitation.marker` strips the `#`. marker: Some(format!("[{n}]")), citation: c.clone(), + // p9-fb-32: placeholder — Task 7 owns wiring real + // indexed_at / stale here from the underlying SearchHit. + indexed_at: time::OffsetDateTime::UNIX_EPOCH, + stale: false, }) .collect(); @@ -560,6 +564,11 @@ impl RagPipeline { .map(|h| AnswerCitation { marker: None, citation: h.citation.clone(), + // p9-fb-32: forward staleness from the underlying + // `SearchHit` directly — this is the score-gate refusal + // path which doesn't go through `pack_context`. + indexed_at: h.indexed_at, + stale: h.stale, }) .collect(); let chunks_returned = u32::try_from(hits.len()).unwrap_or(u32::MAX);