diff --git a/crates/kebab-search/src/vector.rs b/crates/kebab-search/src/vector.rs
index f4d5f8b..bcd6d04 100644
--- a/crates/kebab-search/src/vector.rs
+++ b/crates/kebab-search/src/vector.rs
@@ -197,6 +197,8 @@ struct ChunkMeta {
     chunker_version: String,
     doc_id: String,
     workspace_path: String,
+    /// p9-fb-32: documents.updated_at (RFC3339).
+    updated_at: String,
 }
 
 fn hydrate_chunks(
@@ -222,7 +224,7 @@ fn hydrate_chunks(
         "SELECT \
             c.chunk_id, c.text, c.heading_path_json, c.section_label, \
             c.source_spans_json, c.chunker_version, \
-            c.doc_id, d.workspace_path \
+            c.doc_id, d.workspace_path, d.updated_at \
          FROM chunks c \
          JOIN documents d ON d.doc_id = c.doc_id \
          WHERE c.chunk_id IN ({placeholders})"
@@ -249,6 +251,7 @@ fn hydrate_chunks(
                         chunker_version: row.get(5)?,
                         doc_id: row.get(6)?,
                         workspace_path: row.get(7)?,
+                        updated_at: row.get(8)?,
                     },
                 ))
             },
@@ -287,6 +290,16 @@ fn build_hit(
     );
     let snippet = trim_snippet(&meta.text, snippet_chars);
 
+    // p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001
+    // migration; written by put_document via OffsetDateTime::now_utc).
+    // Mirrors the lexical retriever; see lexical::build_hit for the
+    // shared rationale on incremental-ingest skip semantics.
+    let indexed_at = time::OffsetDateTime::parse(
+        &meta.updated_at,
+        &time::format_description::well_known::Rfc3339,
+    )
+    .context("kb-search vector: parse documents.updated_at as RFC3339")?;
+
     let score = hit.score;
     Ok(SearchHit {
         rank,
@@ -308,9 +321,8 @@ fn build_hit(
         index_version: index_version.clone(),
         embedding_model: Some(model_id.clone()),
         chunker_version: ChunkerVersion(meta.chunker_version.clone()),
-        // p9-fb-32: Task 5 will hydrate from documents.updated_at; this
-        // stub keeps the lib compiling after Task 1 added the field.
-        indexed_at: time::OffsetDateTime::UNIX_EPOCH,
+        indexed_at,
+        // Placeholder — App layer overwrites against config threshold (Task 6).
         stale: false,
     })
 }
diff --git a/crates/kebab-search/tests/hybrid.rs b/crates/kebab-search/tests/hybrid.rs
index fcda0c5..13f945d 100644
--- a/crates/kebab-search/tests/hybrid.rs
+++ b/crates/kebab-search/tests/hybrid.rs
@@ -18,6 +18,7 @@ use kebab_core::{
     Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
 };
 use kebab_search::{FusionPolicy, HybridRetriever};
+use rusqlite::params;
 use serde_json::json;
 
 fn build_hybrid(env: &HybridEnv) -> HybridRetriever {
@@ -211,3 +212,47 @@ fn hybrid_snapshot_run_1() {
         );
     }
 }
+
+#[test]
+#[ignore = "requires AVX-capable hardware (LanceDB)"]
+fn vector_hit_carries_indexed_at() {
+    // p9-fb-32: VectorRetriever must populate SearchHit.indexed_at from
+    // documents.updated_at via the JOIN added to hydrate_chunks (mirrors
+    // the lexical retriever's behavior — Task 5).
+    use time::OffsetDateTime;
+    use time::format_description::well_known::Rfc3339;
+
+    require_avx_or_panic();
+    let env = HybridEnv::new();
+    let _ids = seed_disjoint_corpus(&env);
+
+    // `seed_chunk` hardcodes updated_at='1970-01-01T00:00:00Z'; bump every
+    // document's updated_at to wall-clock now so the ±60s check below is meaningful.
+    // NOTE(review): this UPDATE goes through `read_conn()` — confirm that handle is writable.
+    let now = OffsetDateTime::now_utc();
+    let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339");
+    {
+        let conn = env.sqlite.read_conn();
+        conn.execute(
+            "UPDATE documents SET updated_at = ?",
+            params![now_rfc],
+        )
+        .expect("bump documents.updated_at");
+    }
+
+    let r = env.vector_retriever();
+    let hits = r
+        .search(&SearchQuery {
+            text: "rust".to_string(),
+            mode: SearchMode::Vector,
+            k: 5,
+            filters: SearchFilters::default(),
+        })
+        .expect("vector search");
+    let hit = hits.first().expect("at least one vector hit");
+    let now2 = OffsetDateTime::now_utc();
+    let delta = (now2 - hit.indexed_at).whole_seconds().abs();
+    assert!(delta < 60, "indexed_at within ±60s of now, got {delta}s");
+    // stale is a placeholder set by the retriever; the App layer overwrites.
+    assert!(!hit.stale, "vector retriever must default stale=false");
+}