diff --git a/Cargo.lock b/Cargo.lock index f50ef84..5955be5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3832,6 +3832,7 @@ dependencies = [ "serde_json", "tempfile", "thiserror 2.0.18", + "time", "tracing", ] diff --git a/crates/kebab-search/Cargo.toml b/crates/kebab-search/Cargo.toml index f195b81..f13fd7f 100644 --- a/crates/kebab-search/Cargo.toml +++ b/crates/kebab-search/Cargo.toml @@ -25,6 +25,9 @@ serde_json = { workspace = true } tracing = { workspace = true } thiserror = { workspace = true } anyhow = { workspace = true } +# p9-fb-32: parse documents.updated_at (RFC3339) into OffsetDateTime +# for SearchHit.indexed_at. +time = { workspace = true } [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/kebab-search/src/hybrid.rs b/crates/kebab-search/src/hybrid.rs index 9ebd7de..37cd629 100644 --- a/crates/kebab-search/src/hybrid.rs +++ b/crates/kebab-search/src/hybrid.rs @@ -415,6 +415,10 @@ mod tests { index_version: IndexVersion("v1".to_string()), embedding_model: None, chunker_version: ChunkerVersion("v1".to_string()), + // p9-fb-32: hybrid unit tests don't exercise staleness; pin + // a fixed UNIX_EPOCH so synthetic hits remain deterministic. + indexed_at: time::OffsetDateTime::UNIX_EPOCH, + stale: false, } } diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 71d32c6..2b09c4f 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -244,6 +244,8 @@ struct RawRow { source_spans_json: String, chunker_version: String, workspace_path: String, + /// p9-fb-32: documents.updated_at (RFC3339). + updated_at: String, } /// Build + execute the FTS5 query. The SQL pattern is the one documented @@ -265,7 +267,8 @@ fn run_query( snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \ c.heading_path_json, c.section_label, c.source_spans_json, \ c.chunker_version, \ - d.workspace_path \ + d.workspace_path, \ + d.updated_at \ FROM chunks_fts f \ JOIN chunks c ON c.chunk_id = f.chunk_id \ JOIN documents d ON d.doc_id = f.doc_id", @@ -349,6 +352,7 @@ fn row_from_sql(row: &Row<'_>) -> rusqlite::Result { source_spans_json: row.get(6)?, chunker_version: row.get(7)?, workspace_path: row.get(8)?, + updated_at: row.get(9)?, }) } @@ -382,6 +386,16 @@ fn build_hit( // defensively if SQLite ever returns a longer string. let snippet = trim_snippet(&raw.snippet, snippet_chars); + // p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001 + // migration; written by put_document via OffsetDateTime::now_utc). + // fb-23 incremental ingest's skip path does not call put_document, + // so this naturally reflects the last actual re-process. + let indexed_at = time::OffsetDateTime::parse( + &raw.updated_at, + &time::format_description::well_known::Rfc3339, + ) + .context("kb-search lexical: parse documents.updated_at as RFC3339")?; + Ok(SearchHit { rank, chunk_id: ChunkId(raw.chunk_id), @@ -402,6 +416,9 @@ fn build_hit( index_version: index_version.clone(), embedding_model: None, chunker_version: ChunkerVersion(raw.chunker_version), + indexed_at, + // Placeholder — App layer overwrites against config threshold (Task 6). + stale: false, }) } diff --git a/crates/kebab-search/src/vector.rs b/crates/kebab-search/src/vector.rs index 77ff2a1..f4d5f8b 100644 --- a/crates/kebab-search/src/vector.rs +++ b/crates/kebab-search/src/vector.rs @@ -308,6 +308,10 @@ fn build_hit( index_version: index_version.clone(), embedding_model: Some(model_id.clone()), chunker_version: ChunkerVersion(meta.chunker_version.clone()), + // p9-fb-32: Task 5 will hydrate from documents.updated_at; this + // stub keeps the lib compiling after Task 1 added the field. + indexed_at: time::OffsetDateTime::UNIX_EPOCH, + stale: false, }) } diff --git a/crates/kebab-search/tests/fixtures/search/lexical/run-1.json b/crates/kebab-search/tests/fixtures/search/lexical/run-1.json index 701d2fb..2500cd4 100644 --- a/crates/kebab-search/tests/fixtures/search/lexical/run-1.json +++ b/crates/kebab-search/tests/fixtures/search/lexical/run-1.json @@ -16,6 +16,7 @@ "Snap" ], "index_version": "v1.0", + "indexed_at": "2024-01-01T00:00:00Z", "rank": 1, "retrieval": { "fusion_score": 1.4490997273242101e-6, @@ -26,7 +27,8 @@ "vector_score": null }, "section_label": "Snap", - "snippet": "alpha alpha" + "snippet": "alpha alpha", + "stale": false }, { "chunk_id": "c1000000000000000000000000000000", @@ -45,6 +47,7 @@ "Snap" ], "index_version": "v1.0", + "indexed_at": "2024-01-01T00:00:00Z", "rank": 2, "retrieval": { "fusion_score": 9.641424867368187e-7, @@ -55,6 +58,7 @@ "vector_score": null }, "section_label": "Snap", - "snippet": "alpha bravo charlie" + "snippet": "alpha bravo charlie", + "stale": false } ] \ No newline at end of file diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs index 2939aef..ae01460 100644 --- a/crates/kebab-search/tests/lexical.rs +++ b/crates/kebab-search/tests/lexical.rs @@ -612,6 +612,73 @@ fn lexical_index_version_is_returned_unchanged() { assert_eq!(r.index_version().0, "custom-label-1"); } +#[test] +fn search_hit_carries_indexed_at_from_documents_updated_at() { + // p9-fb-32: SearchHit.indexed_at must be populated from + // documents.updated_at via the JOIN. We seed documents with + // updated_at=now (RFC3339) and assert the parsed OffsetDateTime + // round-trips within ±60s of wall-clock now. + use time::OffsetDateTime; + use time::format_description::well_known::Rfc3339; + + let env = Env::new(); + let conn = env.raw_conn(); + // The `insert_document` helper hard-codes updated_at='2024-01-01...'; + // override that here so the assertion against `now` is meaningful. + let now = OffsetDateTime::now_utc(); + let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339"); + let doc_id = id32("d"); + let asset_id = format!("{:0>32}", "d"); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, 'file:///x', 'a.md', '\"markdown\"', 0, + 'd0', 'reference', '/x', '2024-01-01T00:00:00Z')", + rusqlite::params![asset_id], + ) + .expect("insert asset"); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, + source_type, trust_level, parser_version, + doc_version, schema_version, metadata_json, + provenance_json, created_at, updated_at + ) VALUES (?, ?, 'a.md', 'T', 'en', 'markdown', 'primary', 'pv1', 1, 1, + '{}', '{\"events\":[]}', + ?, ?)", + rusqlite::params![doc_id, asset_id, now_rfc, now_rfc], + ) + .expect("insert document"); + insert_chunk( + &conn, + &id32("c1"), + &doc_id, + "body about apples", + &["T"], + None, + r#"[{"kind":"line","start":1,"end":1}]"#, + "v1", + ); + drop(conn); + + let r = env.retriever(); + let hits = r + .search(&SearchQuery { + text: "apples".to_string(), + mode: SearchMode::Lexical, + k: 5, + filters: SearchFilters::default(), + }) + .expect("search"); + let hit = hits.first().expect("at least one hit"); + let now2 = OffsetDateTime::now_utc(); + let delta = (now2 - hit.indexed_at).whole_seconds().abs(); + assert!(delta < 60, "indexed_at within ±60s of now, got {delta}s"); + // stale is a placeholder set by the retriever; the App layer overwrites. + assert!(!hit.stale, "lexical retriever must default stale=false"); +} + #[test] fn lexical_snapshot_run_1() { // Pinned snapshot. A small, deterministic corpus; the JSON shape of