From f4ce6652b2dc5eec82f2084b0a02deebbbf483ee Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sat, 9 May 2026 01:10:20 +0900
Subject: [PATCH] feat(search/lexical): populate SearchHit.indexed_at (fb-32)
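Select documents.updated_at through the existing documents JOIN in the
lexical retriever and parse it (RFC3339) into SearchHit.indexed_at.
`stale` defaults to false in the retriever; the App facade post-processes
it against the config threshold. The vector retriever and the hybrid unit
tests pin indexed_at to UNIX_EPOCH for now.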
Co-Authored-By: Claude Opus 4.7 (1M context)
---
Cargo.lock | 1 +
crates/kebab-search/Cargo.toml | 3 +
crates/kebab-search/src/hybrid.rs | 4 ++
crates/kebab-search/src/lexical.rs | 19 +++++-
crates/kebab-search/src/vector.rs | 4 ++
.../tests/fixtures/search/lexical/run-1.json | 8 ++-
crates/kebab-search/tests/lexical.rs | 67 +++++++++++++++++++
7 files changed, 103 insertions(+), 3 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index f50ef84..5955be5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3832,6 +3832,7 @@ dependencies = [
"serde_json",
"tempfile",
"thiserror 2.0.18",
+ "time",
"tracing",
]
diff --git a/crates/kebab-search/Cargo.toml b/crates/kebab-search/Cargo.toml
index f195b81..f13fd7f 100644
--- a/crates/kebab-search/Cargo.toml
+++ b/crates/kebab-search/Cargo.toml
@@ -25,6 +25,9 @@ serde_json = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }
anyhow = { workspace = true }
+# p9-fb-32: parse documents.updated_at (RFC3339) into OffsetDateTime
+# for SearchHit.indexed_at.
+time = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }
diff --git a/crates/kebab-search/src/hybrid.rs b/crates/kebab-search/src/hybrid.rs
index 9ebd7de..37cd629 100644
--- a/crates/kebab-search/src/hybrid.rs
+++ b/crates/kebab-search/src/hybrid.rs
@@ -415,6 +415,10 @@ mod tests {
index_version: IndexVersion("v1".to_string()),
embedding_model: None,
chunker_version: ChunkerVersion("v1".to_string()),
+ // p9-fb-32: hybrid unit tests don't exercise staleness; pin
+ // indexed_at to UNIX_EPOCH so synthetic hits remain deterministic.
+ indexed_at: time::OffsetDateTime::UNIX_EPOCH,
+ stale: false,
}
}
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 71d32c6..2b09c4f 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -244,6 +244,8 @@ struct RawRow {
source_spans_json: String,
chunker_version: String,
workspace_path: String,
+ /// p9-fb-32: documents.updated_at (RFC3339).
+ updated_at: String,
}
/// Build + execute the FTS5 query. The SQL pattern is the one documented
@@ -265,7 +267,8 @@ fn run_query(
snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
c.heading_path_json, c.section_label, c.source_spans_json, \
c.chunker_version, \
- d.workspace_path \
+ d.workspace_path, \
+ d.updated_at \
FROM chunks_fts f \
JOIN chunks c ON c.chunk_id = f.chunk_id \
JOIN documents d ON d.doc_id = f.doc_id",
@@ -349,6 +352,7 @@ fn row_from_sql(row: &Row<'_>) -> rusqlite::Result {
source_spans_json: row.get(6)?,
chunker_version: row.get(7)?,
workspace_path: row.get(8)?,
+ updated_at: row.get(9)?,
})
}
@@ -382,6 +386,16 @@ fn build_hit(
// defensively if SQLite ever returns a longer string.
let snippet = trim_snippet(&raw.snippet, snippet_chars);
+ // p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001
+ // migration; written by put_document via OffsetDateTime::now_utc).
+ // fb-23 incremental ingest's skip path does not call put_document,
+ // so this naturally reflects the last actual re-process.
+ let indexed_at = time::OffsetDateTime::parse(
+ &raw.updated_at,
+ &time::format_description::well_known::Rfc3339,
+ )
+ .context("kb-search lexical: parse documents.updated_at as RFC3339")?;
+
Ok(SearchHit {
rank,
chunk_id: ChunkId(raw.chunk_id),
@@ -402,6 +416,9 @@ fn build_hit(
index_version: index_version.clone(),
embedding_model: None,
chunker_version: ChunkerVersion(raw.chunker_version),
+ indexed_at,
+ // Placeholder — App layer overwrites against config threshold (Task 6).
+ stale: false,
})
}
diff --git a/crates/kebab-search/src/vector.rs b/crates/kebab-search/src/vector.rs
index 77ff2a1..f4d5f8b 100644
--- a/crates/kebab-search/src/vector.rs
+++ b/crates/kebab-search/src/vector.rs
@@ -308,6 +308,10 @@ fn build_hit(
index_version: index_version.clone(),
embedding_model: Some(model_id.clone()),
chunker_version: ChunkerVersion(meta.chunker_version.clone()),
+ // p9-fb-32: Task 5 will hydrate from documents.updated_at; this
+ // stub keeps the lib compiling after Task 1 added the field.
+ // NOTE(review): an age-based stale check would see UNIX_EPOCH as stale — confirm.
+ indexed_at: time::OffsetDateTime::UNIX_EPOCH,
+ stale: false,
})
}
diff --git a/crates/kebab-search/tests/fixtures/search/lexical/run-1.json b/crates/kebab-search/tests/fixtures/search/lexical/run-1.json
index 701d2fb..2500cd4 100644
--- a/crates/kebab-search/tests/fixtures/search/lexical/run-1.json
+++ b/crates/kebab-search/tests/fixtures/search/lexical/run-1.json
@@ -16,6 +16,7 @@
"Snap"
],
"index_version": "v1.0",
+ "indexed_at": "2024-01-01T00:00:00Z",
"rank": 1,
"retrieval": {
"fusion_score": 1.4490997273242101e-6,
@@ -26,7 +27,8 @@
"vector_score": null
},
"section_label": "Snap",
- "snippet": "alpha alpha"
+ "snippet": "alpha alpha",
+ "stale": false
},
{
"chunk_id": "c1000000000000000000000000000000",
@@ -45,6 +47,7 @@
"Snap"
],
"index_version": "v1.0",
+ "indexed_at": "2024-01-01T00:00:00Z",
"rank": 2,
"retrieval": {
"fusion_score": 9.641424867368187e-7,
@@ -55,6 +58,7 @@
"vector_score": null
},
"section_label": "Snap",
- "snippet": "alpha bravo charlie"
+ "snippet": "alpha bravo charlie",
+ "stale": false
}
]
\ No newline at end of file
diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs
index 2939aef..ae01460 100644
--- a/crates/kebab-search/tests/lexical.rs
+++ b/crates/kebab-search/tests/lexical.rs
@@ -612,6 +612,73 @@ fn lexical_index_version_is_returned_unchanged() {
assert_eq!(r.index_version().0, "custom-label-1");
}
+#[test]
+fn search_hit_carries_indexed_at_from_documents_updated_at() {
+ // p9-fb-32: SearchHit.indexed_at must be populated from
+ // documents.updated_at via the JOIN. We seed documents with
+ // updated_at=now (RFC3339) and assert the parsed OffsetDateTime
+ // round-trips within ±60s of wall-clock now.
+ use time::OffsetDateTime;
+ use time::format_description::well_known::Rfc3339;
+
+ let env = Env::new();
+ let conn = env.raw_conn();
+ // The `insert_document` helper hard-codes updated_at='2024-01-01...';
+ // override that here so the assertion against `now` is meaningful.
+ let now = OffsetDateTime::now_utc();
+ let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339");
+ let doc_id = id32("d");
+ let asset_id = format!("{:0>32}", "d");
+ conn.execute(
+ "INSERT OR IGNORE INTO assets (
+ asset_id, source_uri, workspace_path, media_type, byte_len,
+ checksum, storage_kind, storage_path, discovered_at
+ ) VALUES (?, 'file:///x', 'a.md', '\"markdown\"', 0,
+ 'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
+ rusqlite::params![asset_id],
+ )
+ .expect("insert asset");
+ conn.execute(
+ "INSERT INTO documents (
+ doc_id, asset_id, workspace_path, title, lang,
+ source_type, trust_level, parser_version,
+ doc_version, schema_version, metadata_json,
+ provenance_json, created_at, updated_at
+ ) VALUES (?, ?, 'a.md', 'T', 'en', 'markdown', 'primary', 'pv1', 1, 1,
+ '{}', '{\"events\":[]}',
+ ?, ?)",
+ rusqlite::params![doc_id, asset_id, now_rfc, now_rfc],
+ )
+ .expect("insert document");
+ insert_chunk(
+ &conn,
+ &id32("c1"),
+ &doc_id,
+ "body about apples",
+ &["T"],
+ None,
+ r#"[{"kind":"line","start":1,"end":1}]"#,
+ "v1",
+ );
+ drop(conn);
+
+ let r = env.retriever();
+ let hits = r
+ .search(&SearchQuery {
+ text: "apples".to_string(),
+ mode: SearchMode::Lexical,
+ k: 5,
+ filters: SearchFilters::default(),
+ })
+ .expect("search");
+ let hit = hits.first().expect("at least one hit");
+ let now2 = OffsetDateTime::now_utc();
+ let delta = (now2 - hit.indexed_at).whole_seconds().abs();
+ assert!(delta < 60, "indexed_at within ±60s of now, got {delta}s");
+ // stale is a placeholder set by the retriever; the App layer overwrites.
+ assert!(!hit.stale, "lexical retriever must default stale=false");
+}
+
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of