feat(search/lexical): populate SearchHit.indexed_at (fb-32)
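JOIN documents.updated_at into the lexical query and parse it (RFC3339) into SearchHit.indexed_at. `stale` defaults to false in the retriever; the App facade post-processes it against the config threshold. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>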
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -3832,6 +3832,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
# p9-fb-32: parse documents.updated_at (RFC3339) into OffsetDateTime
|
||||
# for SearchHit.indexed_at.
|
||||
time = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
|
||||
@@ -415,6 +415,10 @@ mod tests {
|
||||
index_version: IndexVersion("v1".to_string()),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion("v1".to_string()),
|
||||
// p9-fb-32: hybrid unit tests don't exercise staleness; pin
|
||||
// a fixed UNIX_EPOCH so synthetic hits remain deterministic.
|
||||
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
|
||||
stale: false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -244,6 +244,8 @@ struct RawRow {
|
||||
source_spans_json: String,
|
||||
chunker_version: String,
|
||||
workspace_path: String,
|
||||
/// p9-fb-32: documents.updated_at (RFC3339).
|
||||
updated_at: String,
|
||||
}
|
||||
|
||||
/// Build + execute the FTS5 query. The SQL pattern is the one documented
|
||||
@@ -265,7 +267,8 @@ fn run_query(
|
||||
snippet(chunks_fts, 3, '', '', '…', ?) AS snippet, \
|
||||
c.heading_path_json, c.section_label, c.source_spans_json, \
|
||||
c.chunker_version, \
|
||||
d.workspace_path \
|
||||
d.workspace_path, \
|
||||
d.updated_at \
|
||||
FROM chunks_fts f \
|
||||
JOIN chunks c ON c.chunk_id = f.chunk_id \
|
||||
JOIN documents d ON d.doc_id = f.doc_id",
|
||||
@@ -349,6 +352,7 @@ fn row_from_sql(row: &Row<'_>) -> rusqlite::Result<RawRow> {
|
||||
source_spans_json: row.get(6)?,
|
||||
chunker_version: row.get(7)?,
|
||||
workspace_path: row.get(8)?,
|
||||
updated_at: row.get(9)?,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -382,6 +386,16 @@ fn build_hit(
|
||||
// defensively if SQLite ever returns a longer string.
|
||||
let snippet = trim_snippet(&raw.snippet, snippet_chars);
|
||||
|
||||
// p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001
|
||||
// migration; written by put_document via OffsetDateTime::now_utc).
|
||||
// fb-23 incremental ingest's skip path does not call put_document,
|
||||
// so this naturally reflects the last actual re-process.
|
||||
let indexed_at = time::OffsetDateTime::parse(
|
||||
&raw.updated_at,
|
||||
&time::format_description::well_known::Rfc3339,
|
||||
)
|
||||
.context("kb-search lexical: parse documents.updated_at as RFC3339")?;
|
||||
|
||||
Ok(SearchHit {
|
||||
rank,
|
||||
chunk_id: ChunkId(raw.chunk_id),
|
||||
@@ -402,6 +416,9 @@ fn build_hit(
|
||||
index_version: index_version.clone(),
|
||||
embedding_model: None,
|
||||
chunker_version: ChunkerVersion(raw.chunker_version),
|
||||
indexed_at,
|
||||
// Placeholder — App layer overwrites against config threshold (Task 6).
|
||||
stale: false,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -308,6 +308,10 @@ fn build_hit(
|
||||
index_version: index_version.clone(),
|
||||
embedding_model: Some(model_id.clone()),
|
||||
chunker_version: ChunkerVersion(meta.chunker_version.clone()),
|
||||
// p9-fb-32: Task 5 will hydrate from documents.updated_at; this
|
||||
// stub keeps the lib compiling after Task 1 added the field.
|
||||
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
|
||||
stale: false,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"indexed_at": "2024-01-01T00:00:00Z",
|
||||
"rank": 1,
|
||||
"retrieval": {
|
||||
"fusion_score": 1.4490997273242101e-6,
|
||||
@@ -26,7 +27,8 @@
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha alpha"
|
||||
"snippet": "alpha alpha",
|
||||
"stale": false
|
||||
},
|
||||
{
|
||||
"chunk_id": "c1000000000000000000000000000000",
|
||||
@@ -45,6 +47,7 @@
|
||||
"Snap"
|
||||
],
|
||||
"index_version": "v1.0",
|
||||
"indexed_at": "2024-01-01T00:00:00Z",
|
||||
"rank": 2,
|
||||
"retrieval": {
|
||||
"fusion_score": 9.641424867368187e-7,
|
||||
@@ -55,6 +58,7 @@
|
||||
"vector_score": null
|
||||
},
|
||||
"section_label": "Snap",
|
||||
"snippet": "alpha bravo charlie"
|
||||
"snippet": "alpha bravo charlie",
|
||||
"stale": false
|
||||
}
|
||||
]
|
||||
@@ -612,6 +612,73 @@ fn lexical_index_version_is_returned_unchanged() {
|
||||
assert_eq!(r.index_version().0, "custom-label-1");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_hit_carries_indexed_at_from_documents_updated_at() {
|
||||
// p9-fb-32: SearchHit.indexed_at must be populated from
|
||||
// documents.updated_at via the JOIN. We seed a documents row with
|
||||
// updated_at=now (RFC3339), run a lexical search, and assert the
|
||||
// parsed OffsetDateTime lands within ±60s of wall-clock now.
|
||||
use time::OffsetDateTime;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
let env = Env::new();
|
||||
let conn = env.raw_conn();
|
||||
// The `insert_document` helper hard-codes updated_at='2024-01-01...';
|
||||
// insert the rows by hand here so the assertion against `now` is meaningful.
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339");
|
||||
let doc_id = id32("d");
|
||||
let asset_id = format!("{:0>32}", "d");
|
||||
// Asset row referenced by the document below (the same asset_id is bound
// in both inserts).
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, 'file:///x', 'a.md', '\"markdown\"', 0,
|
||||
'd0', 'reference', '/x', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![asset_id],
|
||||
)
|
||||
.expect("insert asset");
|
||||
// Document row: the two trailing `?` placeholders are created_at and
// updated_at, and both are bound to `now_rfc`.
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, 'a.md', 'T', 'en', 'markdown', 'primary', 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
?, ?)",
|
||||
rusqlite::params![doc_id, asset_id, now_rfc, now_rfc],
|
||||
)
|
||||
.expect("insert document");
|
||||
// Seed one chunk for `doc_id`; the "body about apples" argument is
// presumably the indexed text matched by the "apples" query below —
// confirm against the `insert_chunk` helper.
insert_chunk(
|
||||
&conn,
|
||||
&id32("c1"),
|
||||
&doc_id,
|
||||
"body about apples",
|
||||
&["T"],
|
||||
None,
|
||||
r#"[{"kind":"line","start":1,"end":1}]"#,
|
||||
"v1",
|
||||
);
|
||||
// Drop the raw connection before constructing the retriever.
drop(conn);
|
||||
|
||||
let r = env.retriever();
|
||||
let hits = r
|
||||
.search(&SearchQuery {
|
||||
text: "apples".to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 5,
|
||||
filters: SearchFilters::default(),
|
||||
})
|
||||
.expect("search");
|
||||
let hit = hits.first().expect("at least one hit");
|
||||
// Sample the clock again after the search; abs() makes the bound hold
// whichever of the two instants is later.
let now2 = OffsetDateTime::now_utc();
|
||||
let delta = (now2 - hit.indexed_at).whole_seconds().abs();
|
||||
assert!(delta < 60, "indexed_at within ±60s of now, got {delta}s");
|
||||
// stale is a placeholder set by the retriever; the App layer overwrites.
|
||||
assert!(!hit.stale, "lexical retriever must default stale=false");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snapshot_run_1() {
|
||||
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
|
||||
|
||||
Reference in New Issue
Block a user