feat(search/vector): populate SearchHit.indexed_at (fb-32)
hydrate_chunks now selects d.updated_at through its existing JOIN on documents. The hybrid fusion path is unchanged (it passes SearchHit through, so fields are preserved). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -197,6 +197,8 @@ struct ChunkMeta {
|
|||||||
chunker_version: String,
|
chunker_version: String,
|
||||||
doc_id: String,
|
doc_id: String,
|
||||||
workspace_path: String,
|
workspace_path: String,
|
||||||
|
/// p9-fb-32: documents.updated_at (RFC3339).
|
||||||
|
updated_at: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn hydrate_chunks(
|
fn hydrate_chunks(
|
||||||
@@ -222,7 +224,7 @@ fn hydrate_chunks(
|
|||||||
"SELECT \
|
"SELECT \
|
||||||
c.chunk_id, c.text, c.heading_path_json, c.section_label, \
|
c.chunk_id, c.text, c.heading_path_json, c.section_label, \
|
||||||
c.source_spans_json, c.chunker_version, \
|
c.source_spans_json, c.chunker_version, \
|
||||||
c.doc_id, d.workspace_path \
|
c.doc_id, d.workspace_path, d.updated_at \
|
||||||
FROM chunks c \
|
FROM chunks c \
|
||||||
JOIN documents d ON d.doc_id = c.doc_id \
|
JOIN documents d ON d.doc_id = c.doc_id \
|
||||||
WHERE c.chunk_id IN ({placeholders})"
|
WHERE c.chunk_id IN ({placeholders})"
|
||||||
@@ -249,6 +251,7 @@ fn hydrate_chunks(
|
|||||||
chunker_version: row.get(5)?,
|
chunker_version: row.get(5)?,
|
||||||
doc_id: row.get(6)?,
|
doc_id: row.get(6)?,
|
||||||
workspace_path: row.get(7)?,
|
workspace_path: row.get(7)?,
|
||||||
|
updated_at: row.get(8)?,
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
},
|
},
|
||||||
@@ -287,6 +290,16 @@ fn build_hit(
|
|||||||
);
|
);
|
||||||
let snippet = trim_snippet(&meta.text, snippet_chars);
|
let snippet = trim_snippet(&meta.text, snippet_chars);
|
||||||
|
|
||||||
|
// p9-fb-32: documents.updated_at is stored as RFC3339 TEXT (V001
|
||||||
|
// migration; written by put_document via OffsetDateTime::now_utc).
|
||||||
|
// Mirrors the lexical retriever; see lexical::build_hit for the
|
||||||
|
// shared rationale on incremental-ingest skip semantics.
|
||||||
|
let indexed_at = time::OffsetDateTime::parse(
|
||||||
|
&meta.updated_at,
|
||||||
|
&time::format_description::well_known::Rfc3339,
|
||||||
|
)
|
||||||
|
.context("kb-search vector: parse documents.updated_at as RFC3339")?;
|
||||||
|
|
||||||
let score = hit.score;
|
let score = hit.score;
|
||||||
Ok(SearchHit {
|
Ok(SearchHit {
|
||||||
rank,
|
rank,
|
||||||
@@ -308,9 +321,8 @@ fn build_hit(
|
|||||||
index_version: index_version.clone(),
|
index_version: index_version.clone(),
|
||||||
embedding_model: Some(model_id.clone()),
|
embedding_model: Some(model_id.clone()),
|
||||||
chunker_version: ChunkerVersion(meta.chunker_version.clone()),
|
chunker_version: ChunkerVersion(meta.chunker_version.clone()),
|
||||||
// p9-fb-32: Task 5 will hydrate from documents.updated_at; this
|
indexed_at,
|
||||||
// stub keeps the lib compiling after Task 1 added the field.
|
// Placeholder — App layer overwrites against config threshold (Task 6).
|
||||||
indexed_at: time::OffsetDateTime::UNIX_EPOCH,
|
|
||||||
stale: false,
|
stale: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ use kebab_core::{
|
|||||||
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
||||||
};
|
};
|
||||||
use kebab_search::{FusionPolicy, HybridRetriever};
|
use kebab_search::{FusionPolicy, HybridRetriever};
|
||||||
|
use rusqlite::params;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
fn build_hybrid(env: &HybridEnv) -> HybridRetriever {
|
fn build_hybrid(env: &HybridEnv) -> HybridRetriever {
|
||||||
@@ -211,3 +212,47 @@ fn hybrid_snapshot_run_1() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[ignore = "requires AVX-capable hardware (LanceDB)"]
|
||||||
|
fn vector_hit_carries_indexed_at() {
|
||||||
|
// p9-fb-32: VectorRetriever must populate SearchHit.indexed_at from
|
||||||
|
// documents.updated_at, which hydrate_chunks now selects via its documents JOIN (mirrors
|
||||||
|
// the lexical retriever's behavior — Task 5).
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
use time::format_description::well_known::Rfc3339;
|
||||||
|
|
||||||
|
require_avx_or_panic();
|
||||||
|
let env = HybridEnv::new();
|
||||||
|
let _ids = seed_disjoint_corpus(&env);
|
||||||
|
|
||||||
|
// `seed_chunk` hardcodes updated_at='1970-01-01T00:00:00Z'; bump
|
||||||
|
// every document's updated_at to wall-clock now so the assertion
|
||||||
|
// against `now` is meaningful.
|
||||||
|
let now = OffsetDateTime::now_utc();
|
||||||
|
let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339");
|
||||||
|
{
|
||||||
|
let conn = env.sqlite.read_conn();
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE documents SET updated_at = ?",
|
||||||
|
params![now_rfc],
|
||||||
|
)
|
||||||
|
.expect("bump documents.updated_at");
|
||||||
|
}
|
||||||
|
|
||||||
|
let r = env.vector_retriever();
|
||||||
|
let hits = r
|
||||||
|
.search(&SearchQuery {
|
||||||
|
text: "rust".to_string(),
|
||||||
|
mode: SearchMode::Vector,
|
||||||
|
k: 5,
|
||||||
|
filters: SearchFilters::default(),
|
||||||
|
})
|
||||||
|
.expect("vector search");
|
||||||
|
let hit = hits.first().expect("at least one vector hit");
|
||||||
|
let now2 = OffsetDateTime::now_utc();
|
||||||
|
let delta = (now2 - hit.indexed_at).whole_seconds().abs();
|
||||||
|
assert!(delta < 60, "indexed_at within ±60s of now, got {delta}s");
|
||||||
|
// stale is a placeholder set by the retriever; the App layer overwrites.
|
||||||
|
assert!(!hit.stale, "vector retriever must default stale=false");
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user