feat(search): VectorRetriever sentinel 별칭 strip + dedup

별칭 dense 벡터({orig}#alias) hit 을 원본 chunk_id 로 strip 해 hydrate,
body+alias 중복은 첫(높은 score) 하나만 유지. overfetch 2→3 (dedup 후 k
확보). wire/RetrievalDetail 무변경. vector/hybrid 회귀 0, clippy green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-30 11:09:32 +00:00
parent afa8af0f88
commit 6ba8cb2c88

View File

@@ -36,9 +36,13 @@ const DEFAULT_K: usize = 10;
/// Over-fetch multiplier passed to `VectorStore::search` so that
/// SQLite-side filter losses (tags / lang / trust / path_glob) still
/// leave at least `k` candidates. The Lance store already applies the
/// same filters internally; the extra `* 2` is the spec-mandated
/// safety margin for the `Retriever` layer (§7.2 spec line 138).
const VECTOR_OVERFETCH_MULTIPLIER: usize = 2;
/// same filters internally; the extra margin is the spec-mandated
/// safety for the `Retriever` layer (§7.2 spec line 138).
///
/// `3` (was `2`): dense 별칭 sentinel 벡터(`{orig}#alias`)가 같은 청크의
/// body 벡터와 함께 raw_hits 에 들어올 수 있어, strip+dedup 후에도 `k` 개를
///확보하도록 여유를 키운다(별칭 미사용 시에도 안전한 상한).
const VECTOR_OVERFETCH_MULTIPLIER: usize = 3;
/// Wraps a vector store + embedder into a [`Retriever`].
///
@@ -149,23 +153,34 @@ impl Retriever for VectorRetriever {
}
// 3. Hydrate metadata from SQLite for the candidate ids in
// one round-trip. Order is preserved by the caller via the
// HashMap lookup at hit-construction time.
let candidate_ids: Vec<&str> = raw_hits.iter().map(|h| h.chunk_id.0.as_str()).collect();
// one round-trip. dense 별칭 벡터는 sentinel chunk_id
// (`{orig}#alias`)로 색인되므로, 원본 chunk_id 로 strip 해
// hydrate 한다(별칭 벡터는 chunks 테이블에 없음).
let candidate_ids: Vec<&str> = raw_hits
.iter()
.map(|h| kebab_core::strip_alias_suffix(h.chunk_id.0.as_str()))
.collect();
let hydration = hydrate_chunks(&self.sqlite, &candidate_ids)
.context("kb-search vector: hydrate chunk metadata")?;
// 4. Build `SearchHit` for the first `k` raw hits that pass
// hydration (a missing row would be a filter-induced drop —
// Lance returned the chunk but SQLite filtered it out, or
// the chunk was deleted between Lance's read and ours).
// hydration. sentinel 별칭 hit 은 원본 chunk_id 로 strip 하고,
// 같은 원본이 body+alias 둘 다 hit 하면 첫(높은 score) 하나만
// 남긴다(dedup). raw_hits 는 score 순이라 첫 매칭이 최선.
let model_id = self.embed.model_id();
let mut hits: Vec<SearchHit> = Vec::with_capacity(k.min(raw_hits.len()));
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut rank: u32 = 0;
for hit in raw_hits {
let Some(meta) = hydration.get(hit.chunk_id.0.as_str()) else {
for mut hit in raw_hits {
let orig = kebab_core::strip_alias_suffix(hit.chunk_id.0.as_str()).to_string();
if !seen.insert(orig.clone()) {
continue; // body+alias 중복 → 첫 hit 유지
}
let Some(meta) = hydration.get(orig.as_str()) else {
continue;
};
// build_hit 이 원본 chunk_id 를 쓰도록 sentinel 을 strip 본으로 교체.
hit.chunk_id = kebab_core::ChunkId(orig);
rank = rank.saturating_add(1);
hits.push(build_hit(
hit,