feat(search/vector): media / ingested_after / doc_id filters (fb-36)
The filter_chunks helper in kebab-store-sqlite is extended with the same 3 WHERE clauses as the lexical path. Vector search still over-fetches k*2 and then post-filters via SqliteStore::filter_chunks, so a small k can return fewer than k hits when the filters drop many candidates — the agent is expected to widen k or paginate. The new filters are combined with the existing filters using AND.

- kebab-store-sqlite/src/filters.rs: media IN-list subquery, ingested_after lexicographic >= compare, doc_id equality; mirrors the lexical SQL arms
- 3 direct unit tests (filter_chunks_media_type/ingested_after/doc_id) that run without AVX/Lance
- common/mod.rs: insert_doc / insert_doc_with_media / run_vector_search helpers on HybridEnv for integration-test use
- hybrid.rs: 2 new #[ignore = "requires AVX..."] integration tests (vector_filter_by_media, vector_filter_by_doc_id)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ use common::{
|
||||
HybridEnv, id32, require_avx_or_panic, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION,
|
||||
};
|
||||
use kebab_core::{
|
||||
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
||||
MediaType, Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
||||
};
|
||||
use kebab_search::{FusionPolicy, HybridRetriever};
|
||||
use rusqlite::params;
|
||||
@@ -213,6 +213,57 @@ fn hybrid_snapshot_run_1() {
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-36: the `media` filter has to survive the vector post-filter step.
/// Two docs are seeded (one markdown, one pdf); searching with a pdf-only
/// filter must yield exactly the pdf chunk, which shows that
/// `LanceVectorStore::search` → `SqliteStore::filter_chunks` honours the
/// media arm.
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn vector_filter_by_media() {
    require_avx_or_panic();

    // Seed one document per media type so the filter has something to drop.
    let env = HybridEnv::new();
    env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
    env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);

    let pdf_only = SearchFilters {
        media: vec![String::from("pdf")],
        ..SearchFilters::default()
    };
    let hits = env.run_vector_search("rust", &pdf_only);

    // Exactly one survivor is expected; the length check also guards the
    // index access below.
    assert_eq!(hits.len(), 1, "media filter must keep only pdf chunk");
    let surviving_path = &hits[0].doc_path.0;
    assert!(
        surviving_path.ends_with(".pdf"),
        "expected .pdf path, got: {}",
        surviving_path
    );
}
|
||||
|
||||
/// p9-fb-36: the `doc_id` filter has to survive the vector post-filter step
/// (`filter_chunks`). Two docs with identical text are seeded; restricting
/// the search to one doc_id must return chunks from that doc only.
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn vector_filter_by_doc_id() {
    require_avx_or_panic();

    // Both docs share the same body, so only the doc_id filter can tell
    // them apart.
    let env = HybridEnv::new();
    let target = env.insert_doc("a.md", "shared knowledge");
    env.insert_doc("b.md", "shared knowledge");

    let only_target = SearchFilters {
        doc_id: Some(target.clone()),
        ..SearchFilters::default()
    };
    let hits = env.run_vector_search("shared", &only_target);

    // Guard against a vacuous pass: `all` on an empty result is trivially true.
    assert!(
        !hits.is_empty(),
        "doc_id filter must return hits for the target doc"
    );
    let every_hit_matches = hits.iter().all(|hit| hit.doc_id == target);
    assert!(every_hit_matches, "all hits must belong to the target doc_id");
}
|
||||
|
||||
#[test]
|
||||
#[ignore = "requires AVX-capable hardware (LanceDB)"]
|
||||
fn vector_hit_carries_indexed_at() {
|
||||
|
||||
Reference in New Issue
Block a user