feat(search/vector): media / ingested_after / doc_id filters (fb-36)
filter_chunks helper in kebab-store-sqlite extended with the same 3 WHERE clauses as lexical. Vector still over-fetches k*2 then post-filters via SqliteStore::filter_chunks; small k can return < k hits when filters drop a lot — agent is expected to widen k or paginate. AND combinator with existing filters. - kebab-store-sqlite/src/filters.rs: media IN-list subquery, ingested_after lexicographic >= compare, doc_id equality; mirrors lexical SQL arms - 3 direct unit tests (filter_chunks_media_type/ingested_after/doc_id) that run without AVX/Lance - common/mod.rs: insert_doc / insert_doc_with_media / run_vector_search helpers on HybridEnv for integration-test use - hybrid.rs: 2 new #[ignore = "requires AVX..."] integration tests (vector_filter_by_media, vector_filter_by_doc_id) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,9 @@ use std::sync::Arc;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{
|
||||
ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind,
|
||||
EmbeddingModelId, EmbeddingVersion, IndexVersion, VectorRecord, VectorStore,
|
||||
EmbeddingModelId, EmbeddingVersion, IndexVersion, MediaType,
|
||||
Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
|
||||
VectorRecord, VectorStore,
|
||||
};
|
||||
use kebab_embed::{Embedder, MockEmbedder};
|
||||
use kebab_search::{LexicalRetriever, VectorRetriever};
|
||||
@@ -173,6 +175,93 @@ impl HybridEnv {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// High-level helper: seed a doc with the default media type
|
||||
/// (Markdown) and embed its text. Returns the `DocumentId` so
|
||||
/// callers can use it in `doc_id` filter tests.
|
||||
pub fn insert_doc(&self, path: &str, text: &str) -> DocumentId {
|
||||
self.insert_doc_with_media(path, text, MediaType::Markdown)
|
||||
}
|
||||
|
||||
/// High-level helper: seed a doc with an explicit `MediaType`.
|
||||
/// The `media_type` is serialized to JSON (mirrors how
|
||||
/// `DocumentStore::put_document` writes it) and stored in `assets`.
|
||||
pub fn insert_doc_with_media(
|
||||
&self,
|
||||
path: &str,
|
||||
text: &str,
|
||||
media: MediaType,
|
||||
) -> DocumentId {
|
||||
// Derive deterministic IDs from the path so repeated calls with
|
||||
// the same path are idempotent (INSERT OR IGNORE).
|
||||
let path_hash: String = {
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
let mut h = DefaultHasher::new();
|
||||
path.hash(&mut h);
|
||||
format!("{:032x}", h.finish())
|
||||
};
|
||||
let doc_id = format!("d{}", &path_hash[..31]);
|
||||
let chunk_id = format!("c{}", &path_hash[..31]);
|
||||
let asset_id = format!("a{}", &path_hash[..31]);
|
||||
|
||||
let media_json = serde_json::to_string(&media).expect("serialize MediaType");
|
||||
let conn = self.sqlite.read_conn();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, ?, 0,
|
||||
'deadbeefdeadbeefdeadbeefdeadbeef',
|
||||
'reference', ?, '1970-01-01T00:00:00Z')",
|
||||
params![
|
||||
asset_id,
|
||||
format!("file:///{path}"),
|
||||
path,
|
||||
media_json,
|
||||
path,
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang, source_type,
|
||||
trust_level, parser_version, doc_version, schema_version,
|
||||
metadata_json, provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
|
||||
'{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
|
||||
params![doc_id, asset_id, path],
|
||||
)
|
||||
.unwrap();
|
||||
let heading_json = "[]";
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, NULL,
|
||||
'[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
|
||||
1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
|
||||
params![chunk_id, doc_id, text, heading_json],
|
||||
)
|
||||
.unwrap();
|
||||
drop(conn);
|
||||
self.embed_and_upsert(&chunk_id, &doc_id, text, &[]);
|
||||
DocumentId(doc_id)
|
||||
}
|
||||
|
||||
/// Run a `SearchMode::Vector` query against the seeded corpus and
|
||||
/// return the resulting `Vec<SearchHit>`.
|
||||
pub fn run_vector_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
|
||||
let r = self.vector_retriever();
|
||||
let q = SearchQuery {
|
||||
text: query.to_string(),
|
||||
mode: SearchMode::Vector,
|
||||
k: 10,
|
||||
filters: filters.clone(),
|
||||
};
|
||||
r.search(&q).expect("vector search")
|
||||
}
|
||||
|
||||
/// Embed `text` as a Document and upsert it as the embedding for
|
||||
/// `chunk_id`. Drives the same code path production uses:
|
||||
/// MockEmbedder → VectorRecord → LanceVectorStore::upsert →
|
||||
|
||||
Reference in New Issue
Block a user