feat(search/lexical): media / ingested_after / doc_id filters (fb-36)
Extends the SQL WHERE clause. The media filter uses CASE WHEN json_type = 'text'
to handle both the unit (`"markdown"`) and tuple (`{"image":"png"}`)
MediaType serde shapes. ingested_after relies on RFC3339 lexicographic
ordering with UTC Z (per the fb-32 ingest invariant). doc_id is a simple
equality check. All three are AND-combined with the existing tags / lang / trust filters.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -319,6 +319,50 @@ fn run_query(
|
||||
};
|
||||
params.push(Box::new(rank));
|
||||
}
|
||||
// p9-fb-36: media_type filter (IN-list).
|
||||
// `assets.media_type` JSON has two shapes:
|
||||
// - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
|
||||
// - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
|
||||
// e.g. `{"image": "png"}`
|
||||
// Extract a unified "kind" string for both shapes via:
|
||||
// CASE WHEN json_type = 'text' THEN json_extract($)
|
||||
// ELSE (first object key)
|
||||
// END IN (?, ...)
|
||||
if !filters.media.is_empty() {
|
||||
let placeholders: Vec<&str> =
|
||||
std::iter::repeat("?").take(filters.media.len()).collect();
|
||||
let placeholders = placeholders.join(",");
|
||||
sql.push_str(&format!(
|
||||
" AND f.doc_id IN (\
|
||||
SELECT d2.doc_id FROM documents d2 \
|
||||
JOIN assets a ON a.asset_id = d2.asset_id \
|
||||
WHERE CASE \
|
||||
WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
|
||||
ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
|
||||
END IN ({placeholders}))"
|
||||
));
|
||||
for kind in &filters.media {
|
||||
params.push(Box::new(kind.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
// p9-fb-36: ingested_after filter.
|
||||
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
|
||||
// fb-32 ingest path), so lexicographic >= compare is correct.
|
||||
if let Some(after) = &filters.ingested_after {
|
||||
let formatted = after
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
.expect("OffsetDateTime formats to RFC3339");
|
||||
sql.push_str(" AND d.updated_at >= ?");
|
||||
params.push(Box::new(formatted));
|
||||
}
|
||||
|
||||
// p9-fb-36: doc_id filter — single-doc scoping.
|
||||
if let Some(id) = &filters.doc_id {
|
||||
sql.push_str(" AND d.doc_id = ?");
|
||||
params.push(Box::new(id.0.clone()));
|
||||
}
|
||||
|
||||
// path_glob is intentionally NOT applied here — see module comment
|
||||
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
|
||||
|
||||
|
||||
@@ -8,11 +8,15 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
|
||||
use kebab_core::{
|
||||
DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode,
|
||||
SearchQuery, TrustLevel,
|
||||
};
|
||||
use kebab_search::LexicalRetriever;
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
use rusqlite::Connection;
|
||||
use tempfile::TempDir;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── Test scaffolding ─────────────────────────────────────────────────────
|
||||
|
||||
@@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() {
|
||||
assert!(!hit.stale, "lexical retriever must default stale=false");
|
||||
}
|
||||
|
||||
// ── TestEnv helper for fb-36 filter tests ───────────────────────────────
|
||||
|
||||
/// Convenience wrapper over `Env` that exposes higher-level fixture helpers
|
||||
/// for the fb-36 filter tests. Intentionally kept separate from `Env` so
|
||||
/// the original tests are untouched.
|
||||
struct TestEnv {
|
||||
inner: Env,
|
||||
counter: std::cell::Cell<u32>,
|
||||
}
|
||||
|
||||
impl TestEnv {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
inner: Env::new(),
|
||||
counter: std::cell::Cell::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocate a fresh monotone counter suffix so every inserted doc / chunk
|
||||
/// gets a unique 32-hex ID without the caller worrying about collisions.
|
||||
fn next_id(&self, prefix: &str) -> String {
|
||||
let n = self.counter.get();
|
||||
self.counter.set(n + 1);
|
||||
let suffix = format!("{prefix}{n:04}");
|
||||
id32(&suffix)
|
||||
}
|
||||
|
||||
/// Insert a markdown doc with the given `body` and return its `DocumentId`.
|
||||
fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
|
||||
self.insert_doc_with_media(path, body, MediaType::Markdown)
|
||||
}
|
||||
|
||||
/// Insert a doc whose `assets.media_type` JSON is set to the serialized
|
||||
/// form of `media`. The `documents.updated_at` defaults to now.
|
||||
fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId {
|
||||
self.insert_doc_full(path, body, media, OffsetDateTime::now_utc())
|
||||
}
|
||||
|
||||
/// Insert a doc with an explicit `updated_at` timestamp (for
|
||||
/// `ingested_after` filter tests).
|
||||
fn insert_doc_with_updated_at(
|
||||
&self,
|
||||
path: &str,
|
||||
body: &str,
|
||||
updated_at: OffsetDateTime,
|
||||
) -> DocumentId {
|
||||
self.insert_doc_full(path, body, MediaType::Markdown, updated_at)
|
||||
}
|
||||
|
||||
fn insert_doc_full(
|
||||
&self,
|
||||
path: &str,
|
||||
body: &str,
|
||||
media: MediaType,
|
||||
updated_at: OffsetDateTime,
|
||||
) -> DocumentId {
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
let doc_id = self.next_id("doc");
|
||||
let chunk_id = self.next_id("chk");
|
||||
let asset_id = self.next_id("ast");
|
||||
let media_json = serde_json::to_string(&media).expect("serialize MediaType");
|
||||
let updated_at_str = updated_at.format(&Rfc3339).expect("format updated_at");
|
||||
|
||||
let conn = self.inner.raw_conn();
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO assets (
|
||||
asset_id, source_uri, workspace_path, media_type, byte_len,
|
||||
checksum, storage_kind, storage_path, discovered_at
|
||||
) VALUES (?, ?, ?, ?, 0,
|
||||
'd0', 'reference', ?, '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![asset_id, format!("file:///{path}"), path, media_json, path],
|
||||
)
|
||||
.expect("insert asset");
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO documents (
|
||||
doc_id, asset_id, workspace_path, title, lang,
|
||||
source_type, trust_level, parser_version,
|
||||
doc_version, schema_version, metadata_json,
|
||||
provenance_json, created_at, updated_at
|
||||
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
|
||||
'{}', '{\"events\":[]}',
|
||||
'2024-01-01T00:00:00Z', ?)",
|
||||
rusqlite::params![doc_id, asset_id, path, updated_at_str],
|
||||
)
|
||||
.expect("insert document");
|
||||
|
||||
let empty_headings: Vec<&str> = vec![];
|
||||
let heading_json = serde_json::to_string(&empty_headings).unwrap();
|
||||
conn.execute(
|
||||
"INSERT INTO chunks (
|
||||
chunk_id, doc_id, text, heading_path_json, section_label,
|
||||
source_spans_json, token_estimate, chunker_version,
|
||||
policy_hash, block_ids_json, created_at
|
||||
) VALUES (?, ?, ?, ?, NULL,
|
||||
'[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
|
||||
1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
|
||||
rusqlite::params![chunk_id, doc_id, body, heading_json],
|
||||
)
|
||||
.expect("insert chunk");
|
||||
|
||||
DocumentId(doc_id)
|
||||
}
|
||||
|
||||
fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
|
||||
let r = self.inner.retriever();
|
||||
let q = SearchQuery {
|
||||
text: query.to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k: 10,
|
||||
filters: filters.clone(),
|
||||
};
|
||||
r.search(&q).expect("search")
|
||||
}
|
||||
}
|
||||
|
||||
// ── fb-36 filter tests ───────────────────────────────────────────────────
|
||||
|
||||
/// A `media = ["pdf"]` filter keeps the PDF document and drops the markdown one.
#[test]
fn lexical_filter_by_media() {
    let env = TestEnv::new();
    env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
    env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);

    let pdf_only = SearchFilters {
        media: vec![String::from("pdf")],
        ..SearchFilters::default()
    };
    let hits = env.run_search("rust", &pdf_only);

    assert_eq!(hits.len(), 1, "only pdf doc should match");
    let path = &hits[0].doc_path.0;
    assert!(path.ends_with(".pdf"), "got: {}", path);
}
|
||||
|
||||
/// `ingested_after` keeps only documents whose `updated_at` is at or after
/// the cutoff.
///
/// Two documents share the same text but carry timestamps on either side of
/// a 2025-01-01 cutoff. The test pins *which* document survives, not just
/// how many: counting alone would also pass if the comparison were inverted
/// and the 2020 document were returned instead.
#[test]
fn lexical_filter_by_ingested_after() {
    let env = TestEnv::new();
    env.insert_doc_with_updated_at(
        "old.md",
        "ingest test",
        time::macros::datetime!(2020-01-01 00:00:00 UTC),
    );
    let newer = env.insert_doc_with_updated_at(
        "new.md",
        "ingest test",
        time::macros::datetime!(2026-01-01 00:00:00 UTC),
    );

    let filters = SearchFilters {
        ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
        ..Default::default()
    };
    let hits = env.run_search("ingest", &filters);

    assert_eq!(hits.len(), 1, "only post-2025 doc matches");
    assert_eq!(
        hits[0].doc_id, newer,
        "the surviving hit must be the post-2025 doc, not the 2020 one"
    );
}
|
||||
|
||||
/// A `doc_id` filter scopes results to that one document even when another
/// document contains the same text.
#[test]
fn lexical_filter_by_doc_id() {
    let env = TestEnv::new();
    let target = env.insert_doc("a.md", "shared term");
    env.insert_doc("b.md", "shared term");

    let scoped = SearchFilters {
        doc_id: Some(target.clone()),
        ..SearchFilters::default()
    };
    let hits = env.run_search("shared", &scoped);

    assert!(!hits.is_empty(), "should get at least one hit for target doc");
    hits.iter().for_each(|hit| {
        assert_eq!(hit.doc_id, target, "all hits must be from target doc");
    });
}
|
||||
|
||||
/// Filters are combined with AND, not OR.
///
/// The first half checks that two filters which both select the markdown
/// document still return it. That alone cannot tell AND from OR, because the
/// two filters agree. The second half therefore uses a conflicting pair:
/// `media = pdf` selects only `b.pdf`, while `doc_id` selects only `a.md`.
/// Under AND the intersection is empty; under OR both documents would come
/// back.
#[test]
fn lexical_filter_combinator_is_and() {
    let env = TestEnv::new();
    let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown);
    env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf);

    // Agreeing filters: both select `target`.
    let filters = SearchFilters {
        media: vec!["markdown".to_string()],
        doc_id: Some(target.clone()),
        ..Default::default()
    };
    let hits = env.run_search("rust", &filters);
    assert!(!hits.is_empty(), "target doc should match combined filter");
    assert!(hits.iter().all(|h| h.doc_id == target));

    // Conflicting filters: no document satisfies both at once.
    let conflicting = SearchFilters {
        media: vec!["pdf".to_string()],
        doc_id: Some(target),
        ..Default::default()
    };
    let hits = env.run_search("rust", &conflicting);
    assert!(
        hits.is_empty(),
        "media=pdf AND doc_id=<markdown doc> must match nothing; an OR combinator would return hits"
    );
}
|
||||
|
||||
/// A media kind that matches no stored document yields an empty result
/// rather than an error.
#[test]
fn lexical_filter_unknown_media_returns_empty() {
    let env = TestEnv::new();
    env.insert_doc("a.md", "rust");

    let bogus_kind = SearchFilters {
        media: vec![String::from("nonexistent_kind")],
        ..SearchFilters::default()
    };
    let hits = env.run_search("rust", &bogus_kind);

    assert!(hits.is_empty(), "unknown media → no hits, no error");
}
|
||||
|
||||
/// With default (empty) filters, a matching document is still returned.
#[test]
fn lexical_empty_filters_match_default_behavior() {
    let env = TestEnv::new();
    env.insert_doc("a.md", "rust");

    let no_filters = SearchFilters::default();
    let hits = env.run_search("rust", &no_filters);

    assert!(!hits.is_empty());
}
|
||||
|
||||
#[test]
|
||||
fn lexical_snapshot_run_1() {
|
||||
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
|
||||
|
||||
Reference in New Issue
Block a user