feat(search/lexical): media / ingested_after / doc_id filters (fb-36)

SQL WHERE clause extension. media uses CASE WHEN json_type='text'
to handle both unit (\`"markdown"\`) and tuple (\`{"image":"png"}\`)
MediaType serde shapes. ingested_after relies on RFC3339 lexicographic
ordering with UTC Z (per fb-32 ingest invariant). doc_id is a simple
equality. AND combinator with existing tags / lang / trust filters.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
th-kim0823
2026-05-10 03:41:02 +09:00
parent d3f38c76e9
commit 2c80e2ad91
2 changed files with 253 additions and 1 deletions

View File

@@ -319,6 +319,50 @@ fn run_query(
};
params.push(Box::new(rank));
}
// p9-fb-36: media_type filter (IN-list).
// `assets.media_type` JSON has two shapes:
// - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
// - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
// e.g. `{"image": "png"}`
// Extract a unified "kind" string for both shapes via:
// CASE WHEN json_type = 'text' THEN json_extract($)
// ELSE (first object key)
// END IN (?, ...)
if !filters.media.is_empty() {
let placeholders: Vec<&str> =
std::iter::repeat("?").take(filters.media.len()).collect();
let placeholders = placeholders.join(",");
sql.push_str(&format!(
" AND f.doc_id IN (\
SELECT d2.doc_id FROM documents d2 \
JOIN assets a ON a.asset_id = d2.asset_id \
WHERE CASE \
WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
END IN ({placeholders}))"
));
for kind in &filters.media {
params.push(Box::new(kind.clone()));
}
}
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
// fb-32 ingest path), so lexicographic >= compare is correct.
if let Some(after) = &filters.ingested_after {
let formatted = after
.format(&time::format_description::well_known::Rfc3339)
.expect("OffsetDateTime formats to RFC3339");
sql.push_str(" AND d.updated_at >= ?");
params.push(Box::new(formatted));
}
// p9-fb-36: doc_id filter — single-doc scoping.
if let Some(id) = &filters.doc_id {
sql.push_str(" AND d.doc_id = ?");
params.push(Box::new(id.0.clone()));
}
// path_glob is intentionally NOT applied here — see module comment
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.

View File

@@ -8,11 +8,15 @@
use std::sync::Arc;
use kebab_config::Config;
use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
use kebab_core::{
DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode,
SearchQuery, TrustLevel,
};
use kebab_search::LexicalRetriever;
use kebab_store_sqlite::SqliteStore;
use rusqlite::Connection;
use tempfile::TempDir;
use time::OffsetDateTime;
// ── Test scaffolding ─────────────────────────────────────────────────────
@@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() {
assert!(!hit.stale, "lexical retriever must default stale=false");
}
// ── TestEnv helper for fb-36 filter tests ───────────────────────────────
/// Convenience wrapper over `Env` that exposes higher-level fixture helpers
/// for the fb-36 filter tests. Intentionally kept separate from `Env` so
/// the original tests are untouched.
struct TestEnv {
inner: Env,
counter: std::cell::Cell<u32>,
}
impl TestEnv {
fn new() -> Self {
Self {
inner: Env::new(),
counter: std::cell::Cell::new(0),
}
}
/// Allocate a fresh monotone counter suffix so every inserted doc / chunk
/// gets a unique 32-hex ID without the caller worrying about collisions.
fn next_id(&self, prefix: &str) -> String {
let n = self.counter.get();
self.counter.set(n + 1);
let suffix = format!("{prefix}{n:04}");
id32(&suffix)
}
/// Insert a markdown doc with the given `body` and return its `DocumentId`.
fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
self.insert_doc_with_media(path, body, MediaType::Markdown)
}
/// Insert a doc whose `assets.media_type` JSON is set to the serialized
/// form of `media`. The `documents.updated_at` defaults to now.
fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId {
self.insert_doc_full(path, body, media, OffsetDateTime::now_utc())
}
/// Insert a doc with an explicit `updated_at` timestamp (for
/// `ingested_after` filter tests).
fn insert_doc_with_updated_at(
&self,
path: &str,
body: &str,
updated_at: OffsetDateTime,
) -> DocumentId {
self.insert_doc_full(path, body, MediaType::Markdown, updated_at)
}
fn insert_doc_full(
&self,
path: &str,
body: &str,
media: MediaType,
updated_at: OffsetDateTime,
) -> DocumentId {
use time::format_description::well_known::Rfc3339;
let doc_id = self.next_id("doc");
let chunk_id = self.next_id("chk");
let asset_id = self.next_id("ast");
let media_json = serde_json::to_string(&media).expect("serialize MediaType");
let updated_at_str = updated_at.format(&Rfc3339).expect("format updated_at");
let conn = self.inner.raw_conn();
conn.execute(
"INSERT OR IGNORE INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, ?, 0,
'd0', 'reference', ?, '2024-01-01T00:00:00Z')",
rusqlite::params![asset_id, format!("file:///{path}"), path, media_json, path],
)
.expect("insert asset");
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang,
source_type, trust_level, parser_version,
doc_version, schema_version, metadata_json,
provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
'{}', '{\"events\":[]}',
'2024-01-01T00:00:00Z', ?)",
rusqlite::params![doc_id, asset_id, path, updated_at_str],
)
.expect("insert document");
let empty_headings: Vec<&str> = vec![];
let heading_json = serde_json::to_string(&empty_headings).unwrap();
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, ?, ?, NULL,
'[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
rusqlite::params![chunk_id, doc_id, body, heading_json],
)
.expect("insert chunk");
DocumentId(doc_id)
}
fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
let r = self.inner.retriever();
let q = SearchQuery {
text: query.to_string(),
mode: SearchMode::Lexical,
k: 10,
filters: filters.clone(),
};
r.search(&q).expect("search")
}
}
// ── fb-36 filter tests ───────────────────────────────────────────────────
#[test]
fn lexical_filter_by_media() {
let env = TestEnv::new();
env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);
let filters = SearchFilters {
media: vec!["pdf".to_string()],
..Default::default()
};
let hits = env.run_search("rust", &filters);
assert_eq!(hits.len(), 1, "only pdf doc should match");
assert!(hits[0].doc_path.0.ends_with(".pdf"), "got: {}", hits[0].doc_path.0);
}
#[test]
fn lexical_filter_by_ingested_after() {
let env = TestEnv::new();
env.insert_doc_with_updated_at(
"old.md",
"ingest test",
time::macros::datetime!(2020-01-01 00:00:00 UTC),
);
env.insert_doc_with_updated_at(
"new.md",
"ingest test",
time::macros::datetime!(2026-01-01 00:00:00 UTC),
);
let filters = SearchFilters {
ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
..Default::default()
};
let hits = env.run_search("ingest", &filters);
assert_eq!(hits.len(), 1, "only post-2025 doc matches");
}
#[test]
fn lexical_filter_by_doc_id() {
let env = TestEnv::new();
let target = env.insert_doc("a.md", "shared term");
env.insert_doc("b.md", "shared term");
let filters = SearchFilters {
doc_id: Some(target.clone()),
..Default::default()
};
let hits = env.run_search("shared", &filters);
assert!(!hits.is_empty(), "should get at least one hit for target doc");
for h in &hits {
assert_eq!(h.doc_id, target, "all hits must be from target doc");
}
}
#[test]
fn lexical_filter_combinator_is_and() {
let env = TestEnv::new();
let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown);
env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf);
let filters = SearchFilters {
media: vec!["markdown".to_string()],
doc_id: Some(target.clone()),
..Default::default()
};
let hits = env.run_search("rust", &filters);
assert!(!hits.is_empty(), "target doc should match combined filter");
assert!(hits.iter().all(|h| h.doc_id == target));
}
#[test]
fn lexical_filter_unknown_media_returns_empty() {
let env = TestEnv::new();
env.insert_doc("a.md", "rust");
let filters = SearchFilters {
media: vec!["nonexistent_kind".to_string()],
..Default::default()
};
let hits = env.run_search("rust", &filters);
assert!(hits.is_empty(), "unknown media → no hits, no error");
}
#[test]
fn lexical_empty_filters_match_default_behavior() {
let env = TestEnv::new();
env.insert_doc("a.md", "rust");
let with_default = env.run_search("rust", &SearchFilters::default());
assert!(!with_default.is_empty());
}
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of