From 2c80e2ad915e021dd73f4571ffa61aeb291341a7 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:41:02 +0900
Subject: [PATCH] feat(search/lexical): media / ingested_after / doc_id filters
(fb-36)
Extend the lexical SQL WHERE clause. media uses CASE WHEN json_type='text'
to handle both unit (`"markdown"`) and tuple (`{"image":"png"}`)
MediaType serde shapes. ingested_after relies on RFC3339 lexicographic
ordering with UTC Z (per fb-32 ingest invariant). doc_id is a simple
equality. All three are ANDed with the existing tags / lang / trust filters.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-search/src/lexical.rs | 44 ++++++
crates/kebab-search/tests/lexical.rs | 210 ++++++++++++++++++++++++++-
2 files changed, 253 insertions(+), 1 deletion(-)
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 513fb9e..954148c 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -319,6 +319,50 @@ fn run_query(
};
params.push(Box::new(rank));
}
+ // p9-fb-36: media_type filter (IN-list).
+ // `assets.media_type` JSON has two shapes:
+ // - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
+ // - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
+ // e.g. `{"image": "png"}`
+ // Extract a unified "kind" string for both shapes via:
+ // CASE WHEN json_type = 'text' THEN json_extract($)
+ // ELSE (first object key)
+ // END IN (?, ...)
+ if !filters.media.is_empty() {
+ let placeholders: Vec<&str> =
+ std::iter::repeat("?").take(filters.media.len()).collect();
+ let placeholders = placeholders.join(",");
+ sql.push_str(&format!(
+ " AND f.doc_id IN (\
+ SELECT d2.doc_id FROM documents d2 \
+ JOIN assets a ON a.asset_id = d2.asset_id \
+ WHERE CASE \
+ WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
+ ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
+ END IN ({placeholders}))"
+ ));
+ for kind in &filters.media {
+ params.push(Box::new(kind.clone()));
+ }
+ }
+
+    // p9-fb-36: ingested_after. `documents.updated_at` is RFC3339 TEXT in UTC `Z` (fb-32), so
+    // normalise the bound to UTC too; lexicographic `>=` is only valid when both sides use `Z`.
+    if let Some(after) = &filters.ingested_after {
+        let formatted = after
+            .to_offset(time::UtcOffset::UTC)
+            .format(&time::format_description::well_known::Rfc3339)
+            .expect("OffsetDateTime formats to RFC3339");
+        sql.push_str(" AND d.updated_at >= ?");
+        params.push(Box::new(formatted));
+    }
+
+ // p9-fb-36: doc_id filter — single-doc scoping.
+ if let Some(id) = &filters.doc_id {
+ sql.push_str(" AND d.doc_id = ?");
+ params.push(Box::new(id.0.clone()));
+ }
+
// path_glob is intentionally NOT applied here — see module comment
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs
index ae01460..4265160 100644
--- a/crates/kebab-search/tests/lexical.rs
+++ b/crates/kebab-search/tests/lexical.rs
@@ -8,11 +8,15 @@
use std::sync::Arc;
use kebab_config::Config;
-use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
+use kebab_core::{
+ DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode,
+ SearchQuery, TrustLevel,
+};
use kebab_search::LexicalRetriever;
use kebab_store_sqlite::SqliteStore;
use rusqlite::Connection;
use tempfile::TempDir;
+use time::OffsetDateTime;
// ── Test scaffolding ─────────────────────────────────────────────────────
@@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() {
assert!(!hit.stale, "lexical retriever must default stale=false");
}
+// ── TestEnv helper for fb-36 filter tests ───────────────────────────────
+
+/// Fixture wrapper around `Env` for the fb-36 filter tests; kept separate from `Env` so the
+/// original tests stay untouched. `counter` feeds `next_id`, which needs interior mutability
+/// because every helper takes `&self`.
+struct TestEnv {
+    inner: Env,
+    counter: std::cell::Cell<u32>,
+}
+
+impl TestEnv {
+    /// Build a fresh environment whose ID counter starts at zero.
+    fn new() -> Self {
+        let inner = Env::new();
+        let counter = std::cell::Cell::new(0);
+        Self { inner, counter }
+    }
+
+    /// Hand out the next unique 32-hex ID for a doc / chunk / asset. The shared
+    /// counter only ever grows, so callers never have to think about collisions.
+    fn next_id(&self, prefix: &str) -> String {
+        // `Cell::replace` stores the bumped value and hands back the previous one.
+        let current = self.counter.replace(self.counter.get() + 1);
+        // Zero-padded to four digits (e.g. `doc0007`), then passed through `id32`.
+        id32(&format!("{prefix}{current:04}"))
+    }
+
+ /// Insert a `MediaType::Markdown` doc at `path` whose chunk text is `body`; returns its `DocumentId`.
+ fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
+ self.insert_doc_with_media(path, body, MediaType::Markdown)
+ }
+
+    /// Insert a doc whose `assets.media_type` holds the JSON serialization of `media`.
+    fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId {
+        let now = OffsetDateTime::now_utc(); // `documents.updated_at` defaults to "now"
+        self.insert_doc_full(path, body, media, now)
+    }
+
+ /// Insert a markdown doc whose `documents.updated_at` is pinned to `updated_at`,
+ /// so `ingested_after` filter tests control which side of the cutoff it lands on.
+ fn insert_doc_with_updated_at(
+ &self,
+ path: &str,
+ body: &str,
+ updated_at: OffsetDateTime,
+ ) -> DocumentId {
+ self.insert_doc_full(path, body, MediaType::Markdown, updated_at)
+ }
+ /// Shared fixture writer: inserts one asset, one document and one chunk (text = `body`) via raw SQL.
+ fn insert_doc_full(
+ &self,
+ path: &str,
+ body: &str,
+ media: MediaType,
+ updated_at: OffsetDateTime,
+ ) -> DocumentId {
+ use time::format_description::well_known::Rfc3339;
+ let doc_id = self.next_id("doc");
+ let chunk_id = self.next_id("chk");
+ let asset_id = self.next_id("ast");
+ let media_json = serde_json::to_string(&media).expect("serialize MediaType");
+ let updated_at_str = updated_at.format(&Rfc3339).expect("format updated_at");
+ // Asset row: its `media_type` column stores the serialized `MediaType` JSON.
+ let conn = self.inner.raw_conn();
+ conn.execute(
+ "INSERT OR IGNORE INTO assets (
+ asset_id, source_uri, workspace_path, media_type, byte_len,
+ checksum, storage_kind, storage_path, discovered_at
+ ) VALUES (?, ?, ?, ?, 0,
+ 'd0', 'reference', ?, '2024-01-01T00:00:00Z')",
+ rusqlite::params![asset_id, format!("file:///{path}"), path, media_json, path],
+ )
+ .expect("insert asset");
+ // Document row: points at the asset and carries the caller-supplied `updated_at`.
+ conn.execute(
+ "INSERT INTO documents (
+ doc_id, asset_id, workspace_path, title, lang,
+ source_type, trust_level, parser_version,
+ doc_version, schema_version, metadata_json,
+ provenance_json, created_at, updated_at
+ ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
+ '{}', '{\"events\":[]}',
+ '2024-01-01T00:00:00Z', ?)",
+ rusqlite::params![doc_id, asset_id, path, updated_at_str],
+ )
+ .expect("insert document");
+ // One chunk per document holding `body`, with an empty heading path.
+ let empty_headings: Vec<&str> = vec![];
+ let heading_json = serde_json::to_string(&empty_headings).unwrap();
+ conn.execute(
+ "INSERT INTO chunks (
+ chunk_id, doc_id, text, heading_path_json, section_label,
+ source_spans_json, token_estimate, chunker_version,
+ policy_hash, block_ids_json, created_at
+ ) VALUES (?, ?, ?, ?, NULL,
+ '[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
+ 1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
+ rusqlite::params![chunk_id, doc_id, body, heading_json],
+ )
+ .expect("insert chunk");
+
+ DocumentId(doc_id)
+ }
+
+    /// Run a lexical search for `query` (k = 10) under `filters`; panics if the retriever errors.
+    fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
+        let q = SearchQuery {
+            text: query.to_string(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: filters.clone(),
+        };
+        self.inner.retriever().search(&q).expect("search")
+    }
+}
+
+// ── fb-36 filter tests ───────────────────────────────────────────────────
+
+#[test]
+fn lexical_filter_by_media() {
+    let env = TestEnv::new();
+    // Both bodies contain "rust", so only the media filter can tell the docs apart.
+    env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
+    env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);
+    let mut filters = SearchFilters::default();
+    filters.media.push("pdf".to_string());
+    let hits = env.run_search("rust", &filters);
+    assert_eq!(hits.len(), 1, "only pdf doc should match");
+    let path = &hits[0].doc_path.0;
+    assert!(path.ends_with(".pdf"), "got: {}", path);
+}
+
+#[test]
+fn lexical_filter_by_ingested_after() {
+    let env = TestEnv::new();
+    // One doc on each side of the 2025-01-01 cutoff, identical text.
+    for (path, updated_at) in [
+        ("old.md", time::macros::datetime!(2020-01-01 00:00:00 UTC)),
+        ("new.md", time::macros::datetime!(2026-01-01 00:00:00 UTC)),
+    ] {
+        env.insert_doc_with_updated_at(path, "ingest test", updated_at);
+    }
+    let filters = SearchFilters {
+        ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
+        ..Default::default()
+    };
+    let hits = env.run_search("ingest", &filters);
+    assert_eq!(hits.len(), 1, "only post-2025 doc matches");
+    // Pin WHICH doc survived: a count-only check would also pass if the comparison were
+    // inverted (`<=`), because the old doc alone would then be returned.
+    assert!(hits[0].doc_path.0.ends_with("new.md"), "got: {}", hits[0].doc_path.0);
+}
+
+#[test]
+fn lexical_filter_by_doc_id() {
+    let env = TestEnv::new();
+    // Two docs share the same text, so only the doc_id filter can single one out.
+    let target = env.insert_doc("a.md", "shared term");
+    env.insert_doc("b.md", "shared term");
+    let filters = SearchFilters {
+        doc_id: Some(target.clone()),
+        ..Default::default()
+    };
+    let hits = env.run_search("shared", &filters);
+    assert!(!hits.is_empty(), "should get at least one hit for target doc");
+    // Every surviving hit has to belong to the scoped document.
+    hits.iter().for_each(|hit| assert_eq!(hit.doc_id, target, "all hits must be from target doc"));
+}
+
+#[test]
+fn lexical_filter_combinator_is_and() {
+    let env = TestEnv::new();
+    let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown);
+    env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf);
+    let mut filters = SearchFilters {
+        media: vec!["markdown".to_string()],
+        doc_id: Some(target.clone()),
+        ..Default::default()
+    };
+    assert!(env.run_search("rust", &filters).iter().any(|h| h.doc_id == target), "target missing");
+    filters.media = vec!["pdf".to_string()]; // target is markdown, so this contradicts doc_id
+    assert!(env.run_search("rust", &filters).is_empty(), "OR would leak hits; AND must be empty");
+}
+
+#[test]
+fn lexical_filter_unknown_media_returns_empty() {
+    let env = TestEnv::new();
+    env.insert_doc("a.md", "rust");
+    // No inserted asset carries this kind, so the filter must drop every row,
+    // and it must do so quietly: an empty result rather than a search error.
+    let mut filters = SearchFilters::default();
+    filters.media.push("nonexistent_kind".to_string());
+    let hits = env.run_search("rust", &filters);
+    assert!(hits.is_empty(), "unknown media → no hits, no error");
+}
+/// With `SearchFilters::default()` the search must still return the matching document.
+#[test]
+fn lexical_empty_filters_match_default_behavior() {
+ let env = TestEnv::new();
+ env.insert_doc("a.md", "rust");
+ let with_default = env.run_search("rust", &SearchFilters::default());
+ assert!(!with_default.is_empty());
+}
+
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of