From 2c80e2ad915e021dd73f4571ffa61aeb291341a7 Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sun, 10 May 2026 03:41:02 +0900 Subject: [PATCH] feat(search/lexical): media / ingested_after / doc_id filters (fb-36) SQL WHERE clause extension. media uses CASE WHEN json_type='text' to handle both unit (`"markdown"`) and tuple (`{"image":"png"}`) MediaType serde shapes. ingested_after relies on RFC3339 lexicographic ordering with UTC Z (per fb-32 ingest invariant). doc_id is a simple equality. AND combinator with existing tags / lang / trust filters. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-search/src/lexical.rs | 44 ++++++ crates/kebab-search/tests/lexical.rs | 210 ++++++++++++++++++++++++++- 2 files changed, 253 insertions(+), 1 deletion(-) diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 513fb9e..954148c 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -319,6 +319,50 @@ fn run_query( }; params.push(Box::new(rank)); } + // p9-fb-36: media_type filter (IN-list). + // `assets.media_type` JSON has two shapes: + // - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"` + // - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object, + // e.g. `{"image": "png"}` + // Extract a unified "kind" string for both shapes via: + // CASE WHEN json_type = 'text' THEN json_extract($) + // ELSE (first object key) + // END IN (?, ...) 
+ if !filters.media.is_empty() { + let placeholders: Vec<&str> = + std::iter::repeat("?").take(filters.media.len()).collect(); + let placeholders = placeholders.join(","); + sql.push_str(&format!( + " AND f.doc_id IN (\ + SELECT d2.doc_id FROM documents d2 \ + JOIN assets a ON a.asset_id = d2.asset_id \ + WHERE CASE \ + WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \ + ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \ + END IN ({placeholders}))" + )); + for kind in &filters.media { + params.push(Box::new(kind.clone())); + } + } + + // p9-fb-36: ingested_after filter. + // `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per + // fb-32 ingest path), so lexicographic >= compare is correct. + if let Some(after) = &filters.ingested_after { + let formatted = after + .format(&time::format_description::well_known::Rfc3339) + .expect("OffsetDateTime formats to RFC3339"); + sql.push_str(" AND d.updated_at >= ?"); + params.push(Box::new(formatted)); + } + + // p9-fb-36: doc_id filter — single-doc scoping. + if let Some(id) = &filters.doc_id { + sql.push_str(" AND d.doc_id = ?"); + params.push(Box::new(id.0.clone())); + } + // path_glob is intentionally NOT applied here — see module comment // on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`. 
diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs index ae01460..4265160 100644 --- a/crates/kebab-search/tests/lexical.rs +++ b/crates/kebab-search/tests/lexical.rs @@ -8,11 +8,15 @@ use std::sync::Arc; use kebab_config::Config; -use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel}; +use kebab_core::{ + DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode, + SearchQuery, TrustLevel, +}; use kebab_search::LexicalRetriever; use kebab_store_sqlite::SqliteStore; use rusqlite::Connection; use tempfile::TempDir; +use time::OffsetDateTime; // ── Test scaffolding ───────────────────────────────────────────────────── @@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() { assert!(!hit.stale, "lexical retriever must default stale=false"); } +// ── TestEnv helper for fb-36 filter tests ─────────────────────────────── + +/// Convenience wrapper over `Env` that exposes higher-level fixture helpers +/// for the fb-36 filter tests. Intentionally kept separate from `Env` so +/// the original tests are untouched. +struct TestEnv { + inner: Env, + counter: std::cell::Cell<u64>, +} + +impl TestEnv { + fn new() -> Self { + Self { + inner: Env::new(), + counter: std::cell::Cell::new(0), + } + } + + /// Allocate a fresh monotone counter suffix so every inserted doc / chunk + /// gets a unique 32-hex ID without the caller worrying about collisions. + fn next_id(&self, prefix: &str) -> String { + let n = self.counter.get(); + self.counter.set(n + 1); + let suffix = format!("{prefix}{n:04}"); + id32(&suffix) + } + + /// Insert a markdown doc with the given `body` and return its `DocumentId`. + fn insert_doc(&self, path: &str, body: &str) -> DocumentId { + self.insert_doc_with_media(path, body, MediaType::Markdown) + } + + /// Insert a doc whose `assets.media_type` JSON is set to the serialized + /// form of `media`. 
The `documents.updated_at` defaults to now. + fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId { + self.insert_doc_full(path, body, media, OffsetDateTime::now_utc()) + } + + /// Insert a doc with an explicit `updated_at` timestamp (for + /// `ingested_after` filter tests). + fn insert_doc_with_updated_at( + &self, + path: &str, + body: &str, + updated_at: OffsetDateTime, + ) -> DocumentId { + self.insert_doc_full(path, body, MediaType::Markdown, updated_at) + } + + fn insert_doc_full( + &self, + path: &str, + body: &str, + media: MediaType, + updated_at: OffsetDateTime, + ) -> DocumentId { + use time::format_description::well_known::Rfc3339; + let doc_id = self.next_id("doc"); + let chunk_id = self.next_id("chk"); + let asset_id = self.next_id("ast"); + let media_json = serde_json::to_string(&media).expect("serialize MediaType"); + let updated_at_str = updated_at.format(&Rfc3339).expect("format updated_at"); + + let conn = self.inner.raw_conn(); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, 0, + 'd0', 'reference', ?, '2024-01-01T00:00:00Z')", + rusqlite::params![asset_id, format!("file:///{path}"), path, media_json, path], + ) + .expect("insert asset"); + + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, + source_type, trust_level, parser_version, + doc_version, schema_version, metadata_json, + provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1, + '{}', '{\"events\":[]}', + '2024-01-01T00:00:00Z', ?)", + rusqlite::params![doc_id, asset_id, path, updated_at_str], + ) + .expect("insert document"); + + let empty_headings: Vec<&str> = vec![]; + let heading_json = serde_json::to_string(&empty_headings).unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, 
heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, ?, ?, NULL, + '[{\"kind\":\"line\",\"start\":1,\"end\":1}]', + 1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')", + rusqlite::params![chunk_id, doc_id, body, heading_json], + ) + .expect("insert chunk"); + + DocumentId(doc_id) + } + + fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> { + let r = self.inner.retriever(); + let q = SearchQuery { + text: query.to_string(), + mode: SearchMode::Lexical, + k: 10, + filters: filters.clone(), + }; + r.search(&q).expect("search") + } +} + +// ── fb-36 filter tests ─────────────────────────────────────────────────── + +#[test] +fn lexical_filter_by_media() { + let env = TestEnv::new(); + env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown); + env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf); + let filters = SearchFilters { + media: vec!["pdf".to_string()], + ..Default::default() + }; + let hits = env.run_search("rust", &filters); + assert_eq!(hits.len(), 1, "only pdf doc should match"); + assert!(hits[0].doc_path.0.ends_with(".pdf"), "got: {}", hits[0].doc_path.0); +} + +#[test] +fn lexical_filter_by_ingested_after() { + let env = TestEnv::new(); + env.insert_doc_with_updated_at( + "old.md", + "ingest test", + time::macros::datetime!(2020-01-01 00:00:00 UTC), + ); + env.insert_doc_with_updated_at( + "new.md", + "ingest test", + time::macros::datetime!(2026-01-01 00:00:00 UTC), + ); + let filters = SearchFilters { + ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)), + ..Default::default() + }; + let hits = env.run_search("ingest", &filters); + assert_eq!(hits.len(), 1, "only post-2025 doc matches"); +} + +#[test] +fn lexical_filter_by_doc_id() { + let env = TestEnv::new(); + let target = env.insert_doc("a.md", "shared term"); + env.insert_doc("b.md", "shared term"); + let filters = SearchFilters { + doc_id: 
Some(target.clone()), + ..Default::default() + }; + let hits = env.run_search("shared", &filters); + assert!(!hits.is_empty(), "should get at least one hit for target doc"); + for h in &hits { + assert_eq!(h.doc_id, target, "all hits must be from target doc"); + } +} + +#[test] +fn lexical_filter_combinator_is_and() { + let env = TestEnv::new(); + let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown); + env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf); + let filters = SearchFilters { + media: vec!["markdown".to_string()], + doc_id: Some(target.clone()), + ..Default::default() + }; + let hits = env.run_search("rust", &filters); + assert!(!hits.is_empty(), "target doc should match combined filter"); + assert!(hits.iter().all(|h| h.doc_id == target)); +} + +#[test] +fn lexical_filter_unknown_media_returns_empty() { + let env = TestEnv::new(); + env.insert_doc("a.md", "rust"); + let filters = SearchFilters { + media: vec!["nonexistent_kind".to_string()], + ..Default::default() + }; + let hits = env.run_search("rust", &filters); + assert!(hits.is_empty(), "unknown media → no hits, no error"); +} + +#[test] +fn lexical_empty_filters_match_default_behavior() { + let env = TestEnv::new(); + env.insert_doc("a.md", "rust"); + let with_default = env.run_search("rust", &SearchFilters::default()); + assert!(!with_default.is_empty()); +} + #[test] fn lexical_snapshot_run_1() { // Pinned snapshot. A small, deterministic corpus; the JSON shape of