feat(search/lexical): media / ingested_after / doc_id filters (fb-36)

Extends the SQL WHERE clause. `media` uses `CASE WHEN json_type(...) = 'text'` to handle both the unit (`"markdown"`) and tuple (`{"image":"png"}`) MediaType serde shapes. `ingested_after` relies on RFC3339 lexicographic ordering with UTC `Z` (per the fb-32 ingest invariant). `doc_id` is a simple equality. All three are AND-combined with the existing tags / lang / trust filters.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 03:41:02 +09:00
parent d3f38c76e9
commit 2c80e2ad91
2 changed files with 253 additions and 1 deletions
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -319,6 +319,50 @@ fn run_query(
        };
        params.push(Box::new(rank));
    }
+    // p9-fb-36: media_type filter (IN-list).
+    // `assets.media_type` JSON has two shapes:
+    //   - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
+    //   - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
+    //     e.g. `{"image": "png"}`
+    // Extract a unified "kind" string for both shapes via:
+    //   CASE WHEN json_type = 'text' THEN json_extract($)
+    //        ELSE (first object key)
+    //   END IN (?, ...)
+    if !filters.media.is_empty() {
+        let placeholders: Vec<&str> =
+            std::iter::repeat("?").take(filters.media.len()).collect();
+        let placeholders = placeholders.join(",");
+        sql.push_str(&format!(
+            " AND f.doc_id IN (\
+               SELECT d2.doc_id FROM documents d2 \
+               JOIN assets a ON a.asset_id = d2.asset_id \
+               WHERE CASE \
+                 WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
+                 ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
+               END IN ({placeholders}))"
+        ));
+        for kind in &filters.media {
+            params.push(Box::new(kind.clone()));
+        }
+    }
+
+    // p9-fb-36: ingested_after filter.
+    // `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
+    // fb-32 ingest path), so a lexicographic `>=` compare is only correct
+    // when the bound is rendered in UTC too. An `OffsetDateTime` keeps the
+    // caller's offset, and formatting e.g. a `+09:00` value verbatim would
+    // compare a local wall-clock string against UTC strings — so normalize
+    // the bound to UTC before formatting.
+    // NOTE(review): RFC3339 output has a variable-width fractional part
+    // (omitted when zero) and `.` sorts before `Z`, so rows inside the same
+    // wall-clock second as the bound may compare out of order — confirm
+    // whether the fb-32 ingest path stores a fixed sub-second precision.
+    if let Some(after) = &filters.ingested_after {
+        let formatted = after
+            .to_offset(time::UtcOffset::UTC)
+            .format(&time::format_description::well_known::Rfc3339)
+            .expect("OffsetDateTime formats to RFC3339");
+        sql.push_str(" AND d.updated_at >= ?");
+        params.push(Box::new(formatted));
+    }
+
+    // p9-fb-36: doc_id filter — single-doc scoping.
+    if let Some(id) = &filters.doc_id {
+        sql.push_str(" AND d.doc_id = ?");
+        params.push(Box::new(id.0.clone()));
+    }
+
    // path_glob is intentionally NOT applied here — see module comment
    // on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.

--- a/crates/kebab-search/tests/lexical.rs
+++ b/crates/kebab-search/tests/lexical.rs
@@ -8,11 +8,15 @@
 use std::sync::Arc;

 use kebab_config::Config;
-use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
+use kebab_core::{
+    DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode,
+    SearchQuery, TrustLevel,
+};
 use kebab_search::LexicalRetriever;
 use kebab_store_sqlite::SqliteStore;
 use rusqlite::Connection;
 use tempfile::TempDir;
+use time::OffsetDateTime;

 // ── Test scaffolding ─────────────────────────────────────────────────────

@@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() {
    assert!(!hit.stale, "lexical retriever must default stale=false");
 }

+// ── TestEnv helper for fb-36 filter tests ───────────────────────────────
+
+/// Convenience wrapper over `Env` that exposes higher-level fixture helpers
+/// for the fb-36 filter tests.  Intentionally kept separate from `Env` so
+/// the original tests are untouched.
+struct TestEnv {
+    /// Wrapped base environment; supplies the raw connection used for
+    /// fixture inserts and the retriever used to run searches.
+    inner: Env,
+    /// Running sequence number consumed by `next_id`; a `Cell` so that the
+    /// helpers can advance it through `&self`.
+    counter: std::cell::Cell<u32>,
+}
+
+impl TestEnv {
+    /// Build a wrapper around a fresh `Env` with the ID counter at zero.
+    fn new() -> Self {
+        let inner = Env::new();
+        let counter = std::cell::Cell::new(0);
+        Self { inner, counter }
+    }
+
+    /// Hand out the next ID for `prefix`.
+    ///
+    /// The shared counter is advanced by one; its previous value, zero-padded
+    /// to four digits, is appended to `prefix` and the result is passed
+    /// through `id32`, so callers never have to pick IDs by hand.
+    fn next_id(&self, prefix: &str) -> String {
+        let seq = self.counter.replace(self.counter.get() + 1);
+        id32(&format!("{prefix}{seq:04}"))
+    }
+
+    /// Insert a `MediaType::Markdown` doc containing `body`; returns its
+    /// `DocumentId`.
+    fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
+        self.insert_doc_with_media(path, body, MediaType::Markdown)
+    }
+
+    /// Insert a doc whose `assets.media_type` column holds the JSON
+    /// serialization of `media`.  `documents.updated_at` is set to
+    /// `OffsetDateTime::now_utc()`.
+    fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId {
+        self.insert_doc_full(path, body, media, OffsetDateTime::now_utc())
+    }
+
+    /// Insert a markdown doc with a caller-chosen `updated_at` (used by the
+    /// `ingested_after` filter tests).
+    fn insert_doc_with_updated_at(
+        &self,
+        path: &str,
+        body: &str,
+        updated_at: OffsetDateTime,
+    ) -> DocumentId {
+        self.insert_doc_full(path, body, MediaType::Markdown, updated_at)
+    }
+
+    /// Shared fixture writer behind the `insert_doc*` helpers.
+    ///
+    /// Writes one `assets` row, one `documents` row and one `chunks` row
+    /// (whose text is `body`) through the raw connection, then returns the
+    /// new document's ID.  Panics if serialization, timestamp formatting or
+    /// any insert fails.
+    fn insert_doc_full(
+        &self,
+        path: &str,
+        body: &str,
+        media: MediaType,
+        updated_at: OffsetDateTime,
+    ) -> DocumentId {
+        use time::format_description::well_known::Rfc3339;
+
+        // IDs are always drawn in doc → chunk → asset order.
+        let doc_key = self.next_id("doc");
+        let chunk_key = self.next_id("chk");
+        let asset_key = self.next_id("ast");
+
+        // Everything that is bound as a SQL parameter is prepared up front.
+        let media_json = serde_json::to_string(&media).expect("serialize MediaType");
+        let updated_at_text = updated_at.format(&Rfc3339).expect("format updated_at");
+        let source_uri = format!("file:///{path}");
+        let headings_json = serde_json::to_string(&Vec::<&str>::new()).unwrap();
+
+        let db = self.inner.raw_conn();
+
+        // 1. Asset row — carries the serialized media type.
+        db.execute(
+            "INSERT OR IGNORE INTO assets (
+                asset_id, source_uri, workspace_path, media_type, byte_len,
+                checksum, storage_kind, storage_path, discovered_at
+            ) VALUES (?, ?, ?, ?, 0,
+                      'd0', 'reference', ?, '2024-01-01T00:00:00Z')",
+            rusqlite::params![asset_key, source_uri, path, media_json, path],
+        )
+        .expect("insert asset");
+
+        // 2. Document row — carries the caller-supplied `updated_at`.
+        db.execute(
+            "INSERT INTO documents (
+                doc_id, asset_id, workspace_path, title, lang,
+                source_type, trust_level, parser_version,
+                doc_version, schema_version, metadata_json,
+                provenance_json, created_at, updated_at
+            ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
+                      '{}', '{\"events\":[]}',
+                      '2024-01-01T00:00:00Z', ?)",
+            rusqlite::params![doc_key, asset_key, path, updated_at_text],
+        )
+        .expect("insert document");
+
+        // 3. Single chunk row holding the searchable text, with no headings.
+        db.execute(
+            "INSERT INTO chunks (
+                chunk_id, doc_id, text, heading_path_json, section_label,
+                source_spans_json, token_estimate, chunker_version,
+                policy_hash, block_ids_json, created_at
+            ) VALUES (?, ?, ?, ?, NULL,
+                      '[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
+                      1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
+            rusqlite::params![chunk_key, doc_key, body, headings_json],
+        )
+        .expect("insert chunk");
+
+        DocumentId(doc_key)
+    }
+
+    /// Run a lexical search for `query` with `k = 10` and the given
+    /// `filters`; panics if the retriever returns an error.
+    fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
+        let request = SearchQuery {
+            text: query.to_owned(),
+            mode: SearchMode::Lexical,
+            k: 10,
+            filters: filters.clone(),
+        };
+        self.inner.retriever().search(&request).expect("search")
+    }
+}
+
+// ── fb-36 filter tests ───────────────────────────────────────────────────
+
+#[test]
+fn lexical_filter_by_media() {
+    // One markdown and one PDF document, both containing the query term.
+    let env = TestEnv::new();
+    env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
+    env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);
+
+    let pdf_only = SearchFilters {
+        media: vec![String::from("pdf")],
+        ..Default::default()
+    };
+    let hits = env.run_search("rust", &pdf_only);
+
+    // Only the PDF may survive the media filter.
+    assert_eq!(hits.len(), 1, "only pdf doc should match");
+    let path = &hits[0].doc_path.0;
+    assert!(path.ends_with(".pdf"), "got: {}", path);
+}
+
+#[test]
+fn lexical_filter_by_ingested_after() {
+    // Two documents with identical text, one on each side of the cutoff.
+    let env = TestEnv::new();
+    env.insert_doc_with_updated_at(
+        "old.md",
+        "ingest test",
+        time::macros::datetime!(2020-01-01 00:00:00 UTC),
+    );
+    env.insert_doc_with_updated_at(
+        "new.md",
+        "ingest test",
+        time::macros::datetime!(2026-01-01 00:00:00 UTC),
+    );
+    let filters = SearchFilters {
+        ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
+        ..Default::default()
+    };
+    let hits = env.run_search("ingest", &filters);
+    assert_eq!(hits.len(), 1, "only post-2025 doc matches");
+    // Pin WHICH document survived: a count-only check would also pass if the
+    // comparison were inverted and only `old.md` came back.
+    assert!(hits[0].doc_path.0.ends_with("new.md"), "got: {}", hits[0].doc_path.0);
+}
+
+#[test]
+fn lexical_filter_by_doc_id() {
+    let env = TestEnv::new();
+    // Both documents contain the query term; only the first is the target.
+    let target = env.insert_doc("a.md", "shared term");
+    let _other = env.insert_doc("b.md", "shared term");
+
+    let scoped = SearchFilters {
+        doc_id: Some(target.clone()),
+        ..Default::default()
+    };
+    let hits = env.run_search("shared", &scoped);
+
+    assert!(!hits.is_empty(), "should get at least one hit for target doc");
+    hits.iter()
+        .for_each(|hit| assert_eq!(hit.doc_id, target, "all hits must be from target doc"));
+}
+
+#[test]
+fn lexical_filter_combinator_is_and() {
+    let env = TestEnv::new();
+    let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown);
+    env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf);
+
+    // Case 1 — the filters agree: `target` is markdown, so it must come back
+    // and nothing else may.
+    let filters = SearchFilters {
+        media: vec!["markdown".to_string()],
+        doc_id: Some(target.clone()),
+        ..Default::default()
+    };
+    let hits = env.run_search("rust", &filters);
+    assert!(!hits.is_empty(), "target doc should match combined filter");
+    assert!(hits.iter().all(|h| h.doc_id == target));
+
+    // Case 2 — the filters disagree: no document is both a PDF and `target`.
+    // Case 1 alone cannot tell AND from OR (both give the same result there);
+    // here an OR combinator would return both documents, AND returns none.
+    let conflicting = SearchFilters {
+        media: vec!["pdf".to_string()],
+        doc_id: Some(target),
+        ..Default::default()
+    };
+    let hits = env.run_search("rust", &conflicting);
+    assert!(hits.is_empty(), "media=pdf AND doc_id=<markdown doc> must match nothing");
+}
+
+#[test]
+fn lexical_filter_unknown_media_returns_empty() {
+    let env = TestEnv::new();
+    env.insert_doc("a.md", "rust");
+
+    // A media kind that no stored asset uses.
+    let bogus_kind = SearchFilters {
+        media: vec![String::from("nonexistent_kind")],
+        ..Default::default()
+    };
+
+    assert!(
+        env.run_search("rust", &bogus_kind).is_empty(),
+        "unknown media → no hits, no error"
+    );
+}
+
+#[test]
+fn lexical_empty_filters_match_default_behavior() {
+    let env = TestEnv::new();
+    env.insert_doc("a.md", "rust");
+
+    // With every filter left at its default, the matching doc is still found.
+    let unfiltered = SearchFilters::default();
+    let hits = env.run_search("rust", &unfiltered);
+    assert!(!hits.is_empty());
+}
+
 #[test]
 fn lexical_snapshot_run_1() {
    // Pinned snapshot. A small, deterministic corpus; the JSON shape of