diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 43b4d26..67c21d5 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -346,6 +346,34 @@ fn run_query( } } + // p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter + // (IN-list on metadata_json.$.code_lang). Empty Vec = no filter. + if !filters.code_lang.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.code_lang.len()) + .collect::<Vec<_>>() + .join(","); + sql.push_str(&format!( + " AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})" + )); + for lang in &filters.code_lang { + params.push(Box::new(lang.clone())); + } + } + + // p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter + // (IN-list on metadata_json.$.repo). Empty Vec = no filter. + if !filters.repo.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.repo.len()) + .collect::<Vec<_>>() + .join(","); + sql.push_str(&format!( + " AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})" + )); + for repo in &filters.repo { + params.push(Box::new(repo.clone())); + } + } + // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per // fb-32 ingest path), so lexicographic >= compare is correct — but only diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs index ba8f9b8..7fcf5a5 100644 --- a/crates/kebab-search/tests/lexical.rs +++ b/crates/kebab-search/tests/lexical.rs @@ -785,6 +785,19 @@ impl TestEnv { body: &str, media: MediaType, updated_at: OffsetDateTime, + ) -> DocumentId { + self.insert_doc_full_with_metadata(path, body, media, updated_at, "{}") + } + + /// Like `insert_doc_full` but accepts an explicit `metadata_json` string + /// so p10-1A-1 filter tests can set `metadata.code_lang` / `metadata.repo`. 
+ fn insert_doc_full_with_metadata( + &self, + path: &str, + body: &str, + media: MediaType, + updated_at: OffsetDateTime, + metadata_json: &str, ) -> DocumentId { use time::format_description::well_known::Rfc3339; let doc_id = self.next_id("doc"); @@ -810,10 +823,10 @@ impl TestEnv { source_type, trust_level, parser_version, doc_version, schema_version, metadata_json, provenance_json, created_at, updated_at - ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1, - '{}', '{\"events\":[]}', + ) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'pv1', 1, 1, + ?, '{\"events\":[]}', '2024-01-01T00:00:00Z', ?)", - rusqlite::params![doc_id, asset_id, path, updated_at_str], + rusqlite::params![doc_id, asset_id, path, metadata_json, updated_at_str], ) .expect("insert document"); @@ -834,6 +847,21 @@ impl TestEnv { DocumentId(doc_id) } + /// Insert a code doc with explicit `code_lang` and optional `repo` in metadata. + fn insert_code_doc(&self, path: &str, body: &str, code_lang: &str, repo: Option<&str>) -> DocumentId { + let metadata_json = match repo { + Some(r) => format!(r#"{{"code_lang":"{code_lang}","repo":"{r}"}}"#), + None => format!(r#"{{"code_lang":"{code_lang}"}}"#), + }; + self.insert_doc_full_with_metadata( + path, + body, + MediaType::Markdown, + OffsetDateTime::now_utc(), + &metadata_json, + ) + } + fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec { let r = self.inner.retriever(); let q = SearchQuery { @@ -934,6 +962,52 @@ fn lexical_empty_filters_match_default_behavior() { assert!(!with_default.is_empty()); } +// ── p10-1A-1 filter tests ──────────────────────────────────────────────── + +#[test] +fn lexical_filter_by_code_lang() { + // Three docs: python code, rust code, markdown (no code_lang). + // Filter code_lang=["python"] → only the python doc should match. 
+ let env = TestEnv::new(); + env.insert_code_doc("src/main.py", "AsyncClient session", "python", None); + env.insert_code_doc("src/lib.rs", "AsyncClient session", "rust", None); + env.insert_doc("docs/guide.md", "AsyncClient session"); + + let filters = SearchFilters { + code_lang: vec!["python".to_string()], + ..Default::default() + }; + let hits = env.run_search("AsyncClient", &filters); + assert_eq!(hits.len(), 1, "only python doc should match code_lang filter"); + assert!( + hits[0].doc_path.0.ends_with(".py"), + "expected python path, got: {}", + hits[0].doc_path.0 + ); +} + +#[test] +fn lexical_filter_by_repo() { + // Three docs: one in repo "httpx", one in repo "requests", one with no repo. + // Filter repo=["httpx"] → only the httpx doc should match. + let env = TestEnv::new(); + env.insert_code_doc("httpx/client.py", "session send request", "python", Some("httpx")); + env.insert_code_doc("requests/api.py", "session send request", "python", Some("requests")); + env.insert_code_doc("standalone.py", "session send request", "python", None); + + let filters = SearchFilters { + repo: vec!["httpx".to_string()], + ..Default::default() + }; + let hits = env.run_search("session", &filters); + assert_eq!(hits.len(), 1, "only httpx doc should match repo filter"); + assert!( + hits[0].doc_path.0.starts_with("httpx/"), + "expected httpx path, got: {}", + hits[0].doc_path.0 + ); +} + #[test] fn lexical_snapshot_run_1() { // Pinned snapshot. A small, deterministic corpus; the JSON shape of diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs index 9519879..9c68829 100644 --- a/crates/kebab-store-sqlite/src/filters.rs +++ b/crates/kebab-store-sqlite/src/filters.rs @@ -153,6 +153,34 @@ impl SqliteStore { } } + // p10-1A-1 fix (dogfood-discovered 2026-05-20): code_lang filter + // (IN-list on metadata_json.$.code_lang). Empty Vec = no filter. 
+ if !filters.code_lang.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.code_lang.len()) + .collect::<Vec<_>>() + .join(","); + sql.push_str(&format!( + " AND json_extract(d.metadata_json, '$.code_lang') IN ({placeholders})" + )); + for lang in &filters.code_lang { + bind.push(Box::new(lang.clone())); + } + } + + // p10-1A-1 fix (dogfood-discovered 2026-05-20): repo filter + // (IN-list on metadata_json.$.repo). Empty Vec = no filter. + if !filters.repo.is_empty() { + let placeholders = std::iter::repeat_n("?", filters.repo.len()) + .collect::<Vec<_>>() + .join(","); + sql.push_str(&format!( + " AND json_extract(d.metadata_json, '$.repo') IN ({placeholders})" + )); + for repo in &filters.repo { + bind.push(Box::new(repo.clone())); + } + } + + // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32); // lexicographic >= compare is correct — but only when the filter @@ -408,6 +436,78 @@ mod tests { .unwrap(); } + /// Variant of `seed_committed_full` that additionally accepts a + /// `metadata_json` string so p10-1A-1 filter tests can set + /// `metadata.code_lang` / `metadata.repo` without going through the + /// full ingest pipeline. 
+ #[allow(clippy::too_many_arguments)] + fn seed_committed_with_metadata( + store: &SqliteStore, + chunk_id: &str, + doc_id: &str, + workspace_path: &str, + media_type_json: &str, + metadata_json: &str, + ) { + let asset_id = format!("a{}", &doc_id[..31]); + { + let conn = store.lock_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', ?, '1970-01-01T00:00:00Z')", + params![ + asset_id, + format!("file://{workspace_path}"), + workspace_path, + media_type_json, + workspace_path, + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'code', 'primary', 'v1', 1, 1, + ?, '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, workspace_path, metadata_json], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'code snippet', '[]', NULL, '[]', 1, 'v1', 'h', '[]', + '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id], + ) + .unwrap(); + } + + let embed_row = EmbeddingRecordRow { + embedding_id: format!("e{}", &chunk_id[..31]), + chunk_id: chunk_id.to_string(), + model_id: "m".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "t".to_string(), + created_at: OffsetDateTime::UNIX_EPOCH, + }; + store + .put_embedding_records_pending(std::slice::from_ref(&embed_row)) + .unwrap(); + store + .mark_embedding_records_committed(std::slice::from_ref( + &embed_row.embedding_id, + )) + .unwrap(); + } + fn cid(s: &str) -> ChunkId { 
ChunkId(s.to_string()) } @@ -671,6 +771,78 @@ mod tests { assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only"); } + // ── p10-1A-1 new filter arms ───────────────────────────────────────── + + #[test] + fn filter_chunks_code_lang_keeps_matching_lang() { + // c1 = python, c2 = rust, c3 = markdown (no code_lang). + // Filter code_lang=["python"] → only c1 survives. + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + let c2 = "22222222222222222222222222222222"; + let c3 = "33333333333333333333333333333333"; + seed_committed_with_metadata( + &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "src/main.py", r#""code""#, + r#"{"code_lang":"python"}"#, + ); + seed_committed_with_metadata( + &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "src/lib.rs", r#""code""#, + r#"{"code_lang":"rust"}"#, + ); + seed_committed_with_metadata( + &store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "README.md", r#""markdown""#, + r#"{}"#, + ); + + let f = SearchFilters { + code_lang: vec!["python".to_string()], + ..Default::default() + }; + let out = store + .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) + .unwrap(); + assert_eq!(out, vec![cid(c1)], "only python chunk should survive code_lang filter"); + } + + #[test] + fn filter_chunks_repo_keeps_matching_repo() { + // c1 = repo "httpx", c2 = repo "requests", c3 = no repo. + // Filter repo=["httpx"] → only c1 survives. 
+ let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + let c2 = "22222222222222222222222222222222"; + let c3 = "33333333333333333333333333333333"; + seed_committed_with_metadata( + &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "httpx/client.py", r#""code""#, + r#"{"repo":"httpx","code_lang":"python"}"#, + ); + seed_committed_with_metadata( + &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "requests/api.py", r#""code""#, + r#"{"repo":"requests","code_lang":"python"}"#, + ); + seed_committed_with_metadata( + &store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "standalone.py", r#""code""#, + r#"{"code_lang":"python"}"#, + ); + + let f = SearchFilters { + repo: vec!["httpx".to_string()], + ..Default::default() + }; + let out = store + .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) + .unwrap(); + assert_eq!(out, vec![cid(c1)], "only httpx chunk should survive repo filter"); + } + #[test] fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() { // Regression test for the non-UTC offset lex-compare bug.