From 7210386699c172912cfe65fc4a6d43f52084b367 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:26:40 +0900
Subject: [PATCH 01/11] =?UTF-8?q?spec(fb-36):=20search=20filter=20args=20?=
=?UTF-8?q?=E2=80=94=20design?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`kebab search` 에 7 flag 노출 (기존 4 + 신규 3):
- --tag (반복) / --lang / --path-glob / --trust-min (기존 SearchFilters)
- --media (csv) / --ingested-after (RFC3339) / --doc-id (신규)
filter layer = SQLite WHERE (lexical) + over-fetch+post-filter
(vector). AND 결합. wire schema 무변경 (input only).
`SearchFilters` 3 필드 additive (#[serde(default)] 로 backwards-
compat). MCP SearchInput 7 optional 필드 추가. invalid RFC3339 →
error.v1.code = config_invalid.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
...26-05-10-p9-fb-36-search-filters-design.md | 213 ++++++++++++++++++
1 file changed, 213 insertions(+)
create mode 100644 docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md
diff --git a/docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md b/docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md
new file mode 100644
index 0000000..cacf1d0
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md
@@ -0,0 +1,213 @@
+---
+title: "p9-fb-36 — Search filter args design"
+phase: P9
+component: kebab-core + kebab-search + kebab-cli + kebab-mcp
+task_id: p9-fb-36
+status: design
+target_version: 0.5.0
+contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
+contract_sections: [§4 search]
+date: 2026-05-10
+---
+
+# p9-fb-36 — Search filter args
+
+## Goal
+
+agent / 사용자가 검색 범위를 좁힐 수 있도록 CLI / MCP 에 filter flag 추가. 기존 `SearchFilters` 도메인 type 의 4 필드 (tags_any / lang / path_glob / trust_min) 를 CLI 표면에 노출하고, 신규 3 필드 (media / ingested_after / doc_id) 추가. wire schema 변경 없음 (input-only). filter 적용 layer = SQLite WHERE (lexical) + over-fetch + post-filter (vector). AND 조합 의미 고정.
+
+## Behavior contract
+
+### CLI flags on `kebab search`
+
+7 flags 추가, 모두 optional. 비어있으면 미적용 (기존 동작 보존):
+
+| flag | 의미 | repeat? |
+|------|------|---------|
+| `--tag <tag>` | doc 의 `metadata.tags` 안에 매칭 (OR-within) | yes (`--tag rust --tag async` = `tag IN (rust,async)`) |
+| `--lang <code>` | `documents.lang` 정확 매칭 | no |
+| `--path-glob <pattern>` | `documents.workspace_path` glob 매칭 | no |
+| `--trust-min <level>` | `documents.trust_level >= level` (enum 순서) | no |
+| `--media <kinds>` | `assets.media_type.kind` IN 리스트 (예: `--media md,pdf`; `md` 는 `markdown` 의 alias) | csv |
+| `--ingested-after <timestamp>` | `documents.updated_at >= timestamp` (RFC3339) | no |
+| `--doc-id <id>` | `documents.doc_id = id` | no |
+
+다중 flag 조합 = AND 결합. 각 flag 안 다중 값 (--tag, --media) = OR.
+
+### Filter validation
+
+- `--ingested-after` RFC3339 파싱 실패 → CLI 진입 시 `error.v1.code = config_invalid`, exit 2.
+- `--media` 의 unknown value (예: `--media foo`) → 매칭 0건 (filter unmatch). 명시적 거절 안 함 (lenient).
+- `--trust-min` clap value_enum 검증 (enum 외 거절).
+- `--doc-id` 형식 검증 안 함 (DocumentId 는 단순 string wrapper). 존재하지 않으면 매칭 0건.
+
+### Filter layer
+
+**Lexical (lexical.rs)**:
+- 기존 SQL builder 의 WHERE 절 확장. `media` / `ingested_after` / `doc_id` 모두 SQL 구문 가능.
+- `media`: `JOIN assets a ON a.asset_id = d.asset_id` + `json_extract(a.media_type, '$.kind') IN (?, ?)` (다중 값).
+- `ingested_after`: `d.updated_at >= ?` (RFC3339 lexicographic compare; 저장값은 UTC `Z` 가정, bind 값도 UTC `Z` 로 정규화한 뒤 비교).
+- `doc_id`: `d.doc_id = ?`.
+- path_glob 은 기존 post-filter 그대로.
+
+**Vector (vector.rs)**:
+- 기존 over-fetch (k * 2) + `filter_chunks` 헬퍼에서 SQLite chunks JOIN documents JOIN assets.
+- 같은 WHERE 조건 적용. k 부족 시 truncated.
+
+### Wire shape
+
+기존 wire schema 변경 없음.
+
+- `search_response.v1` (output) — 그대로.
+- `search_hit.v1` (개별 hit) — 그대로.
+- 입력 측 (CLI args / MCP `SearchInput`) 만 확장.
+
+MCP `SearchInput` schema 는 `schemars` derive 로 자동 갱신. 수동 schema 파일 X.
+
+### MCP `SearchInput` 확장
+
+```rust
+pub struct SearchInput {
+ pub query: String,
+ pub mode: Option,
+ pub k: Option,
+ pub max_tokens: Option, // fb-34
+ pub snippet_chars: Option, // fb-34
+ pub cursor: Option, // fb-34
+ // p9-fb-36 신규 (모두 optional)
+ pub tags: Option>,
+ pub lang: Option,
+ pub path_glob: Option,
+ pub trust_min: Option, // "low" | "medium" | "high"
+ pub media: Option>,
+ pub ingested_after: Option, // RFC3339
+ pub doc_id: Option,
+}
+```
+
+input → `SearchFilters` 변환 시 위와 동일 검증 (RFC3339 파싱, trust_level enum). 실패 시 `invalid_input` ErrorV1.
+
+## Allowed / forbidden dependencies
+
+- `kebab-core`: 신규 dep 없음. 기존 type 확장만.
+- `kebab-search`: 변경 없음 (SQL builder 안 WHERE 추가만).
+- `kebab-cli`: clap flag 추가, dispatch 변환.
+- `kebab-mcp`: SearchInput 확장.
+- `kebab-tui`: 변경 없음.
+
+`kebab-core` 의 다른 `kebab-*` crate 의존 금지 룰 그대로.
+
+## Public surface delta
+
+### kebab-core
+
+```rust
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct SearchFilters {
+ pub tags_any: Vec<String>,
+ pub lang: Option<Lang>,
+ pub path_glob: Option<String>,
+ pub trust_min: Option<TrustLevel>,
+ /// p9-fb-36: media_type filter — IN-list of `MediaType.kind` strings
+ /// (e.g. `["markdown", "pdf"]`). Empty Vec = no filter.
+ #[serde(default)]
+ pub media: Vec<String>,
+ /// p9-fb-36: hits whose source doc's `documents.updated_at` is at
+ /// or after this timestamp. None = no filter. RFC3339 / UTC.
+ #[serde(default, with = "time::serde::rfc3339::option")]
+ pub ingested_after: Option<OffsetDateTime>,
+ /// p9-fb-36: restrict hits to a single document. None = no filter.
+ #[serde(default)]
+ pub doc_id: Option<DocumentId>,
+}
+```
+
+`#[serde(default)]` on each new field = backwards-compat (older JSON without these keys deserializes as defaults).
+
+### kebab-search (lexical + vector)
+
+내부 SQL builder 확장만. public API 변경 없음.
+
+### kebab-cli (`Cmd::Search`)
+
+```rust
+Cmd::Search {
+ // 기존
+ query, k, mode, explain, no_cache,
+ max_tokens, snippet_chars, cursor, // fb-34
+ // p9-fb-36 신규
+ #[arg(long)] tag: Vec<String>,
+ #[arg(long)] lang: Option<String>,
+ #[arg(long)] path_glob: Option<String>,
+ #[arg(long, value_enum)] trust_min: Option<TrustLevelFlag>,
+ #[arg(long, value_delimiter = ',')] media: Vec<String>,
+ #[arg(long)] ingested_after: Option<String>,
+ #[arg(long)] doc_id: Option<String>,
+}
+```
+
+`TrustLevelFlag` 신규 clap value_enum (CLI-internal, kebab-core 의 `TrustLevel` 로 변환).
+
+### kebab-mcp::tools::search
+
+`SearchInput` 7 optional 필드 추가 (위 §MCP `SearchInput` 확장). dispatch 에서 `SearchFilters` 빌드 + 검증.
+
+## Test plan
+
+| kind | description |
+|------|-------------|
+| unit (kebab-core) | `SearchFilters::default()` — 7 필드 모두 비어있음 |
+| unit (kebab-search/lexical) | `media: ["pdf"]` — markdown doc 안 잡힘 |
+| unit (kebab-search/lexical) | `media: ["markdown", "pdf"]` — IN-list 동작 |
+| unit (kebab-search/lexical) | `ingested_after: <어제>` — 어제 이전 doc 안 잡힘 |
+| unit (kebab-search/lexical) | `doc_id: <id>` — 다른 doc 의 chunk 안 잡힘 |
+| unit (kebab-search/lexical) | 다중 filter AND — 모두 만족하는 hit 만 |
+| unit (kebab-search/lexical) | 빈 filter (default) — 기존 동작과 동일 |
+| unit (kebab-search/vector) | 동일 패턴 — `filter_chunks` post-filter |
+| unit (kebab-search) | 알 수 없는 media 값 (`["foo"]`) — empty result, no error |
+| 통합 (kebab-cli) | `kebab search Q --media md --json` wire shape (search_response.v1 그대로) |
+| 통합 (kebab-cli) | `kebab search Q --ingested-after 2020-01-01T00:00:00Z --json` 모든 hit 통과 |
+| 통합 (kebab-cli) | `kebab search Q --ingested-after garbage --json` → `error.v1.code = config_invalid` exit 2 |
+| 통합 (kebab-cli) | `kebab search Q --doc-id <id> --json` 단일 doc 만 |
+| 통합 (kebab-cli) | `kebab search Q --tag rust --tag async --json` IN-list 동작 |
+| 통합 (kebab-mcp) | `mcp__kebab__search` 7 optional 필드 모두 정상 응답 |
+| 통합 (kebab-mcp) | `mcp__kebab__search` invalid `ingested_after` → invalid_input |
+
+## Implementation steps (high-level)
+
+1. `kebab-core::SearchFilters` 3 필드 추가 + 단위 테스트.
+2. `kebab-search/lexical.rs` SQL builder 확장 + 단위 테스트.
+3. `kebab-search/vector.rs` `filter_chunks` 헬퍼 동일 확장 + 단위 테스트.
+4. `kebab-cli::Cmd::Search` 7 flag 추가 + dispatch + RFC3339 파싱.
+5. `kebab-cli` 통합 테스트 (lexical-only, no Ollama).
+6. `kebab-mcp::tools::search::SearchInput` 7 필드 + dispatch + invalid_input 검증.
+7. `kebab-mcp` 통합 테스트.
+8. README + SMOKE — filter 예시.
+9. tasks/INDEX.md / spec status flip.
+10. SKILL.md — `mcp__kebab__search` input shape 갱신.
+
+## Risks / notes
+
+- **`assets.media_type` JSON shape**: `MediaType` enum 의 serde 직렬화 형태가 `{"kind": "markdown"}` 인지, 다른 형태인지 SQLite 저장 형식 확인 필요. `Markdown` 같은 unit variant 는 `"markdown"` 문자열, `Image(...)` / `Audio(...)` 같은 tuple variant 는 `{"image": {...}}` 형태일 가능성. `json_extract` 경로를 그에 맞춰 조정 (e.g. `case when typeof(...) = 'text' then ... else json_extract($.kind) end`).
+- **RFC3339 lexicographic compare**: ingest 시 항상 UTC `Z` 로 저장 (fb-32 ingest path 확인됨). 외부 도구가 다른 offset 으로 강제 update 시 비교 부정확. spec 에 "UTC `Z` 가정" 명시.
+- **path_glob 과 다른 filter 의 ordering**: path_glob 은 post-filter (lexical), 신규 3 개는 SQL — fetch_limit 도달 후 path_glob 으로 추가 cut → final hit 수가 줄 수 있음. 기존 동작과 동일 (path_glob 패턴 유지).
+- **clap `Vec<T>` 의 default**: clap 4 (derive) 에서 미지정 = `Vec::new()`. 자동.
+- **trust_min enum 매핑**: clap value_enum 으로 안전. `TrustLevelFlag` → `TrustLevel` 변환 헬퍼.
+- **SearchFilters serde backwards-compat**: `#[serde(default)]` 로 옛 JSON 무영향. SQLite 안 SearchFilters 직렬 저장 안 함 (request-time only).
+
+## Out of scope
+
+- `--exclude-doc-id` / `--exclude-tag` (exclusion filter).
+- 다중 doc_id (`--doc-id a --doc-id b`) — 단일만.
+- TUI Search 패널 filter UI.
+- Lance metadata pre-filter.
+- tag 시스템 신규 도입 (이미 존재).
+- `--search.default-filter` config (default 값 지정) — agent 가 매번 명시.
+
+## Documentation updates (implementation PR 동시)
+
+- `README.md` — `kebab search` row 의 flag 표기에 7 flag 추가.
+- `docs/SMOKE.md` — filter walkthrough (`--media md --ingested-after 2026-04-01T00:00:00Z` 예시).
+- `tasks/p9/p9-fb-36-search-filters.md` — `status: open → completed`, design/plan 링크.
+- `tasks/INDEX.md` — fb-36 행 ✅.
+- `integrations/claude-code/kebab/SKILL.md` — `mcp__kebab__search` input shape 갱신 (7 필드 명시 + AND 의미 + lenient unknown media).
--
2.49.1
From 31c1e059519301674404afe8711bf88bccc94476 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:34:39 +0900
Subject: [PATCH 02/11] plan(fb-36): search filter args implementation plan
9 tasks: SearchFilters extension, lexical SQL WHERE, vector
filter_chunks mirror, CLI 7 flags, integration tests, MCP
SearchInput extension, workspace test/clippy, docs, smoke+PR.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../2026-05-10-p9-fb-36-search-filters.md | 1304 +++++++++++++++++
1 file changed, 1304 insertions(+)
create mode 100644 docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md
diff --git a/docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md b/docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md
new file mode 100644
index 0000000..23bc018
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md
@@ -0,0 +1,1304 @@
+# p9-fb-36 — Search Filter Args Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Expose 7 filter flags on `kebab search` (`--tag`, `--lang`, `--path-glob`, `--trust-min` for existing `SearchFilters` fields plus `--media`, `--ingested-after`, `--doc-id` as new fields). Filter layer = SQLite WHERE for lexical, over-fetch + post-filter for vector. AND combinator. Wire-shape input-only. MCP `kebab__search` SearchInput gains the 7 fields.
+
+**Architecture:** Domain `SearchFilters` gets 3 new optional fields. Lexical retriever's SQL builder extends WHERE clause; vector retriever's `filter_chunks` helper mirrors. CLI dispatch translates clap flags into `SearchFilters`, parsing `--ingested-after` as RFC3339 (config_invalid on failure). MCP `SearchInput` gains 7 optional fields with the same translation. `media_type` JSON column has two shapes (text for unit variants, object for tuple variants) — use `CASE WHEN json_type(media_type) = 'text' THEN json_extract(media_type, '$') ELSE (SELECT key FROM json_each(media_type) LIMIT 1) END` to extract a unified `kind` string.
+
+**Tech Stack:** Rust 2024, clap (value_enum, value_delimiter), serde, time crate (RFC3339), rusqlite (json_extract / json_each / json_type), no new deps.
+
+**Spec:** `docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md`
+
+---
+
+## File Structure
+
+| File | Responsibility | Action |
+|------|----------------|--------|
+| `crates/kebab-core/src/search.rs` | `SearchFilters` 3 new fields + `MEDIA_KINDS` const | modify |
+| `crates/kebab-search/src/lexical.rs` | SQL builder WHERE clause extension (media JOIN assets, ingested_after, doc_id) | modify |
+| `crates/kebab-search/src/vector.rs` | `filter_chunks` helper extension to match | modify |
+| `crates/kebab-cli/src/main.rs` | `Cmd::Search` 7 new flags + dispatch + RFC3339 parsing + `TrustLevelFlag` enum | modify |
+| `crates/kebab-mcp/src/tools/search.rs` | `SearchInput` 7 optional fields + dispatch + invalid_input on bad RFC3339 | modify |
+| `crates/kebab-search/tests/lexical.rs` | filter unit tests (media / ingested_after / doc_id / AND combo) | modify |
+| `crates/kebab-search/tests/hybrid.rs` | vector filter mirror tests | modify |
+| `crates/kebab-cli/tests/wire_search_filters.rs` | NEW — CLI integration tests for 7 flags | create |
+| `crates/kebab-mcp/tests/tools_call_search.rs` | extend with filter input cases | modify |
+| `README.md` | `kebab search` row update | modify |
+| `docs/SMOKE.md` | filter walkthrough | modify |
+| `tasks/p9/p9-fb-36-search-filters.md` | status flip + design/plan links | modify |
+| `tasks/INDEX.md` | fb-36 row → ✅ | modify |
+| `integrations/claude-code/kebab/SKILL.md` | `mcp__kebab__search` input shape doc + filter examples | modify |
+
+---
+
+## Pre-flight
+
+- [ ] **Step 0.1: Branch off main**
+
+```bash
+git checkout main
+git pull
+git checkout -b feat/fb-36-search-filters
+```
+
+- [ ] **Step 0.2: Confirm spec branch reachable**
+
+```bash
+git log --oneline spec/fb-36-search-filters -1
+```
+
+Expected: `7210386 spec(fb-36): search filter args — design`. If spec PR not yet merged, `git merge spec/fb-36-search-filters`.
+
+---
+
+## Task 1: Domain — `SearchFilters` 3 new fields
+
+**Files:**
+- Modify: `crates/kebab-core/src/search.rs`
+
+- [ ] **Step 1.1: Failing test**
+
+Append to `crates/kebab-core/src/search.rs` `#[cfg(test)] mod tests`:
+
+```rust
+#[test]
+fn search_filters_default_includes_new_fb36_fields() {
+ let f = SearchFilters::default();
+ assert!(f.media.is_empty(), "media default empty");
+ assert!(f.ingested_after.is_none(), "ingested_after default None");
+ assert!(f.doc_id.is_none(), "doc_id default None");
+ // existing fields still default
+ assert!(f.tags_any.is_empty());
+ assert!(f.lang.is_none());
+ assert!(f.path_glob.is_none());
+ assert!(f.trust_min.is_none());
+}
+
+#[test]
+fn search_filters_serialize_with_serde_default_compat() {
+ // Old JSON without the new fields must still deserialize.
+ let old: SearchFilters = serde_json::from_str(r#"{"tags_any":[],"lang":null,"path_glob":null,"trust_min":null}"#).unwrap();
+ assert!(old.media.is_empty());
+ assert!(old.ingested_after.is_none());
+ assert!(old.doc_id.is_none());
+}
+```
+
+- [ ] **Step 1.2: Run test (verify failure)**
+
+```bash
+cargo test -p kebab-core search_filters_default_includes_new_fb36_fields
+```
+
+Expected: FAIL — fields don't exist.
+
+- [ ] **Step 1.3: Add the fields**
+
+Edit `SearchFilters` struct in `crates/kebab-core/src/search.rs`:
+
+```rust
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct SearchFilters {
+ pub tags_any: Vec<String>,
+ pub lang: Option<Lang>,
+ pub path_glob: Option<String>,
+ pub trust_min: Option<TrustLevel>,
+ /// p9-fb-36: media_type filter — IN-list of `MediaType.kind`
+ /// strings (`"markdown"`, `"pdf"`, `"image"`, `"audio"`, `"other"`).
+ /// Empty Vec = no filter. Match is on the variant tag only;
+ /// e.g. `["image"]` matches `Image(Png)` and `Image(Jpeg)`.
+ #[serde(default)]
+ pub media: Vec<String>,
+ /// p9-fb-36: hits whose source doc's `documents.updated_at` is at
+ /// or after this timestamp. None = no filter. RFC3339 / UTC.
+ #[serde(default, with = "time::serde::rfc3339::option")]
+ pub ingested_after: Option<OffsetDateTime>,
+ /// p9-fb-36: restrict hits to a single document. None = no filter.
+ #[serde(default)]
+ pub doc_id: Option<DocumentId>,
+}
+```
+
+`OffsetDateTime` and `DocumentId` should already be in scope in `search.rs` — verify. If either is missing, add the corresponding import:
+
+```rust
+use time::OffsetDateTime;
+use crate::ids::DocumentId;
+```
+
+Also expose a `MEDIA_KINDS` const that downstream code can use for validation / aliases:
+
+```rust
+/// p9-fb-36: canonical kind labels for `SearchFilters.media`. Mirrors
+/// `MediaType` variant tags; CLI / MCP normalize aliases (`md` → `markdown`)
+/// before populating this Vec.
+pub const MEDIA_KINDS: &[&str] = &["markdown", "pdf", "image", "audio", "other"];
+```
+
+- [ ] **Step 1.4: Run tests (verify pass)**
+
+```bash
+cargo test -p kebab-core
+```
+
+Expected: 33+ tests pass (2 new + existing).
+
+Other crates may break (lexical / vector retrievers reference `SearchFilters`). That's expected — Tasks 2/3 fix.
+
+- [ ] **Step 1.5: Commit**
+
+```bash
+git add crates/kebab-core/src/search.rs
+git commit -m "$(cat <<'EOF'
+feat(core): SearchFilters gains media / ingested_after / doc_id (fb-36)
+
+3 additive optional fields. #[serde(default)] preserves
+backwards compat for older JSON without the new keys.
+MEDIA_KINDS const exposes canonical "markdown"/"pdf"/"image"/
+"audio"/"other" labels for downstream alias normalization.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 2: Lexical retriever — SQL WHERE extension
+
+**Files:**
+- Modify: `crates/kebab-search/src/lexical.rs`
+- Modify: `crates/kebab-search/tests/lexical.rs`
+
+- [ ] **Step 2.1: Failing tests**
+
+Append to `crates/kebab-search/tests/lexical.rs`:
+
+```rust
+#[test]
+fn lexical_filter_by_media() {
+ let env = TestEnv::new();
+ env.insert_doc_with_media("md1.md", "rust ownership", kebab_core::MediaType::Markdown);
+ env.insert_doc_with_media("doc.pdf", "rust pdf body", kebab_core::MediaType::Pdf);
+ let filters = kebab_core::SearchFilters {
+ media: vec!["pdf".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &filters);
+ assert_eq!(hits.len(), 1, "only pdf doc should match");
+ assert!(hits[0].doc_path.0.ends_with(".pdf"), "got: {}", hits[0].doc_path.0);
+}
+
+#[test]
+fn lexical_filter_by_ingested_after() {
+ let env = TestEnv::new();
+ let old_doc = env.insert_doc_with_updated_at(
+ "old.md",
+ "ingest test",
+ time::macros::datetime!(2020-01-01 00:00:00 UTC),
+ );
+ let new_doc = env.insert_doc_with_updated_at(
+ "new.md",
+ "ingest test",
+ time::macros::datetime!(2026-01-01 00:00:00 UTC),
+ );
+ let filters = kebab_core::SearchFilters {
+ ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
+ ..Default::default()
+ };
+ let hits = env.run_search("ingest", &filters);
+ let _ = (old_doc, new_doc);
+ assert_eq!(hits.len(), 1, "only post-2025 doc matches");
+}
+
+#[test]
+fn lexical_filter_by_doc_id() {
+ let env = TestEnv::new();
+ let target = env.insert_doc("a.md", "shared term");
+ env.insert_doc("b.md", "shared term");
+ let filters = kebab_core::SearchFilters {
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_search("shared", &filters);
+ for h in &hits {
+ assert_eq!(h.doc_id, target, "all hits must be from target doc");
+ }
+}
+
+#[test]
+fn lexical_filter_combinator_is_and() {
+ let env = TestEnv::new();
+ let target = env.insert_doc_with_media("a.md", "rust", kebab_core::MediaType::Markdown);
+ env.insert_doc_with_media("b.pdf", "rust", kebab_core::MediaType::Pdf);
+ let filters = kebab_core::SearchFilters {
+ media: vec!["markdown".to_string()],
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &filters);
+ assert!(hits.iter().all(|h| h.doc_id == target));
+}
+
+#[test]
+fn lexical_filter_unknown_media_returns_empty() {
+ let env = TestEnv::new();
+ env.insert_doc("a.md", "rust");
+ let filters = kebab_core::SearchFilters {
+ media: vec!["nonexistent_kind".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &filters);
+ assert!(hits.is_empty(), "unknown media → no hits, no error");
+}
+
+#[test]
+fn lexical_empty_filters_match_default_behavior() {
+ let env = TestEnv::new();
+ env.insert_doc("a.md", "rust");
+ let with_default = env.run_search("rust", &kebab_core::SearchFilters::default());
+ assert!(!with_default.is_empty());
+}
+```
+
+The `TestEnv` helper functions (`insert_doc`, `insert_doc_with_media`, `insert_doc_with_updated_at`, `run_search`) need to exist in the test scaffold. Check what's there:
+
+```bash
+grep -n "pub fn insert_doc\|pub fn run_search\|TestEnv" crates/kebab-search/tests/common/mod.rs 2>/dev/null
+ls crates/kebab-search/tests/
+```
+
+If missing, add minimal helpers to `crates/kebab-search/tests/common/mod.rs` (create the file if needed):
+
+```rust
+//! Lexical-test helpers shared across kebab-search integration tests.
+
+use std::sync::Arc;
+
+use kebab_core::{
+ DocumentId, MediaType, SearchFilters, SearchHit, SearchMode, SearchQuery,
+};
+use kebab_search::LexicalRetriever;
+use kebab_store_sqlite::SqliteStore;
+use time::OffsetDateTime;
+
+pub struct TestEnv {
+ pub store: Arc,
+ pub retriever: LexicalRetriever,
+ next: std::cell::Cell,
+}
+
+impl TestEnv {
+ pub fn new() -> Self {
+ // ... use whatever the existing tests do for store init.
+ // Mirror the pattern in crates/kebab-search/tests/lexical.rs that
+ // sets up an in-memory or tempdir SqliteStore + LexicalRetriever.
+ unimplemented!("copy the existing test scaffold's setup")
+ }
+
+ pub fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
+ self.insert_doc_with_media(path, body, MediaType::Markdown)
+ }
+
+ pub fn insert_doc_with_media(
+ &self,
+ path: &str,
+ body: &str,
+ media: MediaType,
+ ) -> DocumentId {
+ self.insert_doc_with_updated_at(path, body, OffsetDateTime::now_utc())
+ // (set the media via a separate write or threading through
+ // whatever fixture helper the existing tests use)
+ }
+
+ pub fn insert_doc_with_updated_at(
+ &self,
+ path: &str,
+ body: &str,
+ updated_at: OffsetDateTime,
+ ) -> DocumentId {
+ // Insert a synthetic document + asset row + chunks + FTS row.
+ // Match the pattern used in the existing lexical / hybrid tests
+ // (which already use TestEnv-like helpers — adapt their signatures).
+ unimplemented!("see existing test scaffold")
+ }
+
+ pub fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec {
+ let q = SearchQuery {
+ text: query.to_string(),
+ mode: SearchMode::Lexical,
+ k: 10,
+ filters: filters.clone(),
+ };
+ kebab_core::Retriever::search(&self.retriever, &q).expect("search")
+ }
+}
+```
+
+The "unimplemented" placeholders must be replaced with concrete code — see `crates/kebab-search/tests/lexical.rs`'s existing test setup for the right pattern (likely something like `init_store_with_doc_and_chunk(...)`). Take the time to study what's there and mirror it. The plan can't enumerate the full scaffold here because it depends on the codebase's existing fixtures.
+
+If the existing tests already have similar helpers under different names, REUSE them — don't add a new TestEnv. The new fixture-needing helpers (`insert_doc_with_media`, `insert_doc_with_updated_at`) are the only genuinely new pieces.
+
+- [ ] **Step 2.2: Run tests (verify failure)**
+
+```bash
+cargo test -p kebab-search --test lexical lexical_filter_by_media
+```
+
+Expected: FAIL — `lexical.rs` doesn't yet handle the `media` filter; the test fails either at compile time (helpers missing) or on the assertion.
+
+- [ ] **Step 2.3: Implement SQL WHERE extension**
+
+Edit `crates/kebab-search/src/lexical.rs::run_query`. Find the existing WHERE clause builder block (after `tags_any` / `lang` / `trust_min` arms — see line ~280-320). Add the 3 new arms BEFORE the `path_glob` post-filter (path_glob stays in Rust):
+
+```rust
+// p9-fb-36: media_type filter (IN-list).
+// `assets.media_type` JSON has two shapes:
+// - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
+// - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
+// e.g. `{"image": "png"}`
+// Extract a unified "kind" string for both shapes via:
+// CASE WHEN json_type = 'text' THEN json_extract($)
+// ELSE (first object key)
+// END IN (?, ...)
+if !filters.media.is_empty() {
+ let placeholders: Vec<&str> = std::iter::repeat_n("?", filters.media.len()).collect();
+ let placeholders = placeholders.join(",");
+ sql.push_str(&format!(
+ " AND f.doc_id IN (SELECT doc_id FROM documents d2 \
+ JOIN assets a ON a.asset_id = d2.asset_id \
+ WHERE CASE \
+ WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
+ ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
+ END IN ({placeholders}))"
+ ));
+ for kind in &filters.media {
+ params.push(Box::new(kind.clone()));
+ }
+}
+
+// p9-fb-36: ingested_after filter.
+// `documents.updated_at` is RFC3339 TEXT (always UTC `Z` per fb-32 ingest
+// path); normalize the bound to UTC so lexicographic >= stays correct.
+if let Some(after) = &filters.ingested_after {
+ let formatted = after.to_offset(time::UtcOffset::UTC)
+ .format(&time::format_description::well_known::Rfc3339)
+ .expect("OffsetDateTime formats to RFC3339");
+ sql.push_str(" AND d.updated_at >= ?");
+ params.push(Box::new(formatted));
+}
+
+// p9-fb-36: doc_id filter — single-doc scoping.
+if let Some(id) = &filters.doc_id {
+ sql.push_str(" AND d.doc_id = ?");
+ params.push(Box::new(id.0.clone()));
+}
+```
+
+The exact `params` API depends on the existing builder pattern in `lexical.rs`. The current code uses something like `let mut params: Vec<Box<dyn rusqlite::ToSql>> = vec![...];`. Match that exactly. Don't introduce a new pattern.
+
+If the existing SQL has joins on `documents d` already (via `chunks → documents`), the `media` subquery uses `documents d2` to avoid alias collision. Read the existing SQL string to verify.
+
+- [ ] **Step 2.4: Run tests (verify pass)**
+
+```bash
+cargo test -p kebab-search --test lexical
+```
+
+Expected: all PASS, including 6 new fb-36 tests.
+
+If the helpers in Step 2.1 weren't fleshed out, this is the moment to fill them in — they're the bridge between the test text above and the actual store setup. The store crate's `tests/contract_roundtrip.rs` is a good model for inserting an asset + document + chunks fixture.
+
+- [ ] **Step 2.5: Commit**
+
+```bash
+git add crates/kebab-search/src/lexical.rs crates/kebab-search/tests/
+git commit -m "$(cat <<'EOF'
+feat(search/lexical): media / ingested_after / doc_id filters (fb-36)
+
+SQL WHERE clause extension. media uses CASE WHEN json_type='text'
+to handle both unit (`"markdown"`) and tuple (`{"image":"png"}`)
+MediaType serde shapes. ingested_after relies on RFC3339 lexicographic
+ordering with UTC Z (per fb-32 ingest invariant). doc_id is a simple
+equality. AND combinator with existing tags / lang / trust filters.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 3: Vector retriever — `filter_chunks` mirror
+
+**Files:**
+- Modify: `crates/kebab-search/src/vector.rs`
+- Modify: `crates/kebab-search/tests/hybrid.rs`
+
+- [ ] **Step 3.1: Failing test**
+
+Append to `crates/kebab-search/tests/hybrid.rs`:
+
+```rust
+#[test]
+fn vector_filter_by_media() {
+ let env = HybridTestEnv::new();
+ env.insert_doc_with_media("md1.md", "rust ownership", kebab_core::MediaType::Markdown);
+ env.insert_doc_with_media("doc.pdf", "rust pdf body", kebab_core::MediaType::Pdf);
+
+ let filters = kebab_core::SearchFilters {
+ media: vec!["pdf".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_vector_search("rust", &filters);
+ assert_eq!(hits.len(), 1);
+ assert!(hits[0].doc_path.0.ends_with(".pdf"));
+}
+
+#[test]
+fn vector_filter_by_doc_id() {
+ let env = HybridTestEnv::new();
+ let target = env.insert_doc("a.md", "shared");
+ env.insert_doc("b.md", "shared");
+ let filters = kebab_core::SearchFilters {
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_vector_search("shared", &filters);
+ assert!(hits.iter().all(|h| h.doc_id == target));
+}
+```
+
+Mirror the helpers needed in `crates/kebab-search/tests/common/mod.rs` (add `HybridTestEnv` if it doesn't exist; copy the pattern from existing hybrid tests).
+
+- [ ] **Step 3.2: Run tests (verify failure)**
+
+```bash
+cargo test -p kebab-search --test hybrid vector_filter_by_media
+```
+
+Expected: FAIL.
+
+- [ ] **Step 3.3: Implement filter_chunks extension**
+
+Edit `crates/kebab-search/src/vector.rs::filter_chunks` (or whatever helper the vector retriever uses to post-filter SQLite-side after Lance returns chunks). Add the same 3 SQL fragments as Task 2.
+
+If `filter_chunks` builds its own SQL inline, match the lexical pattern verbatim. If it delegates to a shared SQL helper in `kebab-store-sqlite`, refactor: extract the "filter WHERE clause builder" into a small helper used by both. Inspect first:
+
+```bash
+grep -n "filter_chunks\|tags_any\|trust_min\|lang" crates/kebab-search/src/vector.rs | head -10
+```
+
+Decide: in-place duplication vs shared helper. Shared helper is cleaner if the SQL is identical. If the contexts differ (lexical SQL is a single statement, vector SQL is a follow-up `SELECT ... WHERE chunk_id IN (...) AND <filters>`), keep them separate but mirror the new filter pattern exactly.
+
+- [ ] **Step 3.4: Run tests (verify pass)**
+
+```bash
+cargo test -p kebab-search --test hybrid
+cargo test -p kebab-search
+```
+
+Expected: all PASS.
+
+- [ ] **Step 3.5: Commit**
+
+```bash
+git add crates/kebab-search/src/vector.rs crates/kebab-search/tests/
+git commit -m "$(cat <<'EOF'
+feat(search/vector): media / ingested_after / doc_id filters (fb-36)
+
+filter_chunks helper extended with the same 3 WHERE clauses as
+lexical. Vector still over-fetches k * 2 then post-filters; small
+k can return < k hits when filters drop a lot — agent is expected
+to widen k or paginate. AND combinator with existing filters.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 4: CLI flags + dispatch
+
+**Files:**
+- Modify: `crates/kebab-cli/src/main.rs`
+
+- [ ] **Step 4.1: Add `TrustLevelFlag` clap enum**
+
+Locate the existing `enum Cmd` and `enum ModeFlag` (or similar) declarations. Add near them:
+
+```rust
+#[derive(clap::ValueEnum, Clone, Debug)]
+enum TrustLevelFlag {
+ Trusted,
+ Reviewed,
+ Hearsay,
+ Untrusted,
+}
+
+impl From<TrustLevelFlag> for kebab_core::TrustLevel {
+ fn from(f: TrustLevelFlag) -> Self {
+ match f {
+ TrustLevelFlag::Trusted => kebab_core::TrustLevel::Trusted,
+ TrustLevelFlag::Reviewed => kebab_core::TrustLevel::Reviewed,
+ TrustLevelFlag::Hearsay => kebab_core::TrustLevel::Hearsay,
+ TrustLevelFlag::Untrusted => kebab_core::TrustLevel::Untrusted,
+ }
+ }
+}
+```
+
+If `TrustLevel` variants are different (verify):
+
+```bash
+grep -A 8 "^pub enum TrustLevel" crates/kebab-core/src/metadata.rs
+```
+
+Adapt names accordingly.
+
+- [ ] **Step 4.2: Add 7 flags to `Cmd::Search`**
+
+In the `enum Cmd { ... Search { ... } }` definition, add 7 fields:
+
+```rust
+/// p9-fb-36: filter by `metadata.tags`. Repeatable; OR-within (any tag).
+#[arg(long)]
+tag: Vec<String>,
+
+/// p9-fb-36: filter by `documents.lang` (ISO code).
+#[arg(long)]
+lang: Option<String>,
+
+/// p9-fb-36: filter by `documents.workspace_path` glob.
+#[arg(long)]
+path_glob: Option<String>,
+
+/// p9-fb-36: filter by minimum `documents.trust_level`.
+#[arg(long, value_enum)]
+trust_min: Option<TrustLevelFlag>,
+
+/// p9-fb-36: filter by `assets.media_type` kind. Comma-separated.
+/// Aliases: `md` → `markdown`. Other accepted: `markdown`, `pdf`,
+/// `image`, `audio`, `other`. Unknown values match nothing.
+#[arg(long, value_delimiter = ',')]
+media: Vec<String>,
+
+/// p9-fb-36: filter to docs whose `updated_at` is >= this RFC3339
+/// timestamp (UTC). Invalid format → exit 2 with error.v1
+/// code = config_invalid.
+#[arg(long)]
+ingested_after: Option<String>,
+
+/// p9-fb-36: filter to a single doc by id.
+#[arg(long)]
+doc_id: Option<String>,
+```
+
+- [ ] **Step 4.3: Build SearchFilters in dispatch arm**
+
+In the `Cmd::Search { ... } =>` match arm body, before the `let q = kebab_core::SearchQuery { ... }` line, replace the hardcoded `filters: kebab_core::SearchFilters::default()` with a constructed `SearchFilters`. Also normalize `--media` aliases:
+
+```rust
+fn normalize_media_alias(s: &str) -> String {
+ match s.to_ascii_lowercase().as_str() {
+ "md" => "markdown".to_string(),
+ other => other.to_string(),
+ }
+}
+
+let media_norm: Vec<String> = media.iter().map(|s| normalize_media_alias(s)).collect();
+
+let ingested_after_parsed: Option<time::OffsetDateTime> = match ingested_after.as_deref() {
+ Some(s) => {
+ let parsed = time::OffsetDateTime::parse(
+ s,
+ &time::format_description::well_known::Rfc3339,
+ );
+ match parsed {
+ Ok(ts) => Some(ts),
+ Err(e) => {
+ let err = anyhow::Error::new(kebab_app::StructuredError(kebab_app::ErrorV1 {
+ schema_version: "error.v1".to_string(),
+ code: "config_invalid".to_string(),
+ message: format!("--ingested-after: invalid RFC3339 timestamp '{s}': {e}"),
+ details: serde_json::Value::Null,
+ hint: Some("expected format like 2026-04-01T00:00:00Z".to_string()),
+ }));
+ return Err(err);
+ }
+ }
+ }
+ None => None,
+};
+
+let filters = kebab_core::SearchFilters {
+ tags_any: tag.clone(),
+ lang: lang.as_ref().map(|s| kebab_core::Lang(s.clone())),
+ path_glob: path_glob.clone(),
+ trust_min: trust_min.clone().map(Into::into),
+ media: media_norm,
+ ingested_after: ingested_after_parsed,
+ doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())),
+};
+
+let q = kebab_core::SearchQuery {
+ text: query.clone(),
+ mode: (*mode).into(),
+ k: *k,
+ filters,
+};
+```
+
+If `Lang` constructor differs (e.g. `Lang::new(...)` vs `Lang(s)`), check:
+
+```bash
+grep -A 3 "^pub struct Lang\b" crates/kebab-core/src/media.rs
+```
+
+If the existing `Cmd::Search` arm doesn't currently `return Err(...)` for failures, the dispatch's outer `Result<()>` should catch the anyhow propagation through `?`. Verify the existing pattern.
+
+- [ ] **Step 4.4: Build CLI**
+
+```bash
+cargo build -p kebab-cli
+```
+
+Expected: clean.
+
+- [ ] **Step 4.5: Verify --help**
+
+```bash
+cargo run -q -p kebab-cli -- search --help 2>&1 | grep -E "tag|lang|path-glob|trust-min|media|ingested-after|doc-id"
+```
+
+Expected: 7 new flags appear.
+
+- [ ] **Step 4.6: Run kebab-cli tests**
+
+```bash
+cargo test -p kebab-cli
+```
+
+Expected: all PASS, no regressions.
+
+- [ ] **Step 4.7: Commit**
+
+```bash
+git add crates/kebab-cli/src/main.rs
+git commit -m "$(cat <<'EOF'
+feat(cli): kebab search filter flags (fb-36)
+
+7 new flags: --tag (repeatable), --lang, --path-glob,
+--trust-min (value_enum), --media (csv with `md` alias),
+--ingested-after (RFC3339; config_invalid on parse fail),
+--doc-id. Dispatch translates clap values into SearchFilters
+and propagates structured errors through the existing
+StructuredError wrapper from fb-34.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 5: CLI integration tests
+
+**Files:**
+- Create: `crates/kebab-cli/tests/wire_search_filters.rs`
+- Modify: `crates/kebab-cli/tests/common/mod.rs` (if helper missing)
+
+- [ ] **Step 5.1: Write integration tests**
+
+Create `crates/kebab-cli/tests/wire_search_filters.rs`:
+
+```rust
+//! p9-fb-36: CLI search filter flags.
+
+mod common;
+
+use serde_json::Value;
+
+#[test]
+fn search_with_doc_id_filter_returns_only_target_doc() {
+ let (cfg, ws) = common::write_config();
+ common::ingest(&cfg, &ws, "a.md", "# A\n\nshared term apple\n");
+ common::ingest(&cfg, &ws, "b.md", "# B\n\nshared term banana\n");
+
+ // Find any doc_id via search.
+ let (probe_stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--json", "--k", "5", "shared"],
+ );
+ let probe: Value = serde_json::from_str(probe_stdout.trim()).expect("probe json");
+ let target_doc_id = probe["hits"][0]["doc_id"]
+ .as_str()
+ .expect("doc_id in first hit")
+ .to_string();
+
+ let (stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--json", "--doc-id", &target_doc_id, "shared"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("filtered json");
+ let hits = v["hits"].as_array().expect("hits array");
+ assert!(!hits.is_empty(), "filter should still match the target doc");
+ for h in hits {
+ assert_eq!(h["doc_id"], target_doc_id);
+ }
+}
+
+#[test]
+fn search_with_invalid_ingested_after_emits_config_invalid() {
+ let (cfg, _ws) = common::write_config();
+
+ let exe = env!("CARGO_BIN_EXE_kebab");
+ let cfg_str = cfg.to_str().expect("utf8");
+ let out = std::process::Command::new(exe)
+ .args([
+ "--config", cfg_str, "--json",
+ "search", "--mode", "lexical",
+ "--ingested-after", "not-a-timestamp",
+ "test",
+ ])
+ .output()
+ .expect("kebab search");
+ assert_ne!(out.status.code(), Some(0));
+ let stderr = String::from_utf8_lossy(&out.stderr);
+ let err_line = stderr
+ .lines()
+ .find(|l| {
+ serde_json::from_str::<Value>(l)
+ .ok()
+ .and_then(|v| v.get("schema_version").and_then(|s| s.as_str()).map(String::from))
+ .as_deref()
+ == Some("error.v1")
+ })
+ .unwrap_or_else(|| panic!("no error.v1 on stderr: {stderr}"));
+ let v: Value = serde_json::from_str(err_line).expect("error.v1 json");
+ assert_eq!(v["code"], "config_invalid");
+ assert!(
+ v["message"].as_str().unwrap_or("").contains("ingested-after"),
+ "message should mention the flag: {v:?}"
+ );
+}
+
+#[test]
+fn search_with_media_filter_md_alias_normalizes_to_markdown() {
+ let (cfg, ws) = common::write_config();
+ common::ingest(&cfg, &ws, "a.md", "# A\n\nrust ownership body\n");
+
+ let (stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--json", "--media", "md", "rust"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("json");
+ let hits = v["hits"].as_array().expect("hits");
+ assert!(!hits.is_empty(), "md alias should match markdown doc");
+}
+
+#[test]
+fn search_with_tag_filter_repeats_or_within() {
+ let (cfg, ws) = common::write_config();
+ // Tag-aware ingest: write a doc with frontmatter tags. The
+ // markdown parser captures them into `metadata.tags`.
+ common::ingest(
+ &cfg,
+ &ws,
+ "tagged.md",
+ "---\ntags: [rust, async]\n---\n\n# Tagged\n\nbody about rust\n",
+ );
+ common::ingest(&cfg, &ws, "untagged.md", "# Plain\n\nbody about rust\n");
+
+ // --tag rust → tagged doc only.
+ let (stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--mode", "lexical", "--json", "--tag", "rust", "--k", "10", "rust"],
+ );
+ let v: Value = serde_json::from_str(stdout.trim()).expect("json");
+ let hits = v["hits"].as_array().expect("hits");
+ assert!(!hits.is_empty(), "tagged doc should match");
+ for h in hits {
+ let path = h["doc_path"].as_str().unwrap_or("");
+ assert_eq!(path, "tagged.md", "untagged doc should be filtered out");
+ }
+}
+```
+
+If `common::write_config` / `common::ingest` / `common::run_search_with_args` already exist (they do from fb-32 / fb-34), reuse. The test file imports them via `mod common;`.
+
+- [ ] **Step 5.2: Run tests**
+
+```bash
+cargo test -p kebab-cli --test wire_search_filters 2>&1 | tail -10
+```
+
+Expected: 4 PASS.
+
+If the tag-frontmatter test fails because parser doesn't capture tags from this exact format, simplify the test or check what frontmatter shape the codebase expects:
+
+```bash
+grep -rn "metadata.tags\|frontmatter.*tags" crates/kebab-parse-md/src/ 2>/dev/null | head -5
+```
+
+Adapt the fixture frontmatter to the parser's expected shape.
+
+- [ ] **Step 5.3: Run full kebab-cli suite**
+
+```bash
+cargo test -p kebab-cli
+```
+
+Expected: all PASS.
+
+- [ ] **Step 5.4: Commit**
+
+```bash
+git add crates/kebab-cli/tests/
+git commit -m "$(cat <<'EOF'
+test(cli): wire_search_filters — 4 lexical-only integration tests (fb-36)
+
+Cover: --doc-id scoping, --ingested-after validation error,
+--media md alias, --tag repeatable + frontmatter parsing.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 6: MCP `SearchInput` extension
+
+**Files:**
+- Modify: `crates/kebab-mcp/src/tools/search.rs`
+- Modify: `crates/kebab-mcp/tests/tools_call_search.rs`
+
+- [ ] **Step 6.1: Inspect current `SearchInput`**
+
+```bash
+sed -n '1,80p' crates/kebab-mcp/src/tools/search.rs
+```
+
+Note where `mode` / `k` / `max_tokens` / `cursor` are wired.
+
+- [ ] **Step 6.2: Add 7 fields to `SearchInput`**
+
+Edit the struct:
+
+```rust
+#[derive(Debug, Deserialize, Serialize, JsonSchema)]
+pub struct SearchInput {
+ pub query: String,
+ pub mode: Option,
+ pub k: Option,
+ pub max_tokens: Option,
+ pub snippet_chars: Option,
+ pub cursor: Option,
+ /// p9-fb-36: filter by `metadata.tags` (OR-within).
+ pub tags: Option<Vec<String>>,
+ /// p9-fb-36: filter by `documents.lang` (ISO code).
+ pub lang: Option<String>,
+ /// p9-fb-36: filter by `documents.workspace_path` glob.
+ pub path_glob: Option<String>,
+ /// p9-fb-36: filter by minimum `documents.trust_level`.
+ /// Accepts: `"trusted"`, `"reviewed"`, `"hearsay"`, `"untrusted"`.
+ pub trust_min: Option<String>,
+ /// p9-fb-36: filter by `assets.media_type` kind. IN-list. Accepts:
+ /// `"markdown"`, `"pdf"`, `"image"`, `"audio"`, `"other"`.
+ pub media: Option<Vec<String>>,
+ /// p9-fb-36: RFC3339 UTC timestamp. Invalid format → invalid_input.
+ pub ingested_after: Option<String>,
+ /// p9-fb-36: filter to a single doc.
+ pub doc_id: Option<String>,
+}
+```
+
+- [ ] **Step 6.3: Update dispatch**
+
+In `handle(state, input)`, before constructing `SearchOpts`, build `SearchFilters` from the new inputs:
+
+```rust
+let trust_min = match input.trust_min.as_deref() {
+ Some("trusted") => Some(kebab_core::TrustLevel::Trusted),
+ Some("reviewed") => Some(kebab_core::TrustLevel::Reviewed),
+ Some("hearsay") => Some(kebab_core::TrustLevel::Hearsay),
+ Some("untrusted") => Some(kebab_core::TrustLevel::Untrusted),
+ Some(other) => {
+ return invalid_input(&format!(
+ "trust_min: unknown level '{other}'; expected trusted|reviewed|hearsay|untrusted"
+ ));
+ }
+ None => None,
+};
+
+let ingested_after = match input.ingested_after.as_deref() {
+ Some(s) => {
+ match time::OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339) {
+ Ok(ts) => Some(ts),
+ Err(e) => return invalid_input(&format!("ingested_after: invalid RFC3339 '{s}': {e}")),
+ }
+ }
+ None => None,
+};
+
+let filters = kebab_core::SearchFilters {
+ tags_any: input.tags.unwrap_or_default(),
+ lang: input.lang.map(kebab_core::Lang),
+ path_glob: input.path_glob,
+ trust_min,
+ media: input.media.unwrap_or_default(),
+ ingested_after,
+ doc_id: input.doc_id.map(kebab_core::DocumentId),
+};
+
+let query = kebab_core::SearchQuery {
+ text: input.query,
+ mode,
+ k: input.k.unwrap_or(10).clamp(1, 100),
+ filters,
+};
+```
+
+If `invalid_input` helper doesn't exist in this file (per fb-35 `tools/fetch.rs` pattern), add one:
+
+```rust
+fn invalid_input(msg: &str) -> CallToolResult {
+ use kebab_app::{ErrorV1, StructuredError};
+ let err = anyhow::Error::new(StructuredError(ErrorV1 {
+ schema_version: "error.v1".to_string(),
+ code: "invalid_input".to_string(),
+ message: msg.to_string(),
+ details: serde_json::Value::Null,
+ hint: None,
+ }));
+ to_tool_error(&err)
+}
+```
+
+If the existing dispatch hardcodes `SearchFilters::default()`, replace with the new `filters` value above.
+
+- [ ] **Step 6.4: Add MCP test cases**
+
+Edit `crates/kebab-mcp/tests/tools_call_search.rs`. Add tests:
+
+```rust
+#[test]
+fn search_with_doc_id_filter_returns_only_target() {
+ // Mirror the existing tools_call_search.rs setup pattern.
+ // After ingesting 2 docs and discovering target doc_id from a
+ // baseline search, call mcp__kebab__search with doc_id set and
+ // assert v["hits"] all have doc_id == target.
+ // (Concrete test code mirrors what fb-34 / fb-35 added; see them
+ // for the helper pattern this crate uses.)
+}
+
+#[test]
+fn search_with_invalid_ingested_after_returns_invalid_input() {
+ // Same MCP scaffold. Call with ingested_after = "garbage", assert
+ // the response carries error.v1 with code = "invalid_input" and
+ // message containing "ingested_after".
+}
+```
+
+Implement against whatever the existing tools_call_search.rs scaffold uses. The fb-34/35 tests are good templates.
+
+- [ ] **Step 6.5: Run MCP tests**
+
+```bash
+cargo test -p kebab-mcp
+```
+
+Expected: all PASS.
+
+- [ ] **Step 6.6: Commit**
+
+```bash
+git add crates/kebab-mcp/
+git commit -m "$(cat <<'EOF'
+feat(mcp): kebab__search filter inputs (fb-36)
+
+7 new optional inputs on SearchInput: tags, lang, path_glob,
+trust_min, media, ingested_after, doc_id. Validation surfaces as
+error.v1 code = invalid_input via StructuredError. Dispatch builds
+SearchFilters from the inputs and forwards through the existing
+search_with_opts_with_config facade.
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 7: Workspace test + clippy
+
+- [ ] **Step 7.1: Workspace test**
+
+```bash
+cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -15
+```
+
+Expected: all PASS.
+
+- [ ] **Step 7.2: Clippy**
+
+```bash
+cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -10
+```
+
+Expected: clean.
+
+- [ ] **Step 7.3: Commit any clippy fixes**
+
+```bash
+git add -A
+git commit -m "chore: clippy fixes for fb-36"
+```
+
+(Skip if no fixes needed.)
+
+---
+
+## Task 8: Documentation updates
+
+**Files:**
+- Modify: `README.md`
+- Modify: `docs/SMOKE.md`
+- Modify: `tasks/p9/p9-fb-36-search-filters.md`
+- Modify: `tasks/INDEX.md`
+- Modify: `integrations/claude-code/kebab/SKILL.md`
+
+- [ ] **Step 8.1: README — search row update**
+
+Find the `kebab search` row in 명령 table:
+
+```bash
+grep -n "kebab search" README.md | head -5
+```
+
+Append filter flags. The row gets long — keep concise:
+
+> `... [--tag <tag>] [--lang <code>] [--path-glob <glob>] [--trust-min <level>] [--media md,pdf,...] [--ingested-after <rfc3339>] [--doc-id <id>]` (p9-fb-36 — filter args. AND combinator across flags; OR within --tag/--media. Invalid `--ingested-after` RFC3339 → `error.v1.code = config_invalid`.)
+
+- [ ] **Step 8.2: SMOKE.md — filter walkthrough**
+
+After the existing fb-35 verbatim fetch section, append:
+
+````markdown
+### Filter args (fb-36)
+
+```bash
+# Filter by media kind (md alias normalizes to markdown).
+kebab search "rust" --media md --json | jq '.hits | length'
+
+# Filter by ingest timestamp (RFC3339).
+kebab search "rust" --ingested-after 2026-04-01T00:00:00Z --json
+
+# Combine: doc-id scope + tag (AND across flags).
+kebab search "rust" --doc-id "<doc_id>" --tag rust --json
+```
+
+Bad `--ingested-after` → `error.v1.code = config_invalid`, exit 2.
+Unknown `--media` value → silently empty (no error).
+````
+
+- [ ] **Step 8.3: Spec status flip**
+
+Edit `tasks/p9/p9-fb-36-search-filters.md`:
+
+```diff
+-status: open
++status: completed
+```
+
+Replace the `> ⏳ **백로그 only — 미구현.**` block with:
+
+```markdown
+> ✅ **구현 완료.** 본 spec 은 구현 시점의 frozen 상태. post-merge deviation 은 [HOTFIXES.md](../HOTFIXES.md) 참조.
+
+상세 설계: `docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md`.
+구현 계획: `docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md`.
+```
+
+- [ ] **Step 8.4: tasks/INDEX.md**
+
+```diff
+- - [p9-fb-36 search filter args](p9/p9-fb-36-search-filters.md) — ⏳ 미구현, brainstorm 필요 (depends_on 27)
++ - [p9-fb-36 search filter args](p9/p9-fb-36-search-filters.md) — ✅ 머지 + v0.5.0 cut 후보 (2026-05-10)
+```
+
+(The `depends_on 27` annotation in the original was carried over from the spec stub; drop it.)
+
+- [ ] **Step 8.5: SKILL.md — search input shape**
+
+Find the existing `mcp__kebab__search` Input section:
+
+```bash
+grep -n "mcp__kebab__search\|max_tokens.*null" integrations/claude-code/kebab/SKILL.md | head -5
+```
+
+Update the example input + bullets to mention the 7 new fields:
+
+````markdown
+Input:
+```json
+{
+ "query": "<query text>",
+ "mode": "hybrid",
+ "k": 10,
+ "max_tokens": null,
+ "snippet_chars": null,
+ "cursor": null,
+ "tags": null,
+ "lang": null,
+ "path_glob": null,
+ "trust_min": null,
+ "media": null,
+ "ingested_after": null,
+ "doc_id": null
+}
+```
+
+- p9-fb-36 filter inputs: `tags` (OR-within), `lang`, `path_glob`, `trust_min`, `media` (IN-list of `markdown|pdf|image|audio|other`), `ingested_after` (RFC3339 UTC), `doc_id`. AND combinator across keys. Invalid `ingested_after` / unknown `trust_min` → `error.v1.code = invalid_input`. Unknown `media` value → empty hits, no error.
+````
+
+- [ ] **Step 8.6: Commit docs**
+
+```bash
+git add README.md docs/SMOKE.md tasks/p9/p9-fb-36-search-filters.md tasks/INDEX.md integrations/claude-code/kebab/SKILL.md
+git commit -m "$(cat <<'EOF'
+docs(fb-36): README + SMOKE + INDEX + skill notes
+
+Co-Authored-By: Claude Opus 4.7 (1M context)
+EOF
+)"
+```
+
+---
+
+## Task 9: Smoke + push + PR
+
+- [ ] **Step 9.1: Manual smoke**
+
+```bash
+cd /tmp/kebab-smoke
+~/Workspace/projects/kebab/target/release/kebab --config /tmp/kebab-smoke/config.toml ingest
+~/Workspace/projects/kebab/target/release/kebab --config /tmp/kebab-smoke/config.toml search "test" --json --media md | jq '{hits: (.hits | length)}'
+~/Workspace/projects/kebab/target/release/kebab --config /tmp/kebab-smoke/config.toml search "test" --json --ingested-after garbage 2>&1 | tail -5
+```
+
+Expected:
+- `--media md` returns sane hit count.
+- garbage `--ingested-after` exits non-zero with `error.v1.code = config_invalid` on stderr.
+
+- [ ] **Step 9.2: Final workspace test**
+
+```bash
+cd ~/Workspace/projects/kebab
+cargo test --workspace --no-fail-fast -j 1
+```
+
+Expected: all green.
+
+- [ ] **Step 9.3: Push branch**
+
+```bash
+git push -u origin feat/fb-36-search-filters
+```
+
+- [ ] **Step 9.4: Open PR**
+
+Build PR body at `/tmp/fb36-pr-body.md`:
+
+```markdown
+## Summary
+
+- adds 7 filter flags on `kebab search` and the equivalent inputs on `mcp__kebab__search`:
+ - existing `SearchFilters` fields exposed: `--tag` (repeatable, OR-within), `--lang`, `--path-glob`, `--trust-min`
+ - new fields: `--media` (csv, `md` alias), `--ingested-after` (RFC3339 UTC), `--doc-id`
+- AND combinator across flags; OR within `--tag` and `--media`
+- filter layer: SQLite WHERE for lexical (incl. media via `CASE WHEN json_type='text'` to handle both unit and tuple `MediaType` serde shapes), over-fetch + `filter_chunks` post-filter for vector
+- wire shape unchanged — input-only feature; `search_response.v1` and `search_hit.v1` untouched
+- invalid `--ingested-after` → `error.v1.code = config_invalid` (CLI) / `invalid_input` (MCP); unknown `trust_min` → rejected by clap `value_enum` parsing (CLI) / `invalid_input` (MCP); unknown `--media` value → empty hits, no error
+
+## Test plan
+
+- [x] `cargo test --workspace --no-fail-fast -j 1` — green
+- [x] `cargo clippy --workspace --all-targets -- -D warnings` — clean
+- [x] new tests: 6 lexical (media / ingested_after / doc_id / AND / unknown / default), 2 vector mirror, 4 CLI integration, 2 MCP
+- [x] manual smoke per `docs/SMOKE.md` "Filter args" walkthrough
+
+## Architectural notes
+
+- `SearchFilters` 3 fields are additive with `#[serde(default)]` — old JSON without the new keys deserializes cleanly.
+- `MediaType` JSON has two shapes (`"markdown"` for unit variants, `{"image":"png"}` for tuple variants); the SQL `CASE WHEN json_type='text' THEN json_extract($) ELSE (first object key) END` extracts a unified kind string.
+- Vector retriever mirrors the lexical SQL exactly (same WHERE clauses, same params binding pattern). path_glob remains a Rust post-filter — unchanged from before fb-36.
+- No new HOTFIXES entry — additive minor, no contract drift.
+
+## Files of interest
+
+- spec: `docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md`
+- plan: `docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md`
+- core: `crates/kebab-core/src/search.rs` (SearchFilters)
+- search: `crates/kebab-search/src/lexical.rs` + `vector.rs`
+- CLI: `crates/kebab-cli/src/main.rs` (Cmd::Search)
+- MCP: `crates/kebab-mcp/src/tools/search.rs` (SearchInput)
+```
+
+Open PR:
+
+```bash
+/Users/user/.claude/skills/gitea-ops/bin/gitea-pr \
+ --title "feat(fb-36): search filter args (--media / --ingested-after / --doc-id + 4 existing)" \
+ --body "$(cat /tmp/fb36-pr-body.md)" \
+ --head feat/fb-36-search-filters \
+ --base main
+```
+
+- [ ] **Step 9.5: Cleanup**
+
+```bash
+rm /tmp/fb36-pr-body.md
+```
+
+---
+
+## Self-review
+
+- **Spec coverage:**
+ - §Behavior contract / 7 flags → Tasks 1, 4 (CLI), 6 (MCP)
+ - §Filter validation (RFC3339, trust_min) → Task 4 (CLI dispatch), Task 6 (MCP dispatch)
+ - §Filter layer (SQLite WHERE for lexical, over-fetch + post-filter for vector) → Tasks 2, 3
+ - §Wire shape (input-only, no schema change) → no task needed; covered by absence of changes
+ - §MCP `SearchInput` extension → Task 6
+ - §Public surface delta (SearchFilters / TrustLevelFlag / SearchInput) → Tasks 1, 4, 6
+ - §Test plan → Tasks 2 (6 lexical), 3 (2 vector), 5 (4 CLI), 6 (2 MCP)
+ - §Documentation → Task 8
+ - §Risks (MediaType JSON shape, RFC3339 UTC, path_glob ordering) → Task 2 explicitly handles the shape; Task 4 / 6 mention UTC; path_glob position unchanged
+
+- **Placeholder scan:**
+ - Task 2 / 3 / 6 contain "mirror the existing scaffold" instructions — concrete fallback paths spelled out (look at file X, copy pattern Y).
+ - No "TODO" / "fill in" / "later" remaining.
+
+- **Type consistency:**
+ - `SearchFilters { tags_any, lang, path_glob, trust_min, media, ingested_after, doc_id }` consistent across Tasks 1, 2, 3, 4, 6.
+ - `media: Vec<String>`, `ingested_after: Option<OffsetDateTime>`, `doc_id: Option<DocumentId>` consistent.
+ - `MEDIA_KINDS` const used as documentation reference, not at runtime.
+ - `TrustLevelFlag` clap enum → `kebab_core::TrustLevel` mapping defined in Task 4 step 4.1, used in Task 4 step 4.3.
+ - Error codes consistent: `config_invalid` (CLI), `invalid_input` (MCP) — both via StructuredError.
+
+---
+
+## Execution Handoff
+
+Plan complete and saved to `docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md`. Two execution options:
+
+**1. Subagent-Driven (recommended)** — fresh subagent per task, review between tasks.
+
+**2. Inline Execution** — execute tasks in this session.
+
+Which approach?
--
2.49.1
From d3f38c76e928a26f32ebbbc2e22a0e330c6fe1e5 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:36:45 +0900
Subject: [PATCH 03/11] feat(core): SearchFilters gains media / ingested_after
/ doc_id (fb-36)
3 additive optional fields. #[serde(default)] preserves
backwards compat for older JSON without the new keys.
MEDIA_KINDS const exposes canonical "markdown"/"pdf"/"image"/
"audio"/"other" labels for downstream alias normalization.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-core/src/search.rs | 38 +++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
diff --git a/crates/kebab-core/src/search.rs b/crates/kebab-core/src/search.rs
index 9d6527b..5e5cd31 100644
--- a/crates/kebab-core/src/search.rs
+++ b/crates/kebab-core/src/search.rs
@@ -26,12 +26,30 @@ pub struct SearchQuery {
pub filters: SearchFilters,
}
+/// p9-fb-36: canonical kind labels for `SearchFilters.media`. Mirrors
+/// `MediaType` variant tags; CLI / MCP normalize aliases (`md` → `markdown`)
+/// before populating this Vec.
+pub const MEDIA_KINDS: &[&str] = &["markdown", "pdf", "image", "audio", "other"];
+
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchFilters {
pub tags_any: Vec<String>,
pub lang: Option<Lang>,
pub path_glob: Option<String>,
pub trust_min: Option<TrustLevel>,
+ /// p9-fb-36: media_type filter — IN-list of `MediaType.kind`
+ /// strings (`"markdown"`, `"pdf"`, `"image"`, `"audio"`, `"other"`).
+ /// Empty Vec = no filter. Match is on the variant tag only;
+ /// e.g. `["image"]` matches `Image(Png)` and `Image(Jpeg)`.
+ #[serde(default)]
+ pub media: Vec<String>,
+ /// p9-fb-36: hits whose source doc's `documents.updated_at` is at
+ /// or after this timestamp. None = no filter. RFC3339 / UTC.
+ #[serde(default, with = "time::serde::rfc3339::option")]
+ pub ingested_after: Option<time::OffsetDateTime>,
+ /// p9-fb-36: restrict hits to a single document. None = no filter.
+ #[serde(default)]
+ pub doc_id: Option<DocumentId>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -155,4 +173,24 @@ mod tests {
assert!(opts.snippet_chars.is_none());
assert!(opts.cursor.is_none());
}
+
+ #[test]
+ fn search_filters_default_includes_new_fb36_fields() {
+ let f = SearchFilters::default();
+ assert!(f.media.is_empty(), "media default empty");
+ assert!(f.ingested_after.is_none(), "ingested_after default None");
+ assert!(f.doc_id.is_none(), "doc_id default None");
+ assert!(f.tags_any.is_empty());
+ assert!(f.lang.is_none());
+ assert!(f.path_glob.is_none());
+ assert!(f.trust_min.is_none());
+ }
+
+ #[test]
+ fn search_filters_serialize_with_serde_default_compat() {
+ let old: SearchFilters = serde_json::from_str(r#"{"tags_any":[],"lang":null,"path_glob":null,"trust_min":null}"#).unwrap();
+ assert!(old.media.is_empty());
+ assert!(old.ingested_after.is_none());
+ assert!(old.doc_id.is_none());
+ }
}
--
2.49.1
From 2c80e2ad915e021dd73f4571ffa61aeb291341a7 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:41:02 +0900
Subject: [PATCH 04/11] feat(search/lexical): media / ingested_after / doc_id
filters (fb-36)
SQL WHERE clause extension. media uses CASE WHEN json_type='text'
to handle both unit (\`"markdown"\`) and tuple (\`{"image":"png"}\`)
MediaType serde shapes. ingested_after relies on RFC3339 lexicographic
ordering with UTC Z (per fb-32 ingest invariant). doc_id is a simple
equality. AND combinator with existing tags / lang / trust filters.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-search/src/lexical.rs | 44 ++++++
crates/kebab-search/tests/lexical.rs | 210 ++++++++++++++++++++++++++-
2 files changed, 253 insertions(+), 1 deletion(-)
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 513fb9e..954148c 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -319,6 +319,50 @@ fn run_query(
};
params.push(Box::new(rank));
}
+ // p9-fb-36: media_type filter (IN-list).
+ // `assets.media_type` JSON has two shapes:
+ // - unit variant (Markdown / Pdf): JSON text, e.g. `"markdown"`
+ // - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
+ // e.g. `{"image": "png"}`
+ // Extract a unified "kind" string for both shapes via:
+ // CASE WHEN json_type = 'text' THEN json_extract($)
+ // ELSE (first object key)
+ // END IN (?, ...)
+ if !filters.media.is_empty() {
+ let placeholders: Vec<&str> =
+ std::iter::repeat("?").take(filters.media.len()).collect();
+ let placeholders = placeholders.join(",");
+ sql.push_str(&format!(
+ " AND f.doc_id IN (\
+ SELECT d2.doc_id FROM documents d2 \
+ JOIN assets a ON a.asset_id = d2.asset_id \
+ WHERE CASE \
+ WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
+ ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
+ END IN ({placeholders}))"
+ ));
+ for kind in &filters.media {
+ params.push(Box::new(kind.clone()));
+ }
+ }
+
+ // p9-fb-36: ingested_after filter.
+ // `documents.updated_at` is RFC3339 TEXT, always UTC `Z` (fb-32 ingest path).
+ // Normalize the bound to UTC too, or the lexicographic >= compare is wrong.
+ if let Some(after) = &filters.ingested_after {
+ let formatted = after.to_offset(time::UtcOffset::UTC)
+ .format(&time::format_description::well_known::Rfc3339)
+ .expect("OffsetDateTime formats to RFC3339");
+ sql.push_str(" AND d.updated_at >= ?");
+ params.push(Box::new(formatted));
+ }
+
+ // p9-fb-36: doc_id filter — single-doc scoping.
+ if let Some(id) = &filters.doc_id {
+ sql.push_str(" AND d.doc_id = ?");
+ params.push(Box::new(id.0.clone()));
+ }
+
// path_glob is intentionally NOT applied here — see module comment
// on PATH_GLOB_OVERFETCH and the post-filter in `LexicalRetriever::search`.
diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs
index ae01460..4265160 100644
--- a/crates/kebab-search/tests/lexical.rs
+++ b/crates/kebab-search/tests/lexical.rs
@@ -8,11 +8,15 @@
use std::sync::Arc;
use kebab_config::Config;
-use kebab_core::{IndexVersion, Lang, Retriever, SearchFilters, SearchMode, SearchQuery, TrustLevel};
+use kebab_core::{
+ DocumentId, IndexVersion, Lang, MediaType, Retriever, SearchFilters, SearchHit, SearchMode,
+ SearchQuery, TrustLevel,
+};
use kebab_search::LexicalRetriever;
use kebab_store_sqlite::SqliteStore;
use rusqlite::Connection;
use tempfile::TempDir;
+use time::OffsetDateTime;
// ── Test scaffolding ─────────────────────────────────────────────────────
@@ -679,6 +683,210 @@ fn search_hit_carries_indexed_at_from_documents_updated_at() {
assert!(!hit.stale, "lexical retriever must default stale=false");
}
+// ── TestEnv helper for fb-36 filter tests ───────────────────────────────
+
+/// Convenience wrapper over `Env` that exposes higher-level fixture helpers
+/// for the fb-36 filter tests. Intentionally kept separate from `Env` so
+/// the original tests are untouched.
+struct TestEnv {
+ inner: Env,
+ counter: std::cell::Cell<u32>,
+}
+
+impl TestEnv {
+ fn new() -> Self {
+ Self {
+ inner: Env::new(),
+ counter: std::cell::Cell::new(0),
+ }
+ }
+
+ /// Allocate a fresh monotone counter suffix so every inserted doc / chunk
+ /// gets a unique 32-hex ID without the caller worrying about collisions.
+ fn next_id(&self, prefix: &str) -> String {
+ let n = self.counter.get();
+ self.counter.set(n + 1);
+ let suffix = format!("{prefix}{n:04}");
+ id32(&suffix)
+ }
+
+ /// Insert a markdown doc with the given `body` and return its `DocumentId`.
+ fn insert_doc(&self, path: &str, body: &str) -> DocumentId {
+ self.insert_doc_with_media(path, body, MediaType::Markdown)
+ }
+
+ /// Insert a doc whose `assets.media_type` JSON is set to the serialized
+ /// form of `media`. The `documents.updated_at` defaults to now.
+ fn insert_doc_with_media(&self, path: &str, body: &str, media: MediaType) -> DocumentId {
+ self.insert_doc_full(path, body, media, OffsetDateTime::now_utc())
+ }
+
+ /// Insert a doc with an explicit `updated_at` timestamp (for
+ /// `ingested_after` filter tests).
+ fn insert_doc_with_updated_at(
+ &self,
+ path: &str,
+ body: &str,
+ updated_at: OffsetDateTime,
+ ) -> DocumentId {
+ self.insert_doc_full(path, body, MediaType::Markdown, updated_at)
+ }
+
+ fn insert_doc_full(
+ &self,
+ path: &str,
+ body: &str,
+ media: MediaType,
+ updated_at: OffsetDateTime,
+ ) -> DocumentId {
+ use time::format_description::well_known::Rfc3339;
+ let doc_id = self.next_id("doc");
+ let chunk_id = self.next_id("chk");
+ let asset_id = self.next_id("ast");
+ let media_json = serde_json::to_string(&media).expect("serialize MediaType");
+ let updated_at_str = updated_at.format(&Rfc3339).expect("format updated_at");
+
+ let conn = self.inner.raw_conn();
+ conn.execute(
+ "INSERT OR IGNORE INTO assets (
+ asset_id, source_uri, workspace_path, media_type, byte_len,
+ checksum, storage_kind, storage_path, discovered_at
+ ) VALUES (?, ?, ?, ?, 0,
+ 'd0', 'reference', ?, '2024-01-01T00:00:00Z')",
+ rusqlite::params![asset_id, format!("file:///{path}"), path, media_json, path],
+ )
+ .expect("insert asset");
+
+ conn.execute(
+ "INSERT INTO documents (
+ doc_id, asset_id, workspace_path, title, lang,
+ source_type, trust_level, parser_version,
+ doc_version, schema_version, metadata_json,
+ provenance_json, created_at, updated_at
+ ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'pv1', 1, 1,
+ '{}', '{\"events\":[]}',
+ '2024-01-01T00:00:00Z', ?)",
+ rusqlite::params![doc_id, asset_id, path, updated_at_str],
+ )
+ .expect("insert document");
+
+ let empty_headings: Vec<&str> = vec![];
+ let heading_json = serde_json::to_string(&empty_headings).unwrap();
+ conn.execute(
+ "INSERT INTO chunks (
+ chunk_id, doc_id, text, heading_path_json, section_label,
+ source_spans_json, token_estimate, chunker_version,
+ policy_hash, block_ids_json, created_at
+ ) VALUES (?, ?, ?, ?, NULL,
+ '[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
+ 1, 'v1', 'h', '[]', '2024-01-01T00:00:00Z')",
+ rusqlite::params![chunk_id, doc_id, body, heading_json],
+ )
+ .expect("insert chunk");
+
+ DocumentId(doc_id)
+ }
+
+ /// Run a `SearchMode::Lexical` query (`k = 10`) with `filters`; panics if the search fails.
+ fn run_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
+ let q = SearchQuery {
+ text: query.to_string(),
+ mode: SearchMode::Lexical,
+ k: 10,
+ filters: filters.clone(),
+ };
+ self.inner.retriever().search(&q).expect("search")
+ }
+}
+
+// ── fb-36 filter tests ───────────────────────────────────────────────────
+
+#[test]
+fn lexical_filter_by_media() {
+ let env = TestEnv::new();
+ env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
+ env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);
+ let filters = SearchFilters {
+ media: vec!["pdf".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &filters);
+ assert_eq!(hits.len(), 1, "only pdf doc should match");
+ assert!(hits[0].doc_path.0.ends_with(".pdf"), "got: {}", hits[0].doc_path.0);
+}
+
+#[test]
+fn lexical_filter_by_ingested_after() {
+ let env = TestEnv::new();
+ env.insert_doc_with_updated_at(
+ "old.md",
+ "ingest test",
+ time::macros::datetime!(2020-01-01 00:00:00 UTC),
+ );
+ env.insert_doc_with_updated_at(
+ "new.md",
+ "ingest test",
+ time::macros::datetime!(2026-01-01 00:00:00 UTC),
+ );
+ let filters = SearchFilters {
+ ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
+ ..Default::default()
+ };
+ let hits = env.run_search("ingest", &filters);
+ assert_eq!(hits.len(), 1, "only post-2025 doc matches");
+}
+
+#[test]
+fn lexical_filter_by_doc_id() {
+ let env = TestEnv::new();
+ let target = env.insert_doc("a.md", "shared term");
+ env.insert_doc("b.md", "shared term");
+ let filters = SearchFilters {
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_search("shared", &filters);
+ assert!(!hits.is_empty(), "should get at least one hit for target doc");
+ for h in &hits {
+ assert_eq!(h.doc_id, target, "all hits must be from target doc");
+ }
+}
+
+#[test]
+fn lexical_filter_combinator_is_and() {
+ let env = TestEnv::new();
+ let target = env.insert_doc_with_media("a.md", "rust", MediaType::Markdown);
+ env.insert_doc_with_media("b.pdf", "rust", MediaType::Pdf);
+ let with_media = |kind: &str| SearchFilters {
+ media: vec![kind.to_string()],
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &with_media("markdown"));
+ assert!(!hits.is_empty() && hits.iter().all(|h| h.doc_id == target), "agreeing filters");
+ assert!(env.run_search("rust", &with_media("pdf")).is_empty(), "contradictory filters");
+}
+
+#[test]
+fn lexical_filter_unknown_media_returns_empty() {
+ let env = TestEnv::new();
+ env.insert_doc("a.md", "rust");
+ let filters = SearchFilters {
+ media: vec!["nonexistent_kind".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_search("rust", &filters);
+ assert!(hits.is_empty(), "unknown media → no hits, no error");
+}
+
+#[test]
+fn lexical_empty_filters_match_default_behavior() {
+ let env = TestEnv::new();
+ env.insert_doc("a.md", "rust");
+ let with_default = env.run_search("rust", &SearchFilters::default());
+ assert!(!with_default.is_empty());
+}
+
#[test]
fn lexical_snapshot_run_1() {
// Pinned snapshot. A small, deterministic corpus; the JSON shape of
--
2.49.1
From 86475e5ba2b7108912a489e4bea801011ab2c651 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:43:51 +0900
Subject: [PATCH 05/11] fix(search/lexical): use std::iter::repeat_n (clippy)
Per code review on 2c80e2a. manual-repeat-n lint triggers
for Rust 1.94+ when repeat().take() can be expressed as
repeat_n directly.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-search/src/lexical.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 954148c..871c22d 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -330,7 +330,7 @@ fn run_query(
// END IN (?, ...)
if !filters.media.is_empty() {
let placeholders: Vec<&str> =
- std::iter::repeat("?").take(filters.media.len()).collect();
+ std::iter::repeat_n("?", filters.media.len()).collect();
let placeholders = placeholders.join(",");
sql.push_str(&format!(
" AND f.doc_id IN (\
--
2.49.1
From c6cc1e2bfef6ac327143df3eeaef160060aa2838 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:50:56 +0900
Subject: [PATCH 06/11] feat(search/vector): media / ingested_after / doc_id
filters (fb-36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
filter_chunks helper in kebab-store-sqlite extended with the same 3
WHERE clauses as lexical. Vector still over-fetches k*2 then
post-filters via SqliteStore::filter_chunks; small k can return < k
hits when filters drop a lot — agent is expected to widen k or
paginate. AND combinator with existing filters.
- kebab-store-sqlite/src/filters.rs: media IN-list subquery, ingested_after
lexicographic >= compare, doc_id equality; mirrors lexical SQL arms
- 3 direct unit tests (filter_chunks_media_type/ingested_after/doc_id)
that run without AVX/Lance
- common/mod.rs: insert_doc / insert_doc_with_media / run_vector_search
helpers on HybridEnv for integration-test use
- hybrid.rs: 2 new #[ignore = "requires AVX..."] integration tests
(vector_filter_by_media, vector_filter_by_doc_id)
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-search/tests/common/mod.rs | 91 +++++++++-
crates/kebab-search/tests/hybrid.rs | 53 +++++-
crates/kebab-store-sqlite/src/filters.rs | 217 +++++++++++++++++++++++
3 files changed, 359 insertions(+), 2 deletions(-)
diff --git a/crates/kebab-search/tests/common/mod.rs b/crates/kebab-search/tests/common/mod.rs
index 69b87bd..d0ae1ad 100644
--- a/crates/kebab-search/tests/common/mod.rs
+++ b/crates/kebab-search/tests/common/mod.rs
@@ -19,7 +19,9 @@ use std::sync::Arc;
use kebab_config::Config;
use kebab_core::{
ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind,
- EmbeddingModelId, EmbeddingVersion, IndexVersion, VectorRecord, VectorStore,
+ EmbeddingModelId, EmbeddingVersion, IndexVersion, MediaType,
+ Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
+ VectorRecord, VectorStore,
};
use kebab_embed::{Embedder, MockEmbedder};
use kebab_search::{LexicalRetriever, VectorRetriever};
@@ -173,6 +175,93 @@ impl HybridEnv {
.unwrap();
}
+ /// High-level helper: seed a doc with the default media type
+ /// (Markdown) and embed its text. Returns the `DocumentId` so
+ /// callers can use it in `doc_id` filter tests.
+ pub fn insert_doc(&self, path: &str, text: &str) -> DocumentId {
+ self.insert_doc_with_media(path, text, MediaType::Markdown)
+ }
+
+ /// High-level helper: seed a doc with an explicit `MediaType`.
+ /// The `media_type` is serialized to JSON (mirrors how
+ /// `DocumentStore::put_document` writes it) and stored in `assets`.
+ pub fn insert_doc_with_media(
+ &self,
+ path: &str,
+ text: &str,
+ media: MediaType,
+ ) -> DocumentId {
+ // Derive deterministic IDs from the path so repeated calls with
+ // the same path are idempotent (INSERT OR IGNORE).
+ let path_hash: String = {
+ use std::collections::hash_map::DefaultHasher;
+ use std::hash::{Hash, Hasher};
+ let mut h = DefaultHasher::new();
+ path.hash(&mut h);
+ format!("{:032x}", h.finish())
+ };
+ let doc_id = format!("d{}", &path_hash[..31]);
+ let chunk_id = format!("c{}", &path_hash[..31]);
+ let asset_id = format!("a{}", &path_hash[..31]);
+
+ let media_json = serde_json::to_string(&media).expect("serialize MediaType");
+ let conn = self.sqlite.read_conn();
+ conn.execute(
+ "INSERT OR IGNORE INTO assets (
+ asset_id, source_uri, workspace_path, media_type, byte_len,
+ checksum, storage_kind, storage_path, discovered_at
+ ) VALUES (?, ?, ?, ?, 0,
+ 'deadbeefdeadbeefdeadbeefdeadbeef',
+ 'reference', ?, '1970-01-01T00:00:00Z')",
+ params![
+ asset_id,
+ format!("file:///{path}"),
+ path,
+ media_json,
+ path,
+ ],
+ )
+ .unwrap();
+ conn.execute(
+ "INSERT OR IGNORE INTO documents (
+ doc_id, asset_id, workspace_path, title, lang, source_type,
+ trust_level, parser_version, doc_version, schema_version,
+ metadata_json, provenance_json, created_at, updated_at
+ ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
+ '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
+ params![doc_id, asset_id, path],
+ )
+ .unwrap();
+ let heading_json = "[]";
+ conn.execute(
+ "INSERT OR IGNORE INTO chunks (
+ chunk_id, doc_id, text, heading_path_json, section_label,
+ source_spans_json, token_estimate, chunker_version,
+ policy_hash, block_ids_json, created_at
+ ) VALUES (?, ?, ?, ?, NULL,
+ '[{\"kind\":\"line\",\"start\":1,\"end\":1}]',
+ 1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')",
+ params![chunk_id, doc_id, text, heading_json],
+ )
+ .unwrap();
+ drop(conn);
+ self.embed_and_upsert(&chunk_id, &doc_id, text, &[]);
+ DocumentId(doc_id)
+ }
+
+ /// Run a `SearchMode::Vector` query against the seeded corpus and
+ /// return the resulting `Vec<SearchHit>`.
+ pub fn run_vector_search(&self, query: &str, filters: &SearchFilters) -> Vec<SearchHit> {
+ let r = self.vector_retriever();
+ let q = SearchQuery {
+ text: query.to_string(),
+ mode: SearchMode::Vector,
+ k: 10,
+ filters: filters.clone(),
+ };
+ r.search(&q).expect("vector search")
+ }
+
/// Embed `text` as a Document and upsert it as the embedding for
/// `chunk_id`. Drives the same code path production uses:
/// MockEmbedder → VectorRecord → LanceVectorStore::upsert →
diff --git a/crates/kebab-search/tests/hybrid.rs b/crates/kebab-search/tests/hybrid.rs
index 13f945d..912422a 100644
--- a/crates/kebab-search/tests/hybrid.rs
+++ b/crates/kebab-search/tests/hybrid.rs
@@ -15,7 +15,7 @@ use common::{
HybridEnv, id32, require_avx_or_panic, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION,
};
use kebab_core::{
- Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
+ MediaType, Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery,
};
use kebab_search::{FusionPolicy, HybridRetriever};
use rusqlite::params;
@@ -213,6 +213,57 @@ fn hybrid_snapshot_run_1() {
}
}
+/// p9-fb-36: vector post-filter must pass `media` through `filter_chunks`.
+/// Seeding two docs (markdown + pdf) and filtering for pdf-only must
+/// return only the pdf chunk, proving `LanceVectorStore::search` →
+/// `SqliteStore::filter_chunks` correctly applies the media arm.
+#[test]
+#[ignore = "requires AVX-capable hardware (LanceDB)"]
+fn vector_filter_by_media() {
+ require_avx_or_panic();
+ let env = HybridEnv::new();
+ env.insert_doc_with_media("md1.md", "rust ownership", MediaType::Markdown);
+ env.insert_doc_with_media("doc.pdf", "rust pdf body", MediaType::Pdf);
+
+ let filters = SearchFilters {
+ media: vec!["pdf".to_string()],
+ ..Default::default()
+ };
+ let hits = env.run_vector_search("rust", &filters);
+ assert_eq!(hits.len(), 1, "media filter must keep only pdf chunk");
+ assert!(
+ hits[0].doc_path.0.ends_with(".pdf"),
+ "expected .pdf path, got: {}",
+ hits[0].doc_path.0
+ );
+}
+
+/// p9-fb-36: vector post-filter must pass `doc_id` through `filter_chunks`.
+/// Seeding two docs with shared text, filtering by one doc_id must return
+/// only chunks from that doc.
+#[test]
+#[ignore = "requires AVX-capable hardware (LanceDB)"]
+fn vector_filter_by_doc_id() {
+ require_avx_or_panic();
+ let env = HybridEnv::new();
+ let target = env.insert_doc("a.md", "shared knowledge");
+ env.insert_doc("b.md", "shared knowledge");
+
+ let filters = SearchFilters {
+ doc_id: Some(target.clone()),
+ ..Default::default()
+ };
+ let hits = env.run_vector_search("shared", &filters);
+ assert!(
+ !hits.is_empty(),
+ "doc_id filter must return hits for the target doc"
+ );
+ assert!(
+ hits.iter().all(|h| h.doc_id == target),
+ "all hits must belong to the target doc_id"
+ );
+}
+
#[test]
#[ignore = "requires AVX-capable hardware (LanceDB)"]
fn vector_hit_carries_indexed_at() {
diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs
index 2b1ff00..4586236 100644
--- a/crates/kebab-store-sqlite/src/filters.rs
+++ b/crates/kebab-store-sqlite/src/filters.rs
@@ -129,6 +129,47 @@ impl SqliteStore {
}
}
+ // p9-fb-36: media_type filter (IN-list).
+ // `assets.media_type` JSON has two shapes:
+ // - unit variant (Markdown / Pdf / …): JSON text, e.g. `"markdown"`
+ // - tuple variant (Image(Png) / Audio(Mp3) / Other(s)): JSON object,
+ // e.g. `{"image": "png"}`
+ // Extract a unified "kind" string for both shapes; mirrors lexical.
+ if !filters.media.is_empty() {
+ let media_ph = std::iter::repeat_n("?", filters.media.len())
+ .collect::<Vec<_>>()
+ .join(",");
+ sql.push_str(&format!(
+ " AND d.doc_id IN (\
+ SELECT d2.doc_id FROM documents d2 \
+ JOIN assets a ON a.asset_id = d2.asset_id \
+ WHERE CASE \
+ WHEN json_type(a.media_type) = 'text' THEN json_extract(a.media_type, '$') \
+ ELSE (SELECT key FROM json_each(a.media_type) LIMIT 1) \
+ END IN ({media_ph}))"
+ ));
+ for kind in &filters.media {
+ bind.push(Box::new(kind.clone()));
+ }
+ }
+
+ // p9-fb-36: ingested_after filter.
+ // `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32); the bound is shifted
+ // to UTC first, since a `+09:00` bound would break the lexicographic >= compare.
+ if let Some(after) = &filters.ingested_after {
+ let formatted = after.to_offset(time::UtcOffset::UTC)
+ .format(&time::format_description::well_known::Rfc3339)
+ .expect("OffsetDateTime formats to RFC3339");
+ sql.push_str(" AND d.updated_at >= ?");
+ bind.push(Box::new(formatted));
+ }
+
+ // p9-fb-36: doc_id filter — single-doc scoping.
+ if let Some(id) = &filters.doc_id {
+ sql.push_str(" AND d.doc_id = ?");
+ bind.push(Box::new(id.0.clone()));
+ }
+
// Optional path_glob: applied in Rust on the rows we get back,
// not in SQL — matching `kb-search::lexical`'s post-filter so
// the glob semantics are byte-identical between retrievers.
@@ -280,6 +321,89 @@ mod tests {
.unwrap();
}
+ /// Variant of `seed_committed` that accepts an explicit `media_type`
+ /// JSON string (e.g. `r#""markdown""#` or `r#""pdf""#`) and an
+ /// explicit `updated_at` RFC3339 string so the fb-36 filter tests can
+ /// exercise `media` and `ingested_after` without going through the full
+ /// ingest pipeline.
+ #[allow(clippy::too_many_arguments)]
+ fn seed_committed_full(
+ store: &SqliteStore,
+ chunk_id: &str,
+ doc_id: &str,
+ workspace_path: &str,
+ lang: &str,
+ tags: &[&str],
+ trust: &str,
+ media_type_json: &str,
+ updated_at: &str,
+ ) {
+ let asset_id = format!("a{}", &doc_id[..31]);
+ {
+ let conn = store.lock_conn();
+ conn.execute(
+ "INSERT INTO assets (
+ asset_id, source_uri, workspace_path, media_type, byte_len,
+ checksum, storage_kind, storage_path, discovered_at
+ ) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef',
+ 'reference', ?, '1970-01-01T00:00:00Z')",
+ params![
+ asset_id,
+ format!("file://{workspace_path}"),
+ workspace_path,
+ media_type_json,
+ workspace_path,
+ ],
+ )
+ .unwrap();
+ conn.execute(
+ "INSERT INTO documents (
+ doc_id, asset_id, workspace_path, title, lang, source_type,
+ trust_level, parser_version, doc_version, schema_version,
+ metadata_json, provenance_json, created_at, updated_at
+ ) VALUES (?, ?, ?, NULL, ?, 'markdown', ?, 'v1', 1, 1,
+ '{}', '{}', '1970-01-01T00:00:00Z', ?)",
+ params![doc_id, asset_id, workspace_path, lang, trust, updated_at],
+ )
+ .unwrap();
+ for t in tags {
+ conn.execute(
+ "INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)",
+ params![doc_id, t],
+ )
+ .unwrap();
+ }
+ conn.execute(
+ "INSERT INTO chunks (
+ chunk_id, doc_id, text, heading_path_json, section_label,
+ source_spans_json, token_estimate, chunker_version,
+ policy_hash, block_ids_json, created_at
+ ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]',
+ '1970-01-01T00:00:00Z')",
+ params![chunk_id, doc_id],
+ )
+ .unwrap();
+ }
+
+ let embed_row = EmbeddingRecordRow {
+ embedding_id: format!("e{}", &chunk_id[..31]),
+ chunk_id: chunk_id.to_string(),
+ model_id: "m".to_string(),
+ model_version: "v1".to_string(),
+ dimensions: 4,
+ lance_table: "t".to_string(),
+ created_at: OffsetDateTime::UNIX_EPOCH,
+ };
+ store
+ .put_embedding_records_pending(std::slice::from_ref(&embed_row))
+ .unwrap();
+ store
+ .mark_embedding_records_committed(std::slice::from_ref(
+ &embed_row.embedding_id,
+ ))
+ .unwrap();
+ }
+
fn cid(s: &str) -> ChunkId {
ChunkId(s.to_string())
}
@@ -449,4 +573,97 @@ mod tests {
let out = store.filter_chunks(&[], &SearchFilters::default()).unwrap();
assert!(out.is_empty());
}
+
+ // ── p9-fb-36 new filter arms ─────────────────────────────────────────
+
+ #[test]
+ fn filter_chunks_media_type_keeps_matching_kind() {
+ // c1 = markdown, c2 = pdf. Filter for pdf → only c2 survives.
+ let tmp = TempDir::new().unwrap();
+ let store = open_store(&tmp);
+ let c1 = "11111111111111111111111111111111";
+ let c2 = "22222222222222222222222222222222";
+ seed_committed_full(
+ &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
+ "notes/a.md", "en", &[], "primary",
+ r#""markdown""#,
+ "1970-01-01T00:00:00Z",
+ );
+ seed_committed_full(
+ &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
+ "notes/b.pdf", "en", &[], "primary",
+ r#""pdf""#,
+ "1970-01-01T00:00:00Z",
+ );
+
+ let f = SearchFilters {
+ media: vec!["pdf".to_string()],
+ ..Default::default()
+ };
+ let out = store
+ .filter_chunks(&[cid(c1), cid(c2)], &f)
+ .unwrap();
+ assert_eq!(out, vec![cid(c2)], "only pdf chunk should survive media filter");
+ }
+
+ #[test]
+ fn filter_chunks_ingested_after_excludes_old_docs() {
+ // c1 ingested 2020, c2 ingested 2026. filter ingested_after=2025 → only c2.
+ let tmp = TempDir::new().unwrap();
+ let store = open_store(&tmp);
+ let c1 = "11111111111111111111111111111111";
+ let c2 = "22222222222222222222222222222222";
+ seed_committed_full(
+ &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
+ "old.md", "en", &[], "primary",
+ r#""markdown""#,
+ "2020-01-01T00:00:00Z",
+ );
+ seed_committed_full(
+ &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
+ "new.md", "en", &[], "primary",
+ r#""markdown""#,
+ "2026-01-01T00:00:00Z",
+ );
+
+ let f = SearchFilters {
+ ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)),
+ ..Default::default()
+ };
+ let out = store
+ .filter_chunks(&[cid(c1), cid(c2)], &f)
+ .unwrap();
+ assert_eq!(out, vec![cid(c2)], "only post-2025 chunk should survive ingested_after filter");
+ }
+
+ #[test]
+ fn filter_chunks_doc_id_scopes_to_single_doc() {
+ // c1 belongs to d1, c2 belongs to d2. filter doc_id=d1 → only c1.
+ let tmp = TempDir::new().unwrap();
+ let store = open_store(&tmp);
+ let c1 = "11111111111111111111111111111111";
+ let c2 = "22222222222222222222222222222222";
+ let d1 = "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1";
+ seed_committed_full(
+ &store, c1, d1,
+ "a.md", "en", &[], "primary",
+ r#""markdown""#,
+ "1970-01-01T00:00:00Z",
+ );
+ seed_committed_full(
+ &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2",
+ "b.md", "en", &[], "primary",
+ r#""markdown""#,
+ "1970-01-01T00:00:00Z",
+ );
+
+ let f = SearchFilters {
+ doc_id: Some(kebab_core::DocumentId(d1.to_string())),
+ ..Default::default()
+ };
+ let out = store
+ .filter_chunks(&[cid(c1), cid(c2)], &f)
+ .unwrap();
+ assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
+ }
}
--
2.49.1
From 6a18847892c193c0122329d7d0ab8403c686c4de Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 03:57:55 +0900
Subject: [PATCH 07/11] feat(cli): kebab search filter flags (fb-36)
7 new flags: --tag (repeatable), --lang, --path-glob,
--trust-min (value_enum), --media (csv with `md` alias),
--ingested-after (RFC3339; config_invalid on parse fail),
--doc-id. Dispatch translates clap values into SearchFilters
and propagates structured errors through the existing
StructuredError wrapper from fb-34.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-cli/src/main.rs | 111 ++++++++++++++++++++++++++++++++++-
1 file changed, 110 insertions(+), 1 deletion(-)
diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs
index c92ba27..7e41d8a 100644
--- a/crates/kebab-cli/src/main.rs
+++ b/crates/kebab-cli/src/main.rs
@@ -131,6 +131,38 @@ enum Cmd {
/// `corpus_revision` returns `error.v1.code = stale_cursor`.
#[arg(long)]
cursor: Option,
+
+ /// p9-fb-36: filter by `metadata.tags`. Repeatable; OR-within (any tag).
+ #[arg(long)]
+ tag: Vec<String>,
+
+ /// p9-fb-36: filter by `documents.lang` (ISO code).
+ #[arg(long)]
+ lang: Option<String>,
+
+ /// p9-fb-36: filter by `documents.workspace_path` glob.
+ #[arg(long)]
+ path_glob: Option<String>,
+
+ /// p9-fb-36: filter by minimum `documents.trust_level`.
+ #[arg(long, value_enum)]
+ trust_min: Option<TrustLevelFlag>,
+
+ /// p9-fb-36: filter by `assets.media_type` kind. Comma-separated.
+ /// Aliases: `md` → `markdown`. Other accepted: `markdown`, `pdf`,
+ /// `image`, `audio`, `other`. Unknown values match nothing.
+ #[arg(long, value_delimiter = ',')]
+ media: Vec<String>,
+
+ /// p9-fb-36: filter to docs whose `updated_at` is >= this RFC3339
+ /// timestamp (UTC). Invalid format → exit 2 with error.v1
+ /// code = config_invalid.
+ #[arg(long)]
+ ingested_after: Option<String>,
+
+ /// p9-fb-36: filter to a single doc by id.
+ #[arg(long)]
+ doc_id: Option<String>,
},
/// Retrieval-augmented question answering.
@@ -351,6 +383,25 @@ impl From for kebab_core::SearchMode {
}
}
+/// p9-fb-36: clap value enum for `--trust-min`. Maps to
+/// `kebab_core::TrustLevel` via `From<TrustLevelFlag>`.
+#[derive(clap::ValueEnum, Clone, Debug)]
+enum TrustLevelFlag {
+ Primary,
+ Secondary,
+ Generated,
+}
+
+impl From<TrustLevelFlag> for kebab_core::TrustLevel {
+ fn from(f: TrustLevelFlag) -> Self {
+ match f {
+ TrustLevelFlag::Primary => kebab_core::TrustLevel::Primary,
+ TrustLevelFlag::Secondary => kebab_core::TrustLevel::Secondary,
+ TrustLevelFlag::Generated => kebab_core::TrustLevel::Generated,
+ }
+ }
+}
+
/// Parse boolean env var accepting "1", "true", "yes", "on" (case-insensitive)
/// as truthy; "0", "false", "no", "off" as falsy. Used for `KEBAB_READONLY`.
fn parse_bool_env(s: &str) -> Result {
@@ -611,13 +662,71 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
max_tokens,
snippet_chars,
cursor,
+ tag,
+ lang,
+ path_glob,
+ trust_min,
+ media,
+ ingested_after,
+ doc_id,
} => {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
+
+ // p9-fb-36: normalize --media values: trim, lowercase, alias md → markdown.
+ fn normalize_media_alias(s: &str) -> String {
+ match s.trim().to_ascii_lowercase().as_str() {
+ "md" => "markdown".to_string(),
+ other => other.to_string(),
+ }
+ }
+ let media_norm: Vec<String> =
+ media.iter().map(|s| normalize_media_alias(s)).collect();
+
+ // p9-fb-36: parse --ingested-after as RFC3339; structured error on failure.
+ let ingested_after_parsed: Option<time::OffsetDateTime> =
+ match ingested_after.as_deref() {
+ Some(s) => {
+ match time::OffsetDateTime::parse(
+ s,
+ &time::format_description::well_known::Rfc3339,
+ ) {
+ Ok(ts) => Some(ts),
+ Err(e) => {
+ return Err(anyhow::Error::new(
+ kebab_app::StructuredError(kebab_app::ErrorV1 {
+ schema_version: kebab_app::ERROR_V1_ID.to_string(),
+ code: "config_invalid".to_string(),
+ message: format!(
+ "--ingested-after: invalid RFC3339 timestamp '{s}': {e}"
+ ),
+ details: serde_json::Value::Null,
+ hint: Some(
+ "expected format like 2026-04-01T00:00:00Z".to_string(),
+ ),
+ }),
+ ));
+ }
+ }
+ }
+ None => None,
+ };
+
+ // p9-fb-36: build SearchFilters from the 7 new flags.
+ let filters = kebab_core::SearchFilters {
+ tags_any: tag.clone(),
+ lang: lang.as_ref().map(|s| kebab_core::Lang(s.clone())),
+ path_glob: path_glob.clone(),
+ trust_min: trust_min.clone().map(Into::into),
+ media: media_norm,
+ ingested_after: ingested_after_parsed,
+ doc_id: doc_id.as_ref().map(|s| kebab_core::DocumentId(s.clone())),
+ };
+
let q = kebab_core::SearchQuery {
text: query.clone(),
mode: (*mode).into(),
k: *k,
- filters: kebab_core::SearchFilters::default(),
+ filters,
};
let opts = kebab_core::SearchOpts {
max_tokens: *max_tokens,
--
2.49.1
From 4e0379c04fc3ae6bdf96d5a50955e398a6dd2565 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 04:06:21 +0900
Subject: [PATCH 08/11] =?UTF-8?q?test(cli):=20wire=5Fsearch=5Ffilters=20?=
=?UTF-8?q?=E2=80=94=20lexical-only=20integration=20tests=20(fb-36)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cover: --doc-id scoping, --ingested-after validation error,
--media md alias, --tag repeatable + frontmatter parsing.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-cli/tests/wire_search_filters.rs | 226 ++++++++++++++++++
1 file changed, 226 insertions(+)
create mode 100644 crates/kebab-cli/tests/wire_search_filters.rs
diff --git a/crates/kebab-cli/tests/wire_search_filters.rs b/crates/kebab-cli/tests/wire_search_filters.rs
new file mode 100644
index 0000000..6c68aef
--- /dev/null
+++ b/crates/kebab-cli/tests/wire_search_filters.rs
@@ -0,0 +1,226 @@
+//! p9-fb-36: CLI integration tests for search filter flags.
+//!
+//! Lexical-only — no fastembed / no Ollama. Each test builds its own
+//! TempDir KB via `common::write_config` + `common::ingest` and drives
+//! `kebab search` through `common::run_search_with_args` or direct
+//! `Command` invocations. Verifies:
+//!
+//! - `--doc-id <id>` restricts all returned hits to the target document.
+//! - `--ingested-after <bad-value>` exits non-zero and emits `error.v1` on
+//! stderr with `code = "config_invalid"`.
+//! - `--media md` (alias) normalises to `markdown` and matches `.md` docs.
+//! - `--tag <tag>` (repeatable, OR-within) filters by frontmatter tags.
+
+mod common;
+
+use serde_json::Value;
+use std::fs;
+use std::process::Command;
+
+// ---------------------------------------------------------------------------
+// Test 1: --doc-id restricts hits to a single document
+// ---------------------------------------------------------------------------
+
+/// Ingests two markdown docs that both contain the term "rust", discovers a
+/// `doc_id` from an unfiltered lexical search, then repeats the search with
+/// `--doc-id` and asserts that every returned hit carries that same `doc_id`.
+#[test]
+fn search_with_doc_id_filter_returns_only_target_doc() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+
+ // Two docs that both contain the search term.
+ fs::write(workspace.join("a.md"), "# Alpha\n\nrust ownership rules\n").unwrap();
+ fs::write(workspace.join("b.md"), "# Beta\n\nrust borrow checker\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ // First, search without a doc-id filter to find what doc_ids exist.
+ let (stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "rust"],
+ );
+ let resp: Value = serde_json::from_str(stdout.trim())
+ .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+ let hits = resp["hits"].as_array().expect("hits array");
+ // Precondition: without the filter both docs contribute hits, so the
+ // filtered assertion below cannot pass trivially.
+ assert!(
+ hits.len() >= 2,
+ "expected ≥2 hits from two docs before filter: {resp}"
+ );
+
+ // Grab one doc_id from the results.
+ let target_doc_id = hits[0]["doc_id"]
+ .as_str()
+ .expect("doc_id string")
+ .to_string();
+
+ // Re-search with --doc-id set to the first hit's doc_id.
+ let (stdout2, _) = common::run_search_with_args(
+ &cfg,
+ &[
+ "--json",
+ "--mode",
+ "lexical",
+ "--doc-id",
+ &target_doc_id,
+ "rust",
+ ],
+ );
+ let resp2: Value = serde_json::from_str(stdout2.trim())
+ .unwrap_or_else(|e| panic!("not JSON after filter: {stdout2:?}: {e}"));
+ let filtered_hits = resp2["hits"].as_array().expect("hits array (filtered)");
+
+ // The filter must narrow the result set, not empty it.
+ assert!(
+ !filtered_hits.is_empty(),
+ "expected at least one hit for the target doc"
+ );
+ for hit in filtered_hits {
+ let got = hit["doc_id"].as_str().expect("doc_id string in hit");
+ assert_eq!(
+ got, target_doc_id,
+ "--doc-id filter must restrict all hits to target doc, got {got}"
+ );
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: --ingested-after with bad RFC3339 → exit non-zero + error.v1
+// ---------------------------------------------------------------------------
+
+/// Runs the `kebab` binary directly with `--ingested-after not-a-date` and
+/// asserts that it exits non-zero and writes an `error.v1` JSON line to
+/// stderr whose `code` is `config_invalid`.
+#[test]
+fn search_with_invalid_ingested_after_emits_config_invalid() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+ fs::write(workspace.join("a.md"), "# T\n\nrust stuff\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ // Invoke the binary through `Command` so the exit status and stderr can
+ // be inspected directly.
+ let bin = env!("CARGO_BIN_EXE_kebab");
+ let out = Command::new(bin)
+ .args([
+ "--config",
+ cfg.to_str().unwrap(),
+ "--json",
+ "search",
+ "--mode",
+ "lexical",
+ "--ingested-after",
+ "not-a-date",
+ "rust",
+ ])
+ .output()
+ .expect("kebab search --ingested-after bad");
+
+ assert!(
+ !out.status.success(),
+ "expected non-zero exit for invalid --ingested-after, got: status={} stderr={}",
+ out.status,
+ String::from_utf8_lossy(&out.stderr)
+ );
+
+ let stderr = String::from_utf8_lossy(&out.stderr);
+ // Find the error.v1 ndjson line on stderr (one JSON event per line).
+ // Lines that are not valid JSON, or carry another schema_version, are
+ // skipped rather than treated as failures.
+ let err_line = stderr
+ .lines()
+ .find(|l| {
+ serde_json::from_str::<Value>(l)
+ .ok()
+ .and_then(|v| {
+ v.get("schema_version")
+ .and_then(|s| s.as_str())
+ .map(String::from)
+ })
+ .as_deref()
+ == Some("error.v1")
+ })
+ .unwrap_or_else(|| panic!("no error.v1 line on stderr: {stderr:?}"));
+
+ let v: Value = serde_json::from_str(err_line).expect("error.v1 json");
+ assert_eq!(
+ v["code"], "config_invalid",
+ "code must be config_invalid for bad RFC3339: {err_line}"
+ );
+}
+
+// ---------------------------------------------------------------------------
+// Test 3: --media md (alias) normalises to markdown and matches .md docs
+// ---------------------------------------------------------------------------
+
+/// Ingests a single markdown file and asserts that a lexical search with
+/// `--media md` returns at least one hit. Only non-emptiness is checked; the
+/// contents of the hits are not inspected.
+#[test]
+fn search_with_media_filter_md_alias_normalizes_to_markdown() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+
+ // Only a markdown file — the `md` alias should match it.
+ fs::write(workspace.join("notes.md"), "# Notes\n\nrust async programming\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ let (stdout, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "--media", "md", "rust"],
+ );
+ let resp: Value = serde_json::from_str(stdout.trim())
+ .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
+ let hits = resp["hits"].as_array().expect("hits array");
+
+ assert!(
+ !hits.is_empty(),
+ "--media md must match the markdown doc; got 0 hits: {resp}"
+ );
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: --tag (repeatable, OR-within) filters by frontmatter tags
+// ---------------------------------------------------------------------------
+
+/// Ingests one doc whose frontmatter declares `tags: [rust, systems]` and one
+/// untagged doc with the same keyword in its body, then asserts that
+/// `--tag rust` returns a non-empty hit list in which every hit's `doc_path`
+/// ends with `rust_doc.md`.
+#[test]
+fn search_with_tag_filter_matches_frontmatter_tags() {
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+
+ // Doc with `rust` tag.
+ fs::write(
+ workspace.join("rust_doc.md"),
+ "---\ntags: [rust, systems]\n---\n# Rust\n\nrust ownership\n",
+ )
+ .unwrap();
+ // Doc without the tag (but same keyword in body so it appears in
+ // unfiltered results — the tag filter must exclude it).
+ fs::write(
+ workspace.join("other_doc.md"),
+ "# Other\n\nrust programming\n",
+ )
+ .unwrap();
+ common::ingest(&cfg, &workspace);
+
+ // Without filter — both docs must produce hits.
+ let (unfiltered, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "rust"],
+ );
+ let uresp: Value = serde_json::from_str(unfiltered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
+ let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
+ assert!(
+ uhits.len() >= 2,
+ "expected ≥2 hits before tag filter: {uresp}"
+ );
+
+ // With --tag rust — only the tagged doc's hits should appear.
+ let (filtered, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "--tag", "rust", "rust"],
+ );
+ let fresp: Value = serde_json::from_str(filtered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (tag-filtered): {filtered:?}: {e}"));
+ let fhits = fresp["hits"].as_array().expect("filtered hits array");
+
+ assert!(
+ !fhits.is_empty(),
+ "--tag rust must match the tagged doc; got 0 hits: {fresp}"
+ );
+
+ // Every returned hit must come from rust_doc.md (the tagged file).
+ for hit in fhits {
+ let path = hit["doc_path"].as_str().unwrap_or("");
+ assert!(
+ path.ends_with("rust_doc.md"),
+ "--tag rust must only return hits from the tagged doc, got path={path}"
+ );
+ }
+}
--
2.49.1
From b06f4654e712833653bf6aa901709c16b7d8132b Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 04:11:27 +0900
Subject: [PATCH 09/11] feat(mcp): kebab__search filter inputs (fb-36)
7 new optional inputs on SearchInput: tags, lang, path_glob,
trust_min, media, ingested_after, doc_id. Validation surfaces as
error.v1 code = invalid_input via StructuredError. Dispatch builds
SearchFilters from the inputs and forwards through the existing
search_with_opts_with_config facade.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-mcp/Cargo.toml | 2 +
crates/kebab-mcp/src/tools/search.rs | 92 +++++++++-
crates/kebab-mcp/tests/tools_call_fetch.rs | 7 +
crates/kebab-mcp/tests/tools_call_search.rs | 179 ++++++++++++++++++++
4 files changed, 278 insertions(+), 2 deletions(-)
diff --git a/crates/kebab-mcp/Cargo.toml b/crates/kebab-mcp/Cargo.toml
index dfd6136..9ecea0d 100644
--- a/crates/kebab-mcp/Cargo.toml
+++ b/crates/kebab-mcp/Cargo.toml
@@ -19,6 +19,8 @@ tracing = { workspace = true }
# /dependencies endpoint — rmcp declares optional schemars = "^1.0").
schemars = "1"
+time = { workspace = true }
+
kebab-app = { path = "../kebab-app" }
kebab-config = { path = "../kebab-config" }
kebab-core = { path = "../kebab-core" }
diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs
index e5f7b4e..2027024 100644
--- a/crates/kebab-mcp/src/tools/search.rs
+++ b/crates/kebab-mcp/src/tools/search.rs
@@ -1,5 +1,7 @@
//! `search` tool — wraps `kebab_app::search_with_opts_with_config`.
-//! Input: { query, mode?, k?, max_tokens?, snippet_chars?, cursor? }.
+//! Input: { query, mode?, k?, max_tokens?, snippet_chars?, cursor?,
+//! tags?, lang?, path_glob?, trust_min?, media?,
+//! ingested_after?, doc_id? }.
//! Output: search_response.v1 envelope (hits + next_cursor + truncated).
//!
//! First tool with a non-empty `inputSchema`: `SearchInput` derives
@@ -27,6 +29,22 @@ pub struct SearchInput {
pub snippet_chars: Option,
/// p9-fb-34: opaque cursor from a previous response.
pub cursor: Option,
+ /// p9-fb-36: filter by `metadata.tags` (OR-within).
+ pub tags: Option<Vec<String>>,
+ /// p9-fb-36: filter by `documents.lang` (ISO code).
+ pub lang: Option<String>,
+ /// p9-fb-36: filter by `documents.workspace_path` glob.
+ pub path_glob: Option<String>,
+ /// p9-fb-36: filter by minimum `documents.trust_level`.
+ /// Accepts: `"primary"`, `"secondary"`, `"generated"`.
+ pub trust_min: Option<String>,
+ /// p9-fb-36: filter by `assets.media_type` kind. IN-list. Accepts:
+ /// `"markdown"`, `"pdf"`, `"image"`, `"audio"`, `"other"`. Aliases: `md` → `markdown`.
+ pub media: Option<Vec<String>>,
+ /// p9-fb-36: RFC3339 UTC timestamp. Invalid format → invalid_input.
+ pub ingested_after: Option<String>,
+ /// p9-fb-36: filter to a single doc.
+ pub doc_id: Option<String>,
}
pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult {
@@ -37,11 +55,62 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult {
"vector" => kebab_core::SearchMode::Vector,
_ => kebab_core::SearchMode::Hybrid,
};
+
+ // p9-fb-36: parse filter inputs, returning invalid_input on bad values.
+ let trust_min = match input.trust_min.as_deref() {
+ Some(s) => match s.to_ascii_lowercase().as_str() {
+ "primary" => Some(kebab_core::TrustLevel::Primary),
+ "secondary" => Some(kebab_core::TrustLevel::Secondary),
+ "generated" => Some(kebab_core::TrustLevel::Generated),
+ other => {
+ return invalid_input(&format!(
+ "trust_min: unknown level '{other}'; expected primary|secondary|generated"
+ ));
+ }
+ },
+ None => None,
+ };
+
+ let ingested_after = match input.ingested_after.as_deref() {
+ Some(s) => {
+ match time::OffsetDateTime::parse(
+ s,
+ &time::format_description::well_known::Rfc3339,
+ ) {
+ Ok(ts) => Some(ts),
+ Err(e) => {
+ return invalid_input(&format!(
+ "ingested_after: invalid RFC3339 '{s}': {e}"
+ ));
+ }
+ }
+ }
+ None => None,
+ };
+
+ let media: Vec<String> = input
+ .media
+ .clone()
+ .unwrap_or_default()
+ .iter()
+ .map(|s| normalize_media_alias(s))
+ .collect();
+
+ let filters = kebab_core::SearchFilters {
+ tags_any: input.tags.clone().unwrap_or_default(),
+ lang: input.lang.clone().map(kebab_core::Lang),
+ path_glob: input.path_glob.clone(),
+ trust_min,
+ media,
+ ingested_after,
+ doc_id: input.doc_id.clone().map(kebab_core::DocumentId),
+ };
+
let query = kebab_core::SearchQuery {
text: input.query,
mode,
k,
- filters: kebab_core::SearchFilters::default(),
+ filters,
};
let opts = kebab_core::SearchOpts {
max_tokens: input.max_tokens,
@@ -81,3 +150,22 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult {
Err(e) => to_tool_error(&e),
}
}
+
+/// Lower-cases a media kind (ASCII only) and expands the `md` alias to
+/// `markdown`. Any other value is returned lower-cased and otherwise unchanged.
+fn normalize_media_alias(s: &str) -> String {
+ let lowered = s.to_ascii_lowercase();
+ if lowered == "md" {
+ return "markdown".to_string();
+ }
+ lowered
+}
+
+/// Builds an `error.v1` `StructuredError` with `code = "invalid_input"` and
+/// the given message, wraps it in `anyhow::Error`, and converts it to a
+/// `CallToolResult` via `to_tool_error`.
+fn invalid_input(msg: &str) -> CallToolResult {
+ use kebab_app::{ErrorV1, StructuredError};
+ let err = anyhow::Error::new(StructuredError(ErrorV1 {
+ schema_version: "error.v1".to_string(),
+ code: "invalid_input".to_string(),
+ message: msg.to_string(),
+ // No structured details or hint are attached to input-validation errors.
+ details: serde_json::Value::Null,
+ hint: None,
+ }));
+ to_tool_error(&err)
+}
diff --git a/crates/kebab-mcp/tests/tools_call_fetch.rs b/crates/kebab-mcp/tests/tools_call_fetch.rs
index 5627e93..8da70a7 100644
--- a/crates/kebab-mcp/tests/tools_call_fetch.rs
+++ b/crates/kebab-mcp/tests/tools_call_fetch.rs
@@ -62,6 +62,13 @@ async fn fetch_tool_chunk_returns_fetch_result_v1() {
max_tokens: None,
snippet_chars: None,
cursor: None,
+ tags: None,
+ lang: None,
+ path_glob: None,
+ trust_min: None,
+ media: None,
+ ingested_after: None,
+ doc_id: None,
},
);
let search_text = match &search_result.content.first().unwrap().raw {
diff --git a/crates/kebab-mcp/tests/tools_call_search.rs b/crates/kebab-mcp/tests/tools_call_search.rs
index 5995292..58a32d8 100644
--- a/crates/kebab-mcp/tests/tools_call_search.rs
+++ b/crates/kebab-mcp/tests/tools_call_search.rs
@@ -58,6 +58,13 @@ async fn search_tool_returns_search_response_v1() {
max_tokens: None,
snippet_chars: None,
cursor: None,
+ tags: None,
+ lang: None,
+ path_glob: None,
+ trust_min: None,
+ media: None,
+ ingested_after: None,
+ doc_id: None,
},
);
@@ -108,3 +115,175 @@ async fn search_tool_returns_search_response_v1() {
"envelope should carry next_cursor (possibly null)"
);
}
+
+/// p9-fb-36: search with doc_id filter — only hits from the target doc.
+///
+/// Ingests two markdown docs that both mention "kebab", takes the `doc_id` of
+/// the first hit from an unfiltered lexical search, then calls the search tool
+/// again with `doc_id` set and asserts every hit carries that `doc_id`.
+#[tokio::test]
+async fn search_with_doc_id_filter_returns_only_target() {
+ let dir = tempfile::tempdir().unwrap();
+ let data_dir = dir.path().join("data");
+ let workspace_root = dir.path().join("notes");
+ fs::create_dir_all(&data_dir).unwrap();
+ fs::create_dir_all(&workspace_root).unwrap();
+
+ let config = minimal_config(&data_dir, &workspace_root);
+
+ // Write two markdown documents, both containing the query term.
+ fs::write(
+ workspace_root.join("a.md"),
+ "# Alpha\n\nThis document mentions kebab and flatbread.",
+ )
+ .unwrap();
+ fs::write(
+ workspace_root.join("b.md"),
+ "# Beta\n\nAnother document about kebab wraps and fillings.",
+ )
+ .unwrap();
+
+ // Empty include/exclude lists: the scope is just the workspace root.
+ let scope = SourceScope {
+ root: workspace_root.clone(),
+ include: vec![],
+ exclude: vec![],
+ };
+ let _ = kebab_app::ingest_with_config(config.clone(), scope, false).unwrap();
+
+ let state = KebabAppState::new(config, None);
+ let handler = KebabHandler::new(state);
+
+ // First: unfiltered search to discover a doc_id from one of the docs.
+ let unfiltered = kebab_mcp::tools::search::handle(
+ handler.state(),
+ kebab_mcp::tools::search::SearchInput {
+ query: "kebab".to_string(),
+ mode: Some("lexical".to_string()),
+ k: Some(10),
+ max_tokens: None,
+ snippet_chars: None,
+ cursor: None,
+ tags: None,
+ lang: None,
+ path_glob: None,
+ trust_min: None,
+ media: None,
+ ingested_after: None,
+ doc_id: None,
+ },
+ );
+ assert!(
+ !unfiltered.is_error.unwrap_or(false),
+ "unfiltered search failed: {:?}",
+ unfiltered
+ );
+ // The tool result is expected to carry the JSON envelope as text content.
+ let unfiltered_text = match &unfiltered.content.first().unwrap().raw {
+ RawContent::Text(t) => t.text.clone(),
+ other => panic!("expected text content, got {other:?}"),
+ };
+ let unfiltered_v: serde_json::Value = serde_json::from_str(&unfiltered_text).unwrap();
+ let hits = unfiltered_v["hits"].as_array().expect("hits must be array");
+ // Precondition: both docs produce hits, so the filtered check is meaningful.
+ assert!(hits.len() >= 2, "expected hits from both docs");
+
+ // Pick the doc_id of the first hit.
+ let target_doc_id = hits[0]["doc_id"]
+ .as_str()
+ .expect("doc_id on first hit")
+ .to_string();
+
+ // Now search with doc_id filter — all results must belong to that doc.
+ let filtered = kebab_mcp::tools::search::handle(
+ handler.state(),
+ kebab_mcp::tools::search::SearchInput {
+ query: "kebab".to_string(),
+ mode: Some("lexical".to_string()),
+ k: Some(10),
+ max_tokens: None,
+ snippet_chars: None,
+ cursor: None,
+ tags: None,
+ lang: None,
+ path_glob: None,
+ trust_min: None,
+ media: None,
+ ingested_after: None,
+ doc_id: Some(target_doc_id.clone()),
+ },
+ );
+ assert!(
+ !filtered.is_error.unwrap_or(false),
+ "filtered search failed: {:?}",
+ filtered
+ );
+ let filtered_text = match &filtered.content.first().unwrap().raw {
+ RawContent::Text(t) => t.text.clone(),
+ other => panic!("expected text content, got {other:?}"),
+ };
+ let filtered_v: serde_json::Value = serde_json::from_str(&filtered_text).unwrap();
+ let filtered_hits = filtered_v["hits"].as_array().expect("hits must be array");
+
+ assert!(
+ !filtered_hits.is_empty(),
+ "expected at least one hit for target doc"
+ );
+ for hit in filtered_hits {
+ assert_eq!(
+ hit["doc_id"].as_str(),
+ Some(target_doc_id.as_str()),
+ "all filtered hits must belong to the target doc"
+ );
+ }
+}
+
+/// p9-fb-36: invalid RFC3339 for ingested_after → invalid_input error.v1.
+///
+/// Calls the search tool with `ingested_after = "garbage"` and asserts the
+/// result is flagged as an error whose text content is an `error.v1` envelope
+/// with `code = "invalid_input"`.
+#[tokio::test]
+async fn search_with_invalid_ingested_after_returns_invalid_input() {
+ let dir = tempfile::tempdir().unwrap();
+ let data_dir = dir.path().join("data");
+ let workspace_root = dir.path().join("notes");
+ fs::create_dir_all(&data_dir).unwrap();
+ fs::create_dir_all(&workspace_root).unwrap();
+
+ // Nothing is ingested here: the test only exercises input validation.
+ let config = minimal_config(&data_dir, &workspace_root);
+ let state = KebabAppState::new(config, None);
+ let handler = KebabHandler::new(state);
+
+ let result = kebab_mcp::tools::search::handle(
+ handler.state(),
+ kebab_mcp::tools::search::SearchInput {
+ query: "kebab".to_string(),
+ mode: None,
+ k: None,
+ max_tokens: None,
+ snippet_chars: None,
+ cursor: None,
+ tags: None,
+ lang: None,
+ path_glob: None,
+ trust_min: None,
+ media: None,
+ ingested_after: Some("garbage".to_string()),
+ doc_id: None,
+ },
+ );
+
+ assert!(
+ result.is_error.unwrap_or(false),
+ "expected isError=true for invalid ingested_after"
+ );
+ let content = result
+ .content
+ .first()
+ .expect("expected at least one content item");
+ let text = match &content.raw {
+ RawContent::Text(t) => &t.text,
+ other => panic!("expected text content, got {other:?}"),
+ };
+ // The error payload must itself be a parseable error.v1 JSON document.
+ let v: serde_json::Value = serde_json::from_str(text).unwrap();
+ assert_eq!(
+ v.get("schema_version").and_then(|s| s.as_str()),
+ Some("error.v1"),
+ "must carry error.v1 envelope"
+ );
+ assert_eq!(
+ v.get("code").and_then(|s| s.as_str()),
+ Some("invalid_input"),
+ "code must be invalid_input for bad RFC3339"
+ );
+}
--
2.49.1
From 6e7446861bb4f90545408464d169a1c7960df4f0 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 04:26:27 +0900
Subject: [PATCH 10/11] docs(fb-36): README + SMOKE + INDEX + skill notes
Co-Authored-By: Claude Opus 4.7 (1M context)
---
README.md | 2 +-
docs/SMOKE.md | 16 ++++++++++++++++
integrations/claude-code/kebab/SKILL.md | 3 ++-
tasks/INDEX.md | 2 +-
tasks/p9/p9-fb-36-search-filters.md | 7 +++++--
5 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index b7595a6..3c699f3 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ kebab doctor
|------|------|
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
| `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. |
-| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor` |
+| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` / `--media` 는 각각 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). |
| `kebab list docs` | 색인된 문서 목록 |
| `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 |
| `kebab fetch chunk [--context N]` / `kebab fetch doc [--max-tokens N]` / `kebab fetch span [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. |
diff --git a/docs/SMOKE.md b/docs/SMOKE.md
index 272c1f7..9a68800 100644
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -190,6 +190,22 @@ kebab fetch span "$DOC_ID" 1 5 --json | jq '{line_start, line_end, effective_end
PDF / audio docs reject `fetch span` with `error.v1.code = span_not_supported` — use `fetch chunk` (PDF chunks are page-aligned) or `fetch doc` instead.
+### Filter args (fb-36)
+
+````bash
+# Filter by media kind (md alias normalizes to markdown).
+kebab search "rust" --media md --json | jq '.hits | length'
+
+# Filter by ingest timestamp (RFC3339).
+kebab search "rust" --ingested-after 2026-04-01T00:00:00Z --json
+
+# Combine: doc-id scope + tag (AND across flags).
+kebab search "rust" --doc-id "$DOC_ID" --tag rust --json
+````
+
+Bad `--ingested-after` → `error.v1.code = config_invalid`, exit 2.
+Unknown `--media` value → silently empty (no error).
+
## P6-4 이미지 ingestion 옵션
`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략):
diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md
index 2faedda..fea4e2e 100644
--- a/integrations/claude-code/kebab/SKILL.md
+++ b/integrations/claude-code/kebab/SKILL.md
@@ -48,11 +48,12 @@ Use when the user wants to **find** a doc, or when you (the model) need raw chun
Input:
```json
-{ "query": "", "mode": "hybrid", "k": 10, "max_tokens": null, "snippet_chars": null, "cursor": null }
+{ "query": "", "mode": "hybrid", "k": 10, "max_tokens": null, "snippet_chars": null, "cursor": null, "tags": null, "lang": null, "path_glob": null, "trust_min": null, "media": null, "ingested_after": null, "doc_id": null }
```
- `mode = "hybrid"` is the default-correct choice. Use `"vector"` for semantic-only ("docs about X concept"), `"lexical"` for exact strings ("the literal flag `--foo-bar`").
- **`max_tokens` / `snippet_chars` / `cursor` (p9-fb-34)** — agent budget controls. Set `max_tokens` to cap result wire size (chars/4 estimate); set `cursor` to the previous response's `next_cursor` to fetch the next page.
+- **p9-fb-36 filter inputs:** `tags` (string array — OR-within, AND across keys), `lang` (BCP-47 language code), `path_glob` (glob pattern matched against doc path), `trust_min` (`"primary"` | `"secondary"` | `"generated"` — includes that level and above), `media` (string array — IN-list of `"markdown"` | `"pdf"` | `"image"` | `"audio"` | `"other"`; alias `"md"` → `"markdown"`), `ingested_after` (RFC3339 UTC string), `doc_id` (exact doc UUID). AND combinator across keys. Invalid `ingested_after` or unknown `trust_min` → `error.v1.code = invalid_input`. Unknown `media` value → empty hits, no error.
- Output is `search_response.v1`: `{ hits: search_hit.v1[], next_cursor: string|null, truncated: bool }`. Iterate `response.hits[]` for individual hits. Key hit fields: `rank`, `score`, `doc_path`, `heading_path[]`, `section_label`, `snippet`, `citation` (line range / page), `chunk_id`.
- Cite back to the user as `doc_path § heading_path[-1]` so they can open the source.
- When `truncated: true`, the budget loop modified the page (snippet shortening or k reduction). `next_cursor` is **independent** — non-null whenever more hits may be reachable. Caller may widen `max_tokens` (re-issue same query for fuller snippets / more hits per page) or follow `next_cursor` (advance through more hits) or both. Mismatched cursor (corpus_revision changed) returns `error.v1.code = stale_cursor` — re-issue the search to obtain a fresh one.
diff --git a/tasks/INDEX.md b/tasks/INDEX.md
index fab95d7..db35a0b 100644
--- a/tasks/INDEX.md
+++ b/tasks/INDEX.md
@@ -124,7 +124,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
- [p9-fb-33 streaming ask (ndjson delta)](p9/p9-fb-33-streaming-ask.md) — ✅ 머지 + v0.5.0 cut 후보 (2026-05-09)
- [p9-fb-34 output budget controls](p9/p9-fb-34-output-budget-controls.md) — ✅ 머지 + v0.5.0 cut 후보 (2026-05-09)
- [p9-fb-35 verbatim fetch](p9/p9-fb-35-verbatim-fetch.md) — ✅ 머지 + v0.5.0 cut 후보 (2026-05-09)
- - [p9-fb-36 search filter args](p9/p9-fb-36-search-filters.md) — ⏳ 미구현, brainstorm 필요
+ - [p9-fb-36 search filter args](p9/p9-fb-36-search-filters.md) — ✅ 머지 (2026-05-10)
- [p9-fb-37 trace + stats](p9/p9-fb-37-trace-and-stats.md) — ⏳ 미구현, brainstorm 필요 (depends_on 27)
### 🎯 0.5.0 — RAG quality (cascade 동반: V00X + reindex)
diff --git a/tasks/p9/p9-fb-36-search-filters.md b/tasks/p9/p9-fb-36-search-filters.md
index a0007fe..3577755 100644
--- a/tasks/p9/p9-fb-36-search-filters.md
+++ b/tasks/p9/p9-fb-36-search-filters.md
@@ -3,7 +3,7 @@ phase: P9
component: kebab-cli + kebab-search + wire-schema
task_id: p9-fb-36
title: "Search filter args (--media / --ingested-after / --doc-id / --tag)"
-status: open
+status: completed
target_version: 0.4.0
depends_on: []
unblocks: []
@@ -14,7 +14,10 @@ source_feedback: 사용자 도그푸딩 2026-05-06 — agent 가 검색 범위
# p9-fb-36 — Search filter args
-> ⏳ **백로그 only — 미구현.** 본 spec 은 도그푸딩 피드백 skeleton. 구현 착수 전 [superpowers:brainstorming](../../docs/superpowers/) 으로 설계 단계 선행 필요. filter 종류 / SQLite 쿼리 통합 / Lance vector 필터 적용 layer brainstorm 후 확정.
+> ✅ **구현 완료.** 본 spec 은 구현 시점의 frozen 상태. post-merge deviation 은 [HOTFIXES.md](../HOTFIXES.md) 참조.
+
+상세 설계: `docs/superpowers/specs/2026-05-10-p9-fb-36-search-filters-design.md`.
+구현 계획: `docs/superpowers/plans/2026-05-10-p9-fb-36-search-filters.md`.
## 증상 / 동기
--
2.49.1
From 84287d0ef65374a15f28226f0f705baab9b47eff Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 04:47:55 +0900
Subject: [PATCH 11/11] fix(fb-36): address PR #127 round 1 review
- ingested_after: convert OffsetDateTime to UTC before formatting
so non-Z offsets compare correctly against UTC TEXT storage
(lexical.rs + filters.rs)
- README: --tag is repeatable-only, not csv (only --media is csv)
- test(cli): add multi-value --tag OR-within IN-list coverage
- test(store): add UTC-offset regression test for ingested_after
- mcp: use ERROR_V1_ID const instead of hardcoded "error.v1"
Co-Authored-By: Claude Opus 4.7 (1M context)
---
README.md | 2 +-
crates/kebab-cli/tests/wire_search_filters.rs | 80 +++++++++++++++++++
crates/kebab-mcp/src/tools/search.rs | 4 +-
crates/kebab-search/src/lexical.rs | 8 +-
crates/kebab-store-sqlite/src/filters.rs | 58 +++++++++++++-
5 files changed, 146 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 3c699f3..7697391 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ kebab doctor
|------|------|
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
| `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. |
-| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` / `--media` 는 각각 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). |
+| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 timestamp (offset 허용, UTC 로 변환 후 비교) — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). |
| `kebab list docs` | 색인된 문서 목록 |
| `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 |
| `kebab fetch chunk [--context N]` / `kebab fetch doc [--max-tokens N]` / `kebab fetch span [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. |
diff --git a/crates/kebab-cli/tests/wire_search_filters.rs b/crates/kebab-cli/tests/wire_search_filters.rs
index 6c68aef..71ba48c 100644
--- a/crates/kebab-cli/tests/wire_search_filters.rs
+++ b/crates/kebab-cli/tests/wire_search_filters.rs
@@ -224,3 +224,83 @@ fn search_with_tag_filter_matches_frontmatter_tags() {
);
}
}
+
+// ---------------------------------------------------------------------------
+// Test 5: --tag is repeatable (OR-within); two --tag values form an IN-list
+// ---------------------------------------------------------------------------
+
+#[test]
+fn search_with_two_tag_filters_returns_or_within_tags() {
+ // Two docs with different tag sets:
+ // a.md → tags: [rust]
+ // b.md → tags: [async]
+ // c.md → no tags (but same keyword in body)
+ // Search with --tag rust --tag async (OR within --tag).
+ // Expect a.md and b.md, not c.md.
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+
+ fs::write(
+ workspace.join("a.md"),
+ "---\ntags: [rust]\n---\n# A\n\nrust systems programming\n",
+ )
+ .unwrap();
+ fs::write(
+ workspace.join("b.md"),
+ "---\ntags: [async]\n---\n# B\n\nrust async programming\n",
+ )
+ .unwrap();
+ fs::write(workspace.join("c.md"), "# C\n\nrust programming\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ // Without filter: all three docs produce hits.
+ let (unfiltered, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "rust"],
+ );
+ let uresp: Value = serde_json::from_str(unfiltered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
+ let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
+ assert!(
+ uhits.len() >= 3,
+ "expected ≥3 hits before tag filter: {uresp}"
+ );
+
+ // With --tag rust --tag async: only a.md and b.md should appear.
+ let (filtered, _) = common::run_search_with_args(
+ &cfg,
+ &[
+ "--json", "--mode", "lexical",
+ "--tag", "rust",
+ "--tag", "async",
+ "rust",
+ ],
+ );
+ let fresp: Value = serde_json::from_str(filtered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (two-tag-filtered): {filtered:?}: {e}"));
+ let fhits = fresp["hits"].as_array().expect("filtered hits array");
+
+ assert!(
+ !fhits.is_empty(),
+ "--tag rust --tag async must return hits from tagged docs; got 0: {fresp}"
+ );
+
+ // c.md must not appear — it has no tags.
+ for hit in fhits {
+ let path = hit["doc_path"].as_str().unwrap_or("");
+ assert!(
+ path.ends_with("a.md") || path.ends_with("b.md"),
+ "--tag rust --tag async must only return a.md or b.md, got path={path}"
+ );
+ }
+
+ // Both a.md and b.md must appear (OR, not AND).
+ let paths: Vec<&str> = fhits
+ .iter()
+ .filter_map(|h| h["doc_path"].as_str())
+ .collect();
+ let has_a = paths.iter().any(|p| p.ends_with("a.md"));
+ let has_b = paths.iter().any(|p| p.ends_with("b.md"));
+ assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}");
+ assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}");
+}
diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs
index 2027024..74af6e9 100644
--- a/crates/kebab-mcp/src/tools/search.rs
+++ b/crates/kebab-mcp/src/tools/search.rs
@@ -12,6 +12,8 @@ use rmcp::model::CallToolResult;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
+use kebab_app::ERROR_V1_ID;
+
use crate::error::{to_tool_error, to_tool_success};
use crate::state::KebabAppState;
@@ -161,7 +163,7 @@ fn normalize_media_alias(s: &str) -> String {
fn invalid_input(msg: &str) -> CallToolResult {
use kebab_app::{ErrorV1, StructuredError};
let err = anyhow::Error::new(StructuredError(ErrorV1 {
- schema_version: "error.v1".to_string(),
+ schema_version: ERROR_V1_ID.to_string(),
code: "invalid_input".to_string(),
message: msg.to_string(),
details: serde_json::Value::Null,
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 871c22d..bfdd0f7 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -348,11 +348,15 @@ fn run_query(
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
- // fb-32 ingest path), so lexicographic >= compare is correct.
+ // fb-32 ingest path), so lexicographic >= compare is correct — but only
+ // when the filter instant is also formatted as UTC `Z`. A non-UTC offset
+ // (e.g. `+09:00`) carries local wall-clock digits, so the string compare
+ // orders by local time, not by instant. Convert to UTC before formatting.
if let Some(after) = &filters.ingested_after {
let formatted = after
+ .to_offset(time::UtcOffset::UTC)
.format(&time::format_description::well_known::Rfc3339)
- .expect("OffsetDateTime formats to RFC3339");
+ .expect("OffsetDateTime (UTC) formats to RFC3339");
sql.push_str(" AND d.updated_at >= ?");
params.push(Box::new(formatted));
}
diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs
index 4586236..9519879 100644
--- a/crates/kebab-store-sqlite/src/filters.rs
+++ b/crates/kebab-store-sqlite/src/filters.rs
@@ -155,11 +155,15 @@ impl SqliteStore {
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
- // lexicographic >= compare is correct.
+ // lexicographic >= compare is correct — but only when the filter
+ // instant is also formatted as UTC `Z`. A non-UTC offset (e.g.
+ // `+09:00`) carries local wall-clock digits, so the string compare
+ // orders by local time, not instant. Convert to UTC before formatting.
if let Some(after) = &filters.ingested_after {
let formatted = after
+ .to_offset(time::UtcOffset::UTC)
.format(&time::format_description::well_known::Rfc3339)
- .expect("OffsetDateTime formats to RFC3339");
+ .expect("OffsetDateTime (UTC) formats to RFC3339");
sql.push_str(" AND d.updated_at >= ?");
bind.push(Box::new(formatted));
}
@@ -666,4 +670,54 @@ mod tests {
.unwrap();
assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
}
+
+ #[test]
+ fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
+ // Regression test for the non-UTC offset lex-compare bug.
+ //
+ // Scenario (from PR #127 review):
+ // - doc stored at `2026-04-01T01:00:00Z`
+ // - filter: `2026-04-01T05:00:00+09:00` == `2026-03-31T20:00:00Z` instant
+ //
+ // The doc instant (01:00 UTC on Apr 1) is AFTER the filter instant
+ // (20:00 UTC on Mar 31), so the doc SHOULD match.
+ //
+ // Buggy code: formats `+09:00` as-is → lex compare
+ // `2026-04-01T01:00:00Z` vs `2026-04-01T05:00:00+09:00`
+ // `01` < `05` → doc dropped incorrectly.
+ //
+ // Fixed code: converts to UTC first → compares
+ // `2026-04-01T01:00:00Z` vs `2026-03-31T20:00:00Z`
+ // Apr 1 > Mar 31 → doc correctly included.
+ let tmp = TempDir::new().unwrap();
+ let store = open_store(&tmp);
+ let c1 = "11111111111111111111111111111111";
+ seed_committed_full(
+ &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
+ "doc.md", "en", &[], "primary",
+ r#""markdown""#,
+ "2026-04-01T01:00:00Z",
+ );
+
+ // Filter instant: 2026-04-01T05:00:00+09:00 == 2026-03-31T20:00:00 UTC.
+ // Doc (2026-04-01T01:00:00Z) is after the filter instant → should match.
+ let filter_instant = time::OffsetDateTime::parse(
+ "2026-04-01T05:00:00+09:00",
+ &time::format_description::well_known::Rfc3339,
+ )
+ .expect("valid RFC3339 with +09:00 offset");
+
+ let f = SearchFilters {
+ ingested_after: Some(filter_instant),
+ ..Default::default()
+ };
+ let out = store
+ .filter_chunks(&[cid(c1)], &f)
+ .unwrap();
+ assert_eq!(
+ out,
+ vec![cid(c1)],
+ "doc ingested at 01:00Z should match filter 05:00+09:00 (== 20:00Z previous day)"
+ );
+ }
}
--
2.49.1