From 84287d0ef65374a15f28226f0f705baab9b47eff Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 04:47:55 +0900
Subject: [PATCH] fix(fb-36): address PR #127 round 1 review
- ingested_after: convert OffsetDateTime to UTC before formatting
so non-Z offsets compare correctly against UTC TEXT storage
(lexical.rs + filters.rs)
- README: --tag is repeatable-only, not csv (only --media is csv)
- test(cli): add multi-value --tag OR-within IN-list coverage
- test(store): add UTC-offset regression test for ingested_after
- mcp: use ERROR_V1_ID const instead of hardcoded "error.v1"
Co-Authored-By: Claude Opus 4.7 (1M context)
---
README.md | 2 +-
crates/kebab-cli/tests/wire_search_filters.rs | 80 +++++++++++++++++++
crates/kebab-mcp/src/tools/search.rs | 4 +-
crates/kebab-search/src/lexical.rs | 8 +-
crates/kebab-store-sqlite/src/filters.rs | 58 +++++++++++++-
5 files changed, 146 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 3c699f3..7697391 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ kebab doctor
|------|------|
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
| `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. |
-| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` / `--media` 는 각각 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). |
+| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). |
| `kebab list docs` | 색인된 문서 목록 |
| `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 |
| `kebab fetch chunk [--context N]` / `kebab fetch doc [--max-tokens N]` / `kebab fetch span [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. |
diff --git a/crates/kebab-cli/tests/wire_search_filters.rs b/crates/kebab-cli/tests/wire_search_filters.rs
index 6c68aef..71ba48c 100644
--- a/crates/kebab-cli/tests/wire_search_filters.rs
+++ b/crates/kebab-cli/tests/wire_search_filters.rs
@@ -224,3 +224,83 @@ fn search_with_tag_filter_matches_frontmatter_tags() {
);
}
}
+
+// ---------------------------------------------------------------------------
+// Test 5: --tag is repeatable (OR-within); two --tag values form an IN-list
+// ---------------------------------------------------------------------------
+
+#[test]
+fn search_with_two_tag_filters_returns_or_within_tags() {
+ // Two docs with different tag sets:
+ // a.md → tags: [rust]
+ // b.md → tags: [async]
+ // c.md → no tags (but same keyword in body)
+ // Search with --tag rust --tag async (OR within --tag).
+ // Expect a.md and b.md, not c.md.
+ let dir = tempfile::tempdir().unwrap();
+ let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
+
+ fs::write(
+ workspace.join("a.md"),
+ "---\ntags: [rust]\n---\n# A\n\nrust systems programming\n",
+ )
+ .unwrap();
+ fs::write(
+ workspace.join("b.md"),
+ "---\ntags: [async]\n---\n# B\n\nrust async programming\n",
+ )
+ .unwrap();
+ fs::write(workspace.join("c.md"), "# C\n\nrust programming\n").unwrap();
+ common::ingest(&cfg, &workspace);
+
+ // Without filter: all three docs produce hits.
+ let (unfiltered, _) = common::run_search_with_args(
+ &cfg,
+ &["--json", "--mode", "lexical", "rust"],
+ );
+ let uresp: Value = serde_json::from_str(unfiltered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
+ let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
+ assert!(
+ uhits.len() >= 3,
+ "expected ≥3 hits before tag filter: {uresp}"
+ );
+
+ // With --tag rust --tag async: only a.md and b.md should appear.
+ let (filtered, _) = common::run_search_with_args(
+ &cfg,
+ &[
+ "--json", "--mode", "lexical",
+ "--tag", "rust",
+ "--tag", "async",
+ "rust",
+ ],
+ );
+ let fresp: Value = serde_json::from_str(filtered.trim())
+ .unwrap_or_else(|e| panic!("not JSON (two-tag-filtered): {filtered:?}: {e}"));
+ let fhits = fresp["hits"].as_array().expect("filtered hits array");
+
+ assert!(
+ !fhits.is_empty(),
+ "--tag rust --tag async must return hits from tagged docs; got 0: {fresp}"
+ );
+
+ // c.md must not appear — it has no tags.
+ for hit in fhits {
+ let path = hit["doc_path"].as_str().unwrap_or("");
+ assert!(
+ path.ends_with("a.md") || path.ends_with("b.md"),
+ "--tag rust --tag async must only return a.md or b.md, got path={path}"
+ );
+ }
+
+ // Both a.md and b.md must appear (OR, not AND).
+ let paths: Vec<&str> = fhits
+ .iter()
+ .filter_map(|h| h["doc_path"].as_str())
+ .collect();
+ let has_a = paths.iter().any(|p| p.ends_with("a.md"));
+ let has_b = paths.iter().any(|p| p.ends_with("b.md"));
+ assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}");
+ assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}");
+}
diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs
index 2027024..74af6e9 100644
--- a/crates/kebab-mcp/src/tools/search.rs
+++ b/crates/kebab-mcp/src/tools/search.rs
@@ -12,6 +12,8 @@ use rmcp::model::CallToolResult;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
+use kebab_app::ERROR_V1_ID;
+
use crate::error::{to_tool_error, to_tool_success};
use crate::state::KebabAppState;
@@ -161,7 +163,7 @@ fn normalize_media_alias(s: &str) -> String {
fn invalid_input(msg: &str) -> CallToolResult {
use kebab_app::{ErrorV1, StructuredError};
let err = anyhow::Error::new(StructuredError(ErrorV1 {
- schema_version: "error.v1".to_string(),
+ schema_version: ERROR_V1_ID.to_string(),
code: "invalid_input".to_string(),
message: msg.to_string(),
details: serde_json::Value::Null,
diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs
index 871c22d..bfdd0f7 100644
--- a/crates/kebab-search/src/lexical.rs
+++ b/crates/kebab-search/src/lexical.rs
@@ -348,11 +348,15 @@ fn run_query(
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per
- // fb-32 ingest path), so lexicographic >= compare is correct.
+ // fb-32 ingest path), so lexicographic >= compare is correct — but only
+ // when the filter instant is also formatted as UTC `Z`. A non-UTC offset
+ // (e.g. `+09:00`) would compare as ASCII after `Z` (0x2B < 0x5A) and
+ // produce wrong results. Convert to UTC before formatting.
if let Some(after) = &filters.ingested_after {
let formatted = after
+ .to_offset(time::UtcOffset::UTC)
.format(&time::format_description::well_known::Rfc3339)
- .expect("OffsetDateTime formats to RFC3339");
+ .expect("OffsetDateTime (UTC) formats to RFC3339");
sql.push_str(" AND d.updated_at >= ?");
params.push(Box::new(formatted));
}
diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs
index 4586236..9519879 100644
--- a/crates/kebab-store-sqlite/src/filters.rs
+++ b/crates/kebab-store-sqlite/src/filters.rs
@@ -155,11 +155,15 @@ impl SqliteStore {
// p9-fb-36: ingested_after filter.
// `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32);
- // lexicographic >= compare is correct.
+ // lexicographic >= compare is correct — but only when the filter
+ // instant is also formatted as UTC `Z`. A non-UTC offset (e.g.
+ // `+09:00`) would compare as ASCII after `Z` (0x2B < 0x5A) and
+ // produce wrong results. Convert to UTC before formatting.
if let Some(after) = &filters.ingested_after {
let formatted = after
+ .to_offset(time::UtcOffset::UTC)
.format(&time::format_description::well_known::Rfc3339)
- .expect("OffsetDateTime formats to RFC3339");
+ .expect("OffsetDateTime (UTC) formats to RFC3339");
sql.push_str(" AND d.updated_at >= ?");
bind.push(Box::new(formatted));
}
@@ -666,4 +670,54 @@ mod tests {
.unwrap();
assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only");
}
+
+ #[test]
+ fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() {
+ // Regression test for the non-UTC offset lex-compare bug.
+ //
+ // Scenario (from PR #127 review):
+ // - doc stored at `2026-04-01T01:00:00Z`
+ // - filter: `2026-04-01T05:00:00+09:00` == `2026-03-31T20:00:00Z` instant
+ //
+ // The doc instant (01:00 UTC on Apr 1) is AFTER the filter instant
+ // (20:00 UTC on Mar 31), so the doc SHOULD match.
+ //
+ // Buggy code: formats `+09:00` as-is → lex compare
+ // `2026-04-01T01:00:00Z` vs `2026-04-01T05:00:00+09:00`
+ // `01` < `05` → doc dropped incorrectly.
+ //
+ // Fixed code: converts to UTC first → compares
+ // `2026-04-01T01:00:00Z` vs `2026-03-31T20:00:00Z`
+ // Apr 1 > Mar 31 → doc correctly included.
+ let tmp = TempDir::new().unwrap();
+ let store = open_store(&tmp);
+ let c1 = "11111111111111111111111111111111";
+ seed_committed_full(
+ &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1",
+ "doc.md", "en", &[], "primary",
+ r#""markdown""#,
+ "2026-04-01T01:00:00Z",
+ );
+
+ // Filter instant: 2026-04-01T05:00:00+09:00 == 2026-03-31T20:00:00 UTC.
+ // Doc (2026-04-01T01:00:00Z) is after the filter instant → should match.
+ let filter_instant = time::OffsetDateTime::parse(
+ "2026-04-01T05:00:00+09:00",
+ &time::format_description::well_known::Rfc3339,
+ )
+ .expect("valid RFC3339 with +09:00 offset");
+
+ let f = SearchFilters {
+ ingested_after: Some(filter_instant),
+ ..Default::default()
+ };
+ let out = store
+ .filter_chunks(&[cid(c1)], &f)
+ .unwrap();
+ assert_eq!(
+ out,
+ vec![cid(c1)],
+ "doc ingested at 01:00Z should match filter 05:00+09:00 (== 20:00Z previous day)"
+ );
+ }
}