From 84287d0ef65374a15f28226f0f705baab9b47eff Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sun, 10 May 2026 04:47:55 +0900 Subject: [PATCH] fix(fb-36): address PR #127 round 1 review - ingested_after: convert OffsetDateTime to UTC before formatting so non-Z offsets compare correctly against UTC TEXT storage (lexical.rs + filters.rs) - README: --tag is repeatable-only, not csv (only --media is csv) - test(cli): add multi-value --tag OR-within IN-list coverage - test(store): add UTC-offset regression test for ingested_after - mcp: use ERROR_V1_ID const instead of hardcoded "error.v1" Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- crates/kebab-cli/tests/wire_search_filters.rs | 80 +++++++++++++++++++ crates/kebab-mcp/src/tools/search.rs | 4 +- crates/kebab-search/src/lexical.rs | 8 +- crates/kebab-store-sqlite/src/filters.rs | 58 +++++++++++++- 5 files changed, 146 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3c699f3..7697391 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ kebab doctor |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | | `kebab ingest []` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. | -| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` / `--media` 는 각각 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). | +| `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | | `kebab fetch chunk [--context N]` / `kebab fetch doc [--max-tokens N]` / `kebab fetch span [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. | diff --git a/crates/kebab-cli/tests/wire_search_filters.rs b/crates/kebab-cli/tests/wire_search_filters.rs index 6c68aef..71ba48c 100644 --- a/crates/kebab-cli/tests/wire_search_filters.rs +++ b/crates/kebab-cli/tests/wire_search_filters.rs @@ -224,3 +224,83 @@ fn search_with_tag_filter_matches_frontmatter_tags() { ); } } + +// --------------------------------------------------------------------------- +// Test 5: --tag is repeatable (OR-within); two --tag values form an IN-list +// --------------------------------------------------------------------------- + +#[test] +fn search_with_two_tag_filters_returns_or_within_tags() { + // Two docs with different tag sets: + // a.md → tags: [rust] + // b.md → tags: [async] + // c.md → no tags (but same keyword in body) + // Search with --tag rust --tag async (OR within --tag). + // Expect a.md and b.md, not c.md. + let dir = tempfile::tempdir().unwrap(); + let (cfg, workspace, _data) = common::write_config(dir.path(), 30); + + fs::write( + workspace.join("a.md"), + "---\ntags: [rust]\n---\n# A\n\nrust systems programming\n", + ) + .unwrap(); + fs::write( + workspace.join("b.md"), + "---\ntags: [async]\n---\n# B\n\nrust async programming\n", + ) + .unwrap(); + fs::write(workspace.join("c.md"), "# C\n\nrust programming\n").unwrap(); + common::ingest(&cfg, &workspace); + + // Without filter: all three docs produce hits. + let (unfiltered, _) = common::run_search_with_args( + &cfg, + &["--json", "--mode", "lexical", "rust"], + ); + let uresp: Value = serde_json::from_str(unfiltered.trim()) + .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}")); + let uhits = uresp["hits"].as_array().expect("unfiltered hits array"); + assert!( + uhits.len() >= 3, + "expected ≥3 hits before tag filter: {uresp}" + ); + + // With --tag rust --tag async: only a.md and b.md should appear. + let (filtered, _) = common::run_search_with_args( + &cfg, + &[ + "--json", "--mode", "lexical", + "--tag", "rust", + "--tag", "async", + "rust", + ], + ); + let fresp: Value = serde_json::from_str(filtered.trim()) + .unwrap_or_else(|e| panic!("not JSON (two-tag-filtered): {filtered:?}: {e}")); + let fhits = fresp["hits"].as_array().expect("filtered hits array"); + + assert!( + !fhits.is_empty(), + "--tag rust --tag async must return hits from tagged docs; got 0: {fresp}" + ); + + // c.md must not appear — it has no tags. + for hit in fhits { + let path = hit["doc_path"].as_str().unwrap_or(""); + assert!( + path.ends_with("a.md") || path.ends_with("b.md"), + "--tag rust --tag async must only return a.md or b.md, got path={path}" + ); + } + + // Both a.md and b.md must appear (OR, not AND). + let paths: Vec<&str> = fhits + .iter() + .filter_map(|h| h["doc_path"].as_str()) + .collect(); + let has_a = paths.iter().any(|p| p.ends_with("a.md")); + let has_b = paths.iter().any(|p| p.ends_with("b.md")); + assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}"); + assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}"); +} diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs index 2027024..74af6e9 100644 --- a/crates/kebab-mcp/src/tools/search.rs +++ b/crates/kebab-mcp/src/tools/search.rs @@ -12,6 +12,8 @@ use rmcp::model::CallToolResult; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use kebab_app::ERROR_V1_ID; + use crate::error::{to_tool_error, to_tool_success}; use crate::state::KebabAppState; @@ -161,7 +163,7 @@ fn normalize_media_alias(s: &str) -> String { fn invalid_input(msg: &str) -> CallToolResult { use kebab_app::{ErrorV1, StructuredError}; let err = anyhow::Error::new(StructuredError(ErrorV1 { - schema_version: "error.v1".to_string(), + schema_version: ERROR_V1_ID.to_string(), code: "invalid_input".to_string(), message: msg.to_string(), details: serde_json::Value::Null, diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 871c22d..bfdd0f7 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -348,11 +348,15 @@ fn run_query( // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 stored as TEXT (always UTC `Z` per - // fb-32 ingest path), so lexicographic >= compare is correct. + // fb-32 ingest path), so lexicographic >= compare is correct — but only + // when the filter instant is also formatted as UTC `Z`. A non-UTC offset + // (e.g. `+09:00`) would compare as ASCII after `Z` (0x2B < 0x5A) and + // produce wrong results. Convert to UTC before formatting. if let Some(after) = &filters.ingested_after { let formatted = after + .to_offset(time::UtcOffset::UTC) .format(&time::format_description::well_known::Rfc3339) - .expect("OffsetDateTime formats to RFC3339"); + .expect("OffsetDateTime (UTC) formats to RFC3339"); sql.push_str(" AND d.updated_at >= ?"); params.push(Box::new(formatted)); } diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs index 4586236..9519879 100644 --- a/crates/kebab-store-sqlite/src/filters.rs +++ b/crates/kebab-store-sqlite/src/filters.rs @@ -155,11 +155,15 @@ impl SqliteStore { // p9-fb-36: ingested_after filter. // `documents.updated_at` is RFC3339 TEXT (UTC `Z` per fb-32); - // lexicographic >= compare is correct. + // lexicographic >= compare is correct — but only when the filter + // instant is also formatted as UTC `Z`. A non-UTC offset (e.g. + // `+09:00`) would compare as ASCII after `Z` (0x2B < 0x5A) and + // produce wrong results. Convert to UTC before formatting. if let Some(after) = &filters.ingested_after { let formatted = after + .to_offset(time::UtcOffset::UTC) .format(&time::format_description::well_known::Rfc3339) - .expect("OffsetDateTime formats to RFC3339"); + .expect("OffsetDateTime (UTC) formats to RFC3339"); sql.push_str(" AND d.updated_at >= ?"); bind.push(Box::new(formatted)); } @@ -666,4 +670,54 @@ mod tests { .unwrap(); assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only"); } + + #[test] + fn filter_chunks_ingested_after_non_utc_offset_compares_as_instant() { + // Regression test for the non-UTC offset lex-compare bug. + // + // Scenario (from PR #127 review): + // - doc stored at `2026-04-01T01:00:00Z` + // - filter: `2026-04-01T05:00:00+09:00` == `2026-03-31T20:00:00Z` instant + // + // The doc instant (01:00 UTC on Apr 1) is AFTER the filter instant + // (20:00 UTC on Mar 31), so the doc SHOULD match. + // + // Buggy code: formats `+09:00` as-is → lex compare + // `2026-04-01T01:00:00Z` vs `2026-04-01T05:00:00+09:00` + // `01` < `05` → doc dropped incorrectly. + // + // Fixed code: converts to UTC first → compares + // `2026-04-01T01:00:00Z` vs `2026-03-31T20:00:00Z` + // Apr 1 > Mar 31 → doc correctly included. + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + seed_committed_full( + &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "doc.md", "en", &[], "primary", + r#""markdown""#, + "2026-04-01T01:00:00Z", + ); + + // Filter instant: 2026-04-01T05:00:00+09:00 == 2026-03-31T20:00:00 UTC. + // Doc (2026-04-01T01:00:00Z) is after the filter instant → should match. + let filter_instant = time::OffsetDateTime::parse( + "2026-04-01T05:00:00+09:00", + &time::format_description::well_known::Rfc3339, + ) + .expect("valid RFC3339 with +09:00 offset"); + + let f = SearchFilters { + ingested_after: Some(filter_instant), + ..Default::default() + }; + let out = store + .filter_chunks(&[cid(c1)], &f) + .unwrap(); + assert_eq!( + out, + vec![cid(c1)], + "doc ingested at 01:00Z should match filter 05:00+09:00 (== 20:00Z previous day)" + ); + } }