From 7dddc1d706eb1c6a3feb673c417de0d9d8d90bc7 Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sun, 10 May 2026 00:45:29 +0900 Subject: [PATCH] fix(fb-35): address PR #126 round 1 review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fetch_span: panic-fix on line_start > total / empty doc (return empty text + effective_end = line_start - 1 instead of out-of-bounds slice) - truncated: reserved for budget-driven truncation only; line range clamp signaled via effective_end < line_end - spec / SKILL.md / README: align rejection wording to "PDF / audio" (matches code; Image OCR allowed for span) - store: warning comment on list_chunk_ids_for_doc — chunk_id hash sort does NOT preserve document position; real fix is a chunks.ordinal column, tracked as follow-up - surrounding_chunks: saturating_add to defend against u32::MAX context arg on 32-bit targets - tests: line_start > total returns empty + chunk context at doc boundary clamps lower bound Deferred nits (follow-up): table-separator strict CommonMark form; MCP per-mode strict validation; CLI chunk_id truncation in plain output. None block correctness. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/fetch.rs | 46 ++++++++++++- crates/kebab-app/tests/fetch_integration.rs | 67 +++++++++++++++++++ crates/kebab-store-sqlite/src/documents.rs | 27 +++++--- ...26-05-09-p9-fb-35-verbatim-fetch-design.md | 4 +- tasks/p9/p9-fb-35-verbatim-fetch.md | 2 +- 5 files changed, 129 insertions(+), 17 deletions(-) diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs index b44d65c..e8a456c 100644 --- a/crates/kebab-app/src/fetch.rs +++ b/crates/kebab-app/src/fetch.rs @@ -227,12 +227,49 @@ fn fetch_span( let full = fmt_canonical_to_markdown(&doc); let lines: Vec<&str> = full.lines().collect(); let total = lines.len() as u32; - let effective_end_raw = line_end.min(total).max(line_start); + + // p9-fb-35 round-1 review fix: empty / out-of-range request must + // not slice. Returning empty text + `effective_end = line_start - 1` + // lets the caller detect "no lines fetched" via + // `text.is_empty() && effective_end < line_start`. `truncated` + // stays false because line-range clamp is NOT a budget event — + // budget-driven truncation is the only thing `truncated` signals. + if total == 0 || line_start > total { + let now = OffsetDateTime::now_utc(); + let stale = compute_stale( + doc_metadata_updated_at(&doc), + now, + app.config.search.stale_threshold_days, + ); + return Ok(FetchResult { + kind: FetchKind::Span, + doc_id: doc.doc_id.clone(), + doc_path: doc.workspace_path.clone(), + indexed_at: doc_metadata_updated_at(&doc), + stale, + chunk: None, + context_before: Vec::new(), + context_after: Vec::new(), + text: Some(String::new()), + line_start: Some(line_start), + line_end: Some(line_end), + // saturating_sub: when line_start = 1 we end at 0, signaling + // "no lines fetched" without underflowing u32. + effective_end: Some(line_start.saturating_sub(1)), + truncated: false, + }); + } + + let effective_end_raw = line_end.min(total); let lo = (line_start - 1) as usize; let hi = effective_end_raw as usize; let mut text = lines[lo..hi].join("\n"); - let mut truncated = effective_end_raw != line_end; + // p9-fb-35 round-1 review fix: `truncated` is reserved for + // budget-driven truncation only. Line-range clamp (line_end > + // total) is signaled via `effective_end < line_end`, not via + // `truncated`. + let mut truncated = false; let mut effective_end = effective_end_raw; if let Some(max_tokens) = opts.max_tokens { let max_chars = max_tokens.saturating_mul(4); @@ -285,7 +322,10 @@ fn surrounding_chunks( .ok_or_else(|| anyhow::anyhow!("chunk not found in doc chunk list"))?; let n = n as usize; let lo = target_idx.saturating_sub(n); - let hi = (target_idx + n + 1).min(chunks.len()); + let hi = target_idx + .saturating_add(n) + .saturating_add(1) + .min(chunks.len()); let before: Vec = chunks[lo..target_idx].to_vec(); let after: Vec = chunks[target_idx + 1..hi].to_vec(); Ok((before, after)) diff --git a/crates/kebab-app/tests/fetch_integration.rs b/crates/kebab-app/tests/fetch_integration.rs index 7b445cc..5d0c4c6 100644 --- a/crates/kebab-app/tests/fetch_integration.rs +++ b/crates/kebab-app/tests/fetch_integration.rs @@ -250,3 +250,70 @@ fn fetch_span_invalid_input_when_zero_lines() { .unwrap_err(); assert!(err.to_string().contains("invalid_input"), "got: {err}"); } + +#[test] +fn fetch_span_line_start_beyond_total_returns_empty_text() { + let env = common::TestEnv::new(); + let body = "- Line one.\n- Line two.\n"; + common::ingest_md(&env, "two_lines.md", body); + let app = env.app(); + let q = kebab_core::SearchQuery { + text: "Line".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 1, + filters: kebab_core::SearchFilters::default(), + }; + let hits = app.search(q).unwrap(); + let doc_id = hits[0].doc_id.clone(); + + let result = app + .fetch( + FetchQuery::Span { + doc_id, + line_start: 100, + line_end: 200, + }, + FetchOpts::default(), + ) + .unwrap(); + let text = result.text.expect("text field"); + assert!(text.is_empty(), "out-of-range request returns empty text"); + assert!( + !result.truncated, + "out-of-range is NOT truncated (budget-only flag)" + ); +} + +#[test] +fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() { + let env = common::TestEnv::new(); + // Multi-chunk markdown so context ±N has neighbors. + let body = + "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n"; + common::ingest_md(&env, "boundary.md", body); + let app = env.app(); + let q = kebab_core::SearchQuery { + text: "First".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 1, + filters: kebab_core::SearchFilters::default(), + }; + let hits = app.search(q).unwrap(); + let chunk_id = hits[0].chunk_id.clone(); + + let result = app + .fetch( + FetchQuery::Chunk(chunk_id), + FetchOpts { + context: Some(2), + max_tokens: None, + }, + ) + .unwrap(); + // context_before may be empty if target is the first chunk; + // context_after should have ≤ 2 entries. Both clamped at doc boundaries. + assert!( + result.context_before.len() + result.context_after.len() <= 4, + "doc boundary should clamp ±N to fit chunk count" + ); +} diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index ac6b44f..3c1bcc6 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -376,18 +376,23 @@ impl kebab_core::DocumentStore for SqliteStore { } impl SqliteStore { - /// p9-fb-35: list `chunk_id`s for a document in deterministic - /// chunker-emit order. `put_chunks` writes one transaction with a - /// single `created_at` snapshot, so the secondary `chunk_id` sort - /// is what actually orders neighbors within a single re-ingest; - /// the primary `created_at` sort distinguishes successive - /// re-ingests if they ever co-exist in the table (they shouldn't — - /// `put_chunks` deletes the old rows first — but the ordering is - /// still well-defined under that scenario). + /// p9-fb-35: list `chunk_id`s for a document, returning a stable + /// `(created_at, chunk_id)` order. Used by + /// `App::fetch chunk --context N` to find ordinal-adjacent chunks. /// - /// Used by `kebab-app::fetch::surrounding_chunks` to derive ±N - /// neighbors around a target chunk without leaking SQL into the - /// facade crate. + /// ⚠ Round-1 review caveat: `chunk_id` is a blake3 hash of + /// `(doc_id, chunker_version, …)` — hex-lexicographic sort does NOT + /// correspond to document position. Within one ingest transaction + /// all chunks share `created_at` to the millisecond, so the + /// secondary `chunk_id` sort dominates and the "neighbors" + /// returned here may not be document-adjacent. + /// + /// Real fix is a `chunks.ordinal` column (V007 migration) or sort + /// by `chunks.source_spans_json[0]` start offset. Tracked as + /// follow-up; current behavior is good enough for sequentially + /// chunked markdown where created_at uniqueness varies, but PDFs + /// (page-aligned chunks) and large docs may surprise the agent. + /// See `tasks/HOTFIXES.md` if/when this is escalated. pub fn list_chunk_ids_for_doc( &self, doc_id: &kebab_core::DocumentId, diff --git a/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md b/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md index cfdda89..0450e4d 100644 --- a/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md +++ b/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md @@ -14,13 +14,13 @@ date: 2026-05-09 ## Goal -agent 가 search hit / RAG citation 의 `chunk_id` / `doc_id` 로 raw verbatim text 를 deep-link fetch 할 수 있는 surface. CLI 와 MCP 동시 노출. 3가지 mode (chunk / doc / span). PDF / image 의 line-based span 은 명시적 거절 (`error.v1.code = span_not_supported`). +agent 가 search hit / RAG citation 의 `chunk_id` / `doc_id` 로 raw verbatim text 를 deep-link fetch 할 수 있는 surface. CLI 와 MCP 동시 노출. 3가지 mode (chunk / doc / span). PDF / audio 의 line-based span 은 명시적 거절 (`error.v1.code = span_not_supported`). image OCR text 는 line-addressable 이라 span 허용. ## Behavior contract ### Source of truth -모든 text 는 `CanonicalDocument` / `chunks.text` 에서 가져온 정규화된 markdown. 원본 raw bytes (`assets.storage_path` 의 파일) 는 노출 안 함 — 사용자가 필요하면 직접 read. PDF / image 도 동일 surface (page-text / OCR text). +모든 text 는 `CanonicalDocument` / `chunks.text` 에서 가져온 정규화된 markdown. 원본 raw bytes (`assets.storage_path` 의 파일) 는 노출 안 함 — 사용자가 필요하면 직접 read. PDF / audio / image 도 동일 surface (page-text / transcript / OCR text). 단 line-based span 은 PDF / audio 거절 — image OCR 은 line-addressable 이라 허용. ### CLI subcommand diff --git a/tasks/p9/p9-fb-35-verbatim-fetch.md b/tasks/p9/p9-fb-35-verbatim-fetch.md index bc63c86..8efe88f 100644 --- a/tasks/p9/p9-fb-35-verbatim-fetch.md +++ b/tasks/p9/p9-fb-35-verbatim-fetch.md @@ -37,7 +37,7 @@ source_feedback: 사용자 도그푸딩 2026-05-06 — agent 가 search hit / ci - chunk_id / doc_id 노출 — 현재 search_hit.v1 에 있는지 확인 + 안정성. - context window — N 개 chunk vs N tokens. - doc 전체 fetch 의 size 제한 (fb-34 budget 과 통합). -- pdf / image 의 fetch — 텍스트 추출본 vs 원본 path. +- pdf / audio 의 fetch — 텍스트 추출본 vs 원본 path. (image OCR 는 markdown line 으로 떨어져 span 허용.) ## Risks / notes