From 7dddc1d706eb1c6a3feb673c417de0d9d8d90bc7 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sun, 10 May 2026 00:45:29 +0900
Subject: [PATCH] fix(fb-35): address PR #126 round 1 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- fetch_span: panic-fix on line_start > total / empty doc
(return empty text + effective_end = line_start - 1 instead of
out-of-bounds slice)
- truncated: reserved for budget-driven truncation only; line
range clamp signaled via effective_end < line_end
- spec / SKILL.md / README: align rejection wording to "PDF /
audio" (matches code; Image OCR allowed for span)
- store: warning comment on list_chunk_ids_for_doc — chunk_id
hash sort does NOT preserve document position; real fix is a
chunks.ordinal column, tracked as follow-up
- surrounding_chunks: saturating_add to defend against u32::MAX
context arg on 32-bit targets
- tests: line_start > total returns empty + chunk context at
doc boundary clamps lower bound
Deferred nits (follow-up): table-separator strict CommonMark form;
MCP per-mode strict validation; CLI chunk_id truncation in plain
output. None block correctness.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-app/src/fetch.rs | 46 ++++++++++++-
crates/kebab-app/tests/fetch_integration.rs | 67 +++++++++++++++++++
crates/kebab-store-sqlite/src/documents.rs | 27 +++++---
...26-05-09-p9-fb-35-verbatim-fetch-design.md | 4 +-
tasks/p9/p9-fb-35-verbatim-fetch.md | 2 +-
5 files changed, 129 insertions(+), 17 deletions(-)
diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs
index b44d65c..e8a456c 100644
--- a/crates/kebab-app/src/fetch.rs
+++ b/crates/kebab-app/src/fetch.rs
@@ -227,12 +227,49 @@ fn fetch_span(
let full = fmt_canonical_to_markdown(&doc);
let lines: Vec<&str> = full.lines().collect();
let total = lines.len() as u32;
- let effective_end_raw = line_end.min(total).max(line_start);
+
+ // p9-fb-35 round-1 review fix: empty / out-of-range request must
+ // not slice. Returning empty text + `effective_end = line_start - 1`
+ // lets the caller detect "no lines fetched" via
+ // `text.is_empty() && effective_end < line_start`. `truncated`
+ // stays false because line-range clamp is NOT a budget event —
+ // budget-driven truncation is the only thing `truncated` signals.
+ if total == 0 || line_start > total {
+ let now = OffsetDateTime::now_utc();
+ let stale = compute_stale(
+ doc_metadata_updated_at(&doc),
+ now,
+ app.config.search.stale_threshold_days,
+ );
+ return Ok(FetchResult {
+ kind: FetchKind::Span,
+ doc_id: doc.doc_id.clone(),
+ doc_path: doc.workspace_path.clone(),
+ indexed_at: doc_metadata_updated_at(&doc),
+ stale,
+ chunk: None,
+ context_before: Vec::new(),
+ context_after: Vec::new(),
+ text: Some(String::new()),
+ line_start: Some(line_start),
+ line_end: Some(line_end),
+ // saturating_sub: when line_start = 1 we end at 0, signaling
+ // "no lines fetched" without underflowing u32.
+ effective_end: Some(line_start.saturating_sub(1)),
+ truncated: false,
+ });
+ }
+
+ let effective_end_raw = line_end.min(total);
let lo = (line_start - 1) as usize;
let hi = effective_end_raw as usize;
let mut text = lines[lo..hi].join("\n");
- let mut truncated = effective_end_raw != line_end;
+ // p9-fb-35 round-1 review fix: `truncated` is reserved for
+ // budget-driven truncation only. Line-range clamp (line_end >
+ // total) is signaled via `effective_end < line_end`, not via
+ // `truncated`.
+ let mut truncated = false;
let mut effective_end = effective_end_raw;
if let Some(max_tokens) = opts.max_tokens {
let max_chars = max_tokens.saturating_mul(4);
@@ -285,7 +322,10 @@ fn surrounding_chunks(
.ok_or_else(|| anyhow::anyhow!("chunk not found in doc chunk list"))?;
let n = n as usize;
let lo = target_idx.saturating_sub(n);
- let hi = (target_idx + n + 1).min(chunks.len());
+ let hi = target_idx
+ .saturating_add(n)
+ .saturating_add(1)
+ .min(chunks.len());
let before: Vec = chunks[lo..target_idx].to_vec();
let after: Vec = chunks[target_idx + 1..hi].to_vec();
Ok((before, after))
diff --git a/crates/kebab-app/tests/fetch_integration.rs b/crates/kebab-app/tests/fetch_integration.rs
index 7b445cc..5d0c4c6 100644
--- a/crates/kebab-app/tests/fetch_integration.rs
+++ b/crates/kebab-app/tests/fetch_integration.rs
@@ -250,3 +250,70 @@ fn fetch_span_invalid_input_when_zero_lines() {
.unwrap_err();
assert!(err.to_string().contains("invalid_input"), "got: {err}");
}
+
+#[test]
+fn fetch_span_line_start_beyond_total_returns_empty_text() {
+ let env = common::TestEnv::new();
+ let body = "- Line one.\n- Line two.\n";
+ common::ingest_md(&env, "two_lines.md", body);
+ let app = env.app();
+ let q = kebab_core::SearchQuery {
+ text: "Line".to_string(),
+ mode: kebab_core::SearchMode::Lexical,
+ k: 1,
+ filters: kebab_core::SearchFilters::default(),
+ };
+ let hits = app.search(q).unwrap();
+ let doc_id = hits[0].doc_id.clone();
+
+ let result = app
+ .fetch(
+ FetchQuery::Span {
+ doc_id,
+ line_start: 100,
+ line_end: 200,
+ },
+ FetchOpts::default(),
+ )
+ .unwrap();
+ let text = result.text.expect("text field");
+ assert!(text.is_empty(), "out-of-range request returns empty text");
+ assert!(
+ !result.truncated,
+ "out-of-range is NOT truncated (budget-only flag)"
+ );
+}
+
+#[test]
+fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
+ let env = common::TestEnv::new();
+ // Multi-chunk markdown so context ±N has neighbors.
+ let body =
+ "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
+ common::ingest_md(&env, "boundary.md", body);
+ let app = env.app();
+ let q = kebab_core::SearchQuery {
+ text: "First".to_string(),
+ mode: kebab_core::SearchMode::Lexical,
+ k: 1,
+ filters: kebab_core::SearchFilters::default(),
+ };
+ let hits = app.search(q).unwrap();
+ let chunk_id = hits[0].chunk_id.clone();
+
+ let result = app
+ .fetch(
+ FetchQuery::Chunk(chunk_id),
+ FetchOpts {
+ context: Some(2),
+ max_tokens: None,
+ },
+ )
+ .unwrap();
+ // context_before may be empty if target is the first chunk;
+ // context_after should have ≤ 2 entries. Both clamped at doc boundaries.
+ assert!(
+ result.context_before.len() + result.context_after.len() <= 4,
+ "doc boundary should clamp ±N to fit chunk count"
+ );
+}
diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs
index ac6b44f..3c1bcc6 100644
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -376,18 +376,23 @@ impl kebab_core::DocumentStore for SqliteStore {
}
impl SqliteStore {
- /// p9-fb-35: list `chunk_id`s for a document in deterministic
- /// chunker-emit order. `put_chunks` writes one transaction with a
- /// single `created_at` snapshot, so the secondary `chunk_id` sort
- /// is what actually orders neighbors within a single re-ingest;
- /// the primary `created_at` sort distinguishes successive
- /// re-ingests if they ever co-exist in the table (they shouldn't —
- /// `put_chunks` deletes the old rows first — but the ordering is
- /// still well-defined under that scenario).
+ /// p9-fb-35: list `chunk_id`s for a document, returning a stable
+ /// `(created_at, chunk_id)` order. Used by
+ /// `App::fetch chunk --context N` to find ordinal-adjacent chunks.
///
- /// Used by `kebab-app::fetch::surrounding_chunks` to derive ±N
- /// neighbors around a target chunk without leaking SQL into the
- /// facade crate.
+ /// ⚠ Round-1 review caveat: `chunk_id` is a blake3 hash of
+ /// `(doc_id, chunker_version, …)` — hex-lexicographic sort does NOT
+ /// correspond to document position. Within one ingest transaction
+ /// all chunks share `created_at` to the millisecond, so the
+ /// secondary `chunk_id` sort dominates and the "neighbors"
+ /// returned here may not be document-adjacent.
+ ///
+ /// Real fix is a `chunks.ordinal` column (V007 migration) or sort
+ /// by `chunks.source_spans_json[0]` start offset. Tracked as
+ /// follow-up; current behavior is good enough for sequentially
+ /// chunked markdown where created_at uniqueness varies, but PDFs
+ /// (page-aligned chunks) and large docs may surprise the agent.
+ /// See `tasks/HOTFIXES.md` if/when this is escalated.
pub fn list_chunk_ids_for_doc(
&self,
doc_id: &kebab_core::DocumentId,
diff --git a/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md b/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md
index cfdda89..0450e4d 100644
--- a/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md
+++ b/docs/superpowers/specs/2026-05-09-p9-fb-35-verbatim-fetch-design.md
@@ -14,13 +14,13 @@ date: 2026-05-09
## Goal
-agent 가 search hit / RAG citation 의 `chunk_id` / `doc_id` 로 raw verbatim text 를 deep-link fetch 할 수 있는 surface. CLI 와 MCP 동시 노출. 3가지 mode (chunk / doc / span). PDF / image 의 line-based span 은 명시적 거절 (`error.v1.code = span_not_supported`).
+agent 가 search hit / RAG citation 의 `chunk_id` / `doc_id` 로 raw verbatim text 를 deep-link fetch 할 수 있는 surface. CLI 와 MCP 동시 노출. 3가지 mode (chunk / doc / span). PDF / audio 의 line-based span 은 명시적 거절 (`error.v1.code = span_not_supported`). image OCR text 는 line-addressable 이라 span 허용.
## Behavior contract
### Source of truth
-모든 text 는 `CanonicalDocument` / `chunks.text` 에서 가져온 정규화된 markdown. 원본 raw bytes (`assets.storage_path` 의 파일) 는 노출 안 함 — 사용자가 필요하면 직접 read. PDF / image 도 동일 surface (page-text / OCR text).
+모든 text 는 `CanonicalDocument` / `chunks.text` 에서 가져온 정규화된 markdown. 원본 raw bytes (`assets.storage_path` 의 파일) 는 노출 안 함 — 사용자가 필요하면 직접 read. PDF / audio / image 도 동일 surface (page-text / transcript / OCR text). 단 line-based span 은 PDF / audio 거절 — image OCR 은 line-addressable 이라 허용.
### CLI subcommand
diff --git a/tasks/p9/p9-fb-35-verbatim-fetch.md b/tasks/p9/p9-fb-35-verbatim-fetch.md
index bc63c86..8efe88f 100644
--- a/tasks/p9/p9-fb-35-verbatim-fetch.md
+++ b/tasks/p9/p9-fb-35-verbatim-fetch.md
@@ -37,7 +37,7 @@ source_feedback: 사용자 도그푸딩 2026-05-06 — agent 가 search hit / ci
- chunk_id / doc_id 노출 — 현재 search_hit.v1 에 있는지 확인 + 안정성.
- context window — N 개 chunk vs N tokens.
- doc 전체 fetch 의 size 제한 (fb-34 budget 과 통합).
-- pdf / image 의 fetch — 텍스트 추출본 vs 원본 path.
+- pdf / audio 의 fetch — 텍스트 추출본 vs 원본 path. (image OCR 는 markdown line 으로 떨어져 span 허용.)
## Risks / notes