diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs index c53f245..4ae7e13 100644 --- a/crates/kebab-app/tests/pdf_pipeline.rs +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -165,7 +165,7 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { ); assert_eq!( pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()), - Some("pdf-page-v1") + Some("pdf-page-v1.1") ); // Inspect the stored doc to confirm SourceSpan::Page round-trip. @@ -365,7 +365,7 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() { assert_eq!( pdf_item.chunk_count, Some(2), - "pdf-page-v1 emits 0 chunks for the empty page; total = 2" + "pdf-page-v1.1 emits 0 chunks for the empty page; total = 2" ); let doc = kebab_app::inspect_doc_with_config( diff --git a/crates/kebab-chunk/src/pdf_page_v1.rs b/crates/kebab-chunk/src/pdf_page_v1.rs index 7935717..ef53af3 100644 --- a/crates/kebab-chunk/src/pdf_page_v1.rs +++ b/crates/kebab-chunk/src/pdf_page_v1.rs @@ -53,18 +53,21 @@ //! one chunk per atomic block. PdfPageV1 cannot. //! //! Workaround that doesn't change the §4.2 recipe: feed a per-chunk -//! variant `format!("{base_policy_hash}#c{char_start}")` into the -//! recipe's `policy_hash` slot (so distinct chunks distinguish via -//! different policy_hash inputs), while storing the unmodified -//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers -//! "what policy was active". Logged in `tasks/HOTFIXES.md`. +//! variant `format!("{base_policy_hash}#c{segment_start}")` into the +//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap +//! segment boundary, strictly increasing across the returned chunks +//! even when the overlap walk collapses `actual_start` to a previous +//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in +//! `Chunk.policy_hash` so the field still answers "what policy was +//! active". v1.1 second-iteration patch — logged in +//! `tasks/HOTFIXES.md` (2026-05-27). use kebab_core::{ Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, SourceSpan, id_for_chunk, }; -const VERSION_LABEL: &str = "pdf-page-v1"; +const VERSION_LABEL: &str = "pdf-page-v1.1"; const BYTES_PER_TOKEN: usize = 3; const POLICY_HASH_HEX_LEN: usize = 16; @@ -146,7 +149,7 @@ impl Chunker for PdfPageV1Chunker { continue; } - for (char_start, char_end, slice) in + for (segment_start, char_start, char_end, slice) in chunk_page(&p.text, target_bytes, overlap_bytes) { // PDF chars-per-page comfortably fits in u32 (a single @@ -164,10 +167,12 @@ impl Chunker for PdfPageV1Chunker { char_end: Some(char_end_u32), }; let block_ids: Vec = vec![p.common.block_id.clone()]; - // Per-chunk policy_hash variant prevents chunk_id - // collision when a page produces multiple chunks. See - // module docs for rationale. - let per_chunk_hash = format!("{base_policy_hash}#c{char_start}"); + // v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash + // variant uses `segment_start` (pre-overlap boundary, + // strictly increasing) instead of `char_start` (post- + // overlap, may collapse to prev_min). See module docs + + // spec §4.1 root cause + HOTFIXES.md 2026-05-27. + let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}"); let chunk_id = id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash); let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN); @@ -198,18 +203,24 @@ impl Chunker for PdfPageV1Chunker { } /// Split a single page's text into ordered chunks, each represented as -/// `(char_start, char_end, text_slice)`. Char positions are within the -/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`. +/// `(segment_start, actual_start, chunk_end, text_slice)`. +/// +/// - `segment_start` = pre-overlap segment boundary. Strictly increasing +/// across the returned vec. Use this for chunk_id uniqueness suffixes. +/// - `actual_start` = post-overlap start char index. May collapse to a +/// previous chunk's `actual_start` under aggressive overlap policy. +/// Use this for `SourceSpan::Page::char_start`. +/// - `chunk_end` = chunk's end char index (exclusive). /// /// Returns an empty vector when `text` is empty or whitespace-only. -fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> { +fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, usize, String)> { let chars: Vec = text.chars().collect(); let n = chars.len(); if n == 0 { return Vec::new(); } if text.len() <= target_bytes { - return vec![(0, n, text.to_string())]; + return vec![(0, 0, n, text.to_string())]; } // Build candidate boundary positions (char indices where a chunk @@ -239,7 +250,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi chars[a..b].iter().map(|c| c.len_utf8()).sum() }; - let mut chunks: Vec<(usize, usize, String)> = Vec::new(); + let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new(); let mut seg_idx: usize = 0; while seg_idx + 1 < bounds.len() { let start = bounds[seg_idx]; @@ -264,7 +275,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi // have absorbed up to `overlap_bytes` of bytes, but never past // the previous chunk's start (no full re-emission). let actual_start = if let Some(prev) = chunks.last() { - let prev_min = prev.0; + // prev tuple shape = (segment_start, actual_start, chunk_end, slice). + // overlap walk floor = previous chunk's actual_start (prev.1). + let prev_min = prev.1; let mut a = start; let mut acc_o: usize = 0; while a > prev_min { @@ -281,7 +294,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi }; let slice: String = chars[actual_start..chunk_end].iter().collect(); - chunks.push((actual_start, chunk_end, slice)); + chunks.push((start, actual_start, chunk_end, slice)); seg_idx = end_idx; } @@ -674,6 +687,43 @@ mod tests { assert_eq!(ids.len(), total, "chunk_ids must remain unique"); } + #[test] + fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() { + // 한국어 OCR text 의 trigger shape: 10 char "가" + ". " + 500 char "나". + // → first segment [0, 12), second segment [12, n). + // page_text byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 + // → multi-chunk. overlap_bytes = min(240, 750) = 240 chars=80 + // → second chunk 의 actual_start 가 prev_min=0 collapse → same `#c0`. + // + // default_policy(500, 80) — target_tokens=500 → target_bytes=500*3=1500 + // (한국어 3byte/char 환산), overlap_tokens=80 → overlap_bytes=min(240, 750)=240. + // verifier round 1 L-3 보강. + let early_seg = "가".repeat(10); + let tail = "나".repeat(500); + let page_text = format!("{early_seg}. {tail}"); + + let doc = make_pdf_doc(&[&page_text]); + let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte + let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap(); + + assert!( + chunks.len() >= 2, + "expected ≥2 chunks for {} byte page; got {}", + page_text.len(), + chunks.len() + ); + + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + ids.sort_unstable(); + let total = ids.len(); + ids.dedup(); + assert_eq!( + ids.len(), + total, + "all chunk_ids must be unique even when overlap walks actual_start back to prev_min" + ); + } + #[test] fn policy_hash_matches_md_heading_v1_for_identical_policy() { // Cross-chunker policy fingerprint identity — important so a diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 947aaf6..aa5c145 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,45 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-27 — v0.20.0 sub-item 1: chunk_id `#c{char_start}` workaround collapses under aggressive overlap (Bug #3 second-iteration patch) + +**Symptom**: F2 (1580 chars OCR, scanned_page2.pdf) ingest 시 +`DocumentStore::put_chunks (pdf): sqlite error: UNIQUE constraint +failed: chunks.chunk_id: ... Error code 1555: A PRIMARY KEY constraint +failed`. `kebab v0.20.0` (commit `b4d9e60`) dogfood (qwen2.5vl:3b 의 +`192.168.0.47:11434` Ollama endpoint, `/build/cache/tmp/v0.20-dogfood` +isolated KB) `--force-reingest` 마다 reproducible. + +**Root cause**: `crates/kebab-chunk/src/pdf_page_v1.rs:170` 의 +`per_chunk_hash = format!("{base_policy_hash}#c{char_start}")` 에서 +`char_start` = post-overlap `actual_start`. line 266-281 의 overlap +walk 가 `prev_min` floor 까지만 back-walk 하므로 aggressive overlap ++ 첫 segment 가 작은 page (F2 의 한국어 OCR text: 첫 ~10 char 안 +sentence-end → segment_1 = [0, 30], segment_2 = [30, n], overlap_bytes +240 / chars=80 → segment_2 의 actual_start 가 prev_min=0 으로 +collapse) → 두 chunk 의 `#c0` suffix identical → identical chunk_id → +`chunks` PRIMARY KEY violation. + +**Fix** (spec §4.4): `chunk_page` return tuple 에 `segment_start` +추가 (3-tuple → 4-tuple `(segment_start, actual_start, chunk_end, +slice)`), caller `per_chunk_hash` 의 suffix 를 `segment_start` 로 +변경. `segment_start` 는 `bounds[seg_idx]` (dedup 후 strictly +increasing) — overlap walk 와 무관하게 모든 chunk distinct. citation +locality 의 `SourceSpan::Page.char_start` 는 여전히 post-overlap +`actual_start` 유지. + +**chunker_version cascade**: `pdf-page-v1` → `pdf-page-v1.1` bump +(spec §4.4.1 round 1c M-1 결정, design §9 cascade rule 의 직접 적용). +multi-chunk PDF page (pre-OCR 시점 `metro-korea.pdf` 의 21 block / +34 chunk 같은 정상 path) 의 chunk_id 가 변경 — explicit user-facing +audit trail 확보, store layer 의 자동 invalidation report. v0.20.0 +force-update path 라 사용자 cost zero (어차피 fresh ingest). + +**Amends**: spec `docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md` +§4.4. parent design §4.2 chunk_id recipe 자체 unchanged (workaround +layer 의 internal computation 만 변경). parent PR #189 +(`feat/pdf-scanned-ocr`, force-update path). + ## 2026-05-26 — design deviation — kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates) **Symptom**: design deviation — post-PR9 audit (system-architect, `tasks/INDEX.md` L169) identified 두 crate (`kebab-normalize` + `kebab-parse-types`) 가 dead abstraction. design §3.7b 의 "thin layer" raison d'être ((a) `kebab-core` namespace 폭발 방지, (b) normalize 의 parser non-dependence) 가 4 parser 중 1개 (markdown) 만 lift 를 경유하는 현실에서 fan-in/fan-out 모두 1 → layer 의미 잃음. `kebab-parse-types` 의 production caller 가 2개 (`kebab-parse-md` + `kebab-normalize`) 이고 `kebab-normalize` 자체 caller 가 1개 (`kebab-app`) — 모두 markdown 의 lift 경로 안에서 단일 fan-in 경계 가능.