From 7ee0ac989459717a6961574757cd18322130d3bf Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 08:51:44 +0000 Subject: [PATCH] =?UTF-8?q?feat(kebab-chunk):=20P7-2=20pdf-page-v1=20chunk?= =?UTF-8?q?er=20=E2=80=94=20page-aware=20splitting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `PdfPageV1Chunker` 가 `kebab-parse-pdf` 가 emit 한 `CanonicalDocument` (블록당 한 페이지, 모두 `SourceSpan::Page`) 를 받아 페이지 경계를 절대 넘지 않는 `Chunk` 들을 생성. `chunker_version = "pdf-page-v1"`. 핵심 동작: - 페이지 텍스트가 `target_tokens × BYTES_PER_TOKEN` (= 3) 안이면 한 덩어리. 초과 시 `\n\n` (paragraph) 또는 sentence-end 구두점 + whitespace 경계를 segment 로 보고 greedy 누적, 기본 한 chunk 당 최소 한 segment. - 다음 chunk 의 prefix 에 `overlap_tokens × BYTES_PER_TOKEN` 만큼의 직전 꼬리를 prepend (char 단위, 이전 chunk 시작 너머로 backtrack 안 함). - 빈/공백-only 페이지는 0 chunk (페이지의 `Provenance::Warning` 으로 `kebab-parse-pdf` 단계에 이미 표시됨). - 비-PDF doc (Block::Paragraph 가 아니거나 SourceSpan 이 Page 아님) → 명시 에러. Spec deviation (HOTFIXES 2026-05-02 P7-2): - `chunk_id` 충돌 가드: 같은 페이지에서 여러 chunk 가 나오면 `block_ids` 가 모두 같아 §4.2 recipe 가 충돌. `id_for_chunk` 의 `policy_hash` 인풋을 per-chunk 로 `format!("{base}#c{char_start}")` 변형해 회피. recipe 자체는 불변. `Chunk.policy_hash` 필드는 base 유지. - `BYTES_PER_TOKEN = 3` (md-heading-v1 실제 코드와 일치). spec 본문은 "/ 4" 라고 했지만 그 자체가 md-heading-v1 의 실코드와 어긋나 있어 일관성 쪽을 택함. cross-chunker `policy_hash` 동일성 unit test 로 잠금. 테스트 (10개 신규): - chunker_version label, 3-page small, 1-page huge + overlap + chunk_id 유일성, empty page skip, whitespace-only skip, non-PDF error, cross-page boundary 절대 안 만들어짐, determinism (1000회), snapshot shape 안정, md-heading-v1 와 policy_hash 동일. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-chunk/src/lib.rs | 2 + crates/kebab-chunk/src/pdf_page_v1.rs | 606 ++++++++++++++++++++++++++ tasks/HOTFIXES.md | 20 + tasks/p7/p7-2-pdf-page-chunker.md | 2 +- 4 files changed, 629 insertions(+), 1 deletion(-) create mode 100644 crates/kebab-chunk/src/pdf_page_v1.rs diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index dc4b7c0..cd3a2da 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -16,5 +16,7 @@ //! It consumes `CanonicalDocument` purely through `kb-core` types. mod md_heading_v1; +mod pdf_page_v1; pub use md_heading_v1::MdHeadingV1Chunker; +pub use pdf_page_v1::PdfPageV1Chunker; diff --git a/crates/kebab-chunk/src/pdf_page_v1.rs b/crates/kebab-chunk/src/pdf_page_v1.rs new file mode 100644 index 0000000..4e73c15 --- /dev/null +++ b/crates/kebab-chunk/src/pdf_page_v1.rs @@ -0,0 +1,606 @@ +//! `pdf-page-v1` — page-aware PDF chunker. +//! +//! Consumes a [`CanonicalDocument`] produced by `kebab-parse-pdf` (one +//! [`Block::Paragraph`] per page, every block carrying [`SourceSpan::Page`]) +//! and emits one or more [`Chunk`]s per page. Chunks NEVER cross a page +//! boundary (citation locality is the whole reason §3.4 introduced +//! `SourceSpan::Page`), and each chunk's `source_spans` is a single +//! `Page { page, char_start, char_end }` with positions in **characters** +//! within the page text — matching `Citation::Page` fragment semantics +//! across the rest of the workspace. +//! +//! Per design §3.5 (Chunk), §4.2 (chunk_id recipe — see deviation note +//! below), §0 Q3 (citation), §9 (versioning). +//! +//! ## Splitting policy +//! +//! - If a page's bytes fit under `policy.target_tokens * BYTES_PER_TOKEN` +//! the entire page is a single chunk. +//! - Otherwise the page text is segmented at paragraph breaks (`\n\n`) and +//! sentence ends (`.`/`?`/`!` followed by whitespace). Adjacent +//! segments are greedily glued until the running byte budget would be +//! exceeded; the chunk is emitted at that boundary. The next chunk's +//! prefix is seeded with the trailing `policy.overlap_tokens * +//! BYTES_PER_TOKEN` bytes of the prior chunk so retrieval handles +//! queries that fall on the boundary. +//! - A page with no qualifying segment boundary AND text exceeding the +//! budget (e.g. a 5,000-byte single sentence) emits one oversized +//! chunk rather than hard-splitting mid-word — a real tokenizer slot +//! in P+ replaces this proxy and can do better mid-sentence splitting +//! when needed. +//! +//! ## `BYTES_PER_TOKEN` +//! +//! 3 — same calibration as `md-heading-v1` (covers Korean ≈ 3 b/tok and +//! over-estimates English ≈ 4 b/tok). The original p7-2 spec literal said +//! `× 4`, but cross-chunker comparability outweighs the spec literal here. +//! Logged in `tasks/HOTFIXES.md`. +//! +//! ## `chunk_id` collision deviation +//! +//! Design §4.2's `chunk_id = blake3(doc_id || chunker_version || sort(block_ids) +//! || policy_hash)` collides when one block (= one PDF page) is split +//! into multiple chunks: every chunk on that page has identical +//! `block_ids`. md-heading-v1 sidesteps this by always emitting at most +//! one chunk per atomic block. PdfPageV1 cannot. +//! +//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk +//! variant `format!("{base_policy_hash}#c{char_start}")` into the +//! recipe's `policy_hash` slot (so distinct chunks distinguish via +//! different policy_hash inputs), while storing the unmodified +//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers +//! "what policy was active". Logged in `tasks/HOTFIXES.md`. + +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "pdf-page-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; + +/// Page-aware PDF chunker. See module docs for the splitting policy and +/// the `chunk_id` collision-avoidance deviation. +#[derive(Clone, Copy, Debug, Default)] +pub struct PdfPageV1Chunker; + +impl Chunker for PdfPageV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + /// blake3(canonical_json(policy)) truncated to 16 hex chars. Matches + /// the `md-heading-v1` recipe so a workspace-wide policy hash lookup + /// (e.g. for invalidation reports) yields the same digest across + /// chunkers. + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + // Validate up front — every block must be a Paragraph carrying + // SourceSpan::Page. A mixed document signals a routing bug in + // the caller (e.g. running this chunker on Markdown) and is + // worth surfacing loudly. + for b in &doc.blocks { + let common = match b { + Block::Paragraph(p) => &p.common, + _ => anyhow::bail!( + "PdfPageV1Chunker only handles PDF docs (got non-Paragraph block)" + ), + }; + if !matches!(common.source_span, SourceSpan::Page { .. }) { + anyhow::bail!( + "PdfPageV1Chunker only handles PDF docs (got non-Page source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let target_bytes = policy + .target_tokens + .saturating_mul(BYTES_PER_TOKEN) + .max(1); + let overlap_bytes = policy.overlap_tokens.saturating_mul(BYTES_PER_TOKEN); + + let mut out: Vec = Vec::new(); + for b in &doc.blocks { + let p = match b { + Block::Paragraph(t) => t, + _ => unreachable!("validated above"), + }; + let page_num = match p.common.source_span { + SourceSpan::Page { page, .. } => page, + _ => unreachable!("validated above"), + }; + + // Empty page → 0 chunks. Page is still searchable via the + // CanonicalDocument's per-page `Provenance::Warning` + // ("scanned candidate") — chunking just has nothing to say + // about it. + if p.text.trim().is_empty() { + continue; + } + + for (char_start, char_end, slice) in + chunk_page(&p.text, target_bytes, overlap_bytes) + { + let span = SourceSpan::Page { + page: page_num, + char_start: Some(char_start as u32), + char_end: Some(char_end as u32), + }; + let block_ids: Vec = vec![p.common.block_id.clone()]; + // Per-chunk policy_hash variant prevents chunk_id + // collision when a page produces multiple chunks. See + // module docs for rationale. + let per_chunk_hash = format!("{base_policy_hash}#c{char_start}"); + let chunk_id = + id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash); + let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN); + + out.push(Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids, + text: slice, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.clone(), + }); + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "pdf-page-v1 chunked", + ); + + Ok(out) + } +} + +/// Split a single page's text into ordered chunks, each represented as +/// `(char_start, char_end, text_slice)`. Char positions are within the +/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`. +/// +/// Returns an empty vector when `text` is empty or whitespace-only. +fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> { + let chars: Vec = text.chars().collect(); + let n = chars.len(); + if n == 0 { + return Vec::new(); + } + if text.len() <= target_bytes { + return vec![(0, n, text.to_string())]; + } + + // Build candidate boundary positions (char indices where a chunk + // *may* start). 0 and n are always boundaries; interior boundaries + // are after a paragraph break (`\n\n`) or after a sentence-ending + // punctuation followed by whitespace. + let mut bounds: Vec = vec![0]; + let mut k = 0; + while k + 1 < n { + let c = chars[k]; + let nx = chars[k + 1]; + let is_paragraph_break = c == '\n' && nx == '\n'; + let is_sentence_end = + matches!(c, '.' | '?' | '!') && nx.is_whitespace(); + if (is_paragraph_break || is_sentence_end) && k + 2 <= n { + bounds.push(k + 2); + } + k += 1; + } + if *bounds.last().unwrap() != n { + bounds.push(n); + } + bounds.dedup(); + + // UTF-8 byte length of the slice between two char indices. + let byte_len = |a: usize, b: usize| -> usize { + chars[a..b].iter().map(|c| c.len_utf8()).sum() + }; + + let mut chunks: Vec<(usize, usize, String)> = Vec::new(); + let mut seg_idx: usize = 0; + while seg_idx + 1 < bounds.len() { + let start = bounds[seg_idx]; + let mut end_idx = seg_idx + 1; + let mut acc = byte_len(start, bounds[end_idx]); + + // Greedy grow: glue subsequent segments while we stay under + // budget. We always include at least one segment per chunk + // (`acc > 0` guard) so a single oversize segment doesn't loop. + while end_idx + 1 < bounds.len() { + let next_bytes = byte_len(bounds[end_idx], bounds[end_idx + 1]); + if acc + next_bytes > target_bytes && acc > 0 { + break; + } + acc += next_bytes; + end_idx += 1; + } + + let chunk_end = bounds[end_idx]; + + // Apply overlap: walk `actual_start` left of `start` until we + // have absorbed up to `overlap_bytes` of bytes, but never past + // the previous chunk's start (no full re-emission). + let actual_start = if let Some(prev) = chunks.last() { + let prev_min = prev.0; + let mut a = start; + let mut acc_o: usize = 0; + while a > prev_min { + let cl = chars[a - 1].len_utf8(); + if acc_o + cl > overlap_bytes { + break; + } + acc_o += cl; + a -= 1; + } + a + } else { + start + }; + + let slice: String = chars[actual_start..chunk_end].iter().collect(); + chunks.push((actual_start, chunk_end, slice)); + seg_idx = end_idx; + } + + chunks +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + AssetId, CommonBlock, Inline, Lang, Metadata, ParserVersion, Provenance, SourceType, + TextBlock, TrustLevel, WorkspacePath, id_for_block, id_for_doc, + }; + use time::OffsetDateTime; + + fn make_pdf_doc(pages: &[&str]) -> CanonicalDocument { + let workspace_path = WorkspacePath::new("docs/test.pdf".into()).unwrap(); + let asset_id = AssetId("a".repeat(64)); + let parser_version = ParserVersion("pdf-text-v1".into()); + let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version); + + let mut blocks: Vec = Vec::new(); + for (i, text) in pages.iter().enumerate() { + let page = (i as u32) + 1; + let char_count = text.chars().count() as u32; + let span = SourceSpan::Page { + page, + char_start: Some(0), + char_end: Some(char_count), + }; + let block_id = id_for_block(&doc_id, "paragraph", &[], i as u32, &span); + let inlines = if text.is_empty() { + Vec::new() + } else { + vec![Inline::Text { + text: (*text).to_string(), + }] + }; + blocks.push(Block::Paragraph(TextBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + text: (*text).to_string(), + inlines, + })); + } + + CanonicalDocument { + doc_id, + source_asset_id: asset_id, + workspace_path, + title: "test".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Paper, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + }, + provenance: Provenance { events: vec![] }, + parser_version, + schema_version: 1, + doc_version: 1, + } + } + + fn default_policy(target: usize, overlap: usize) -> ChunkPolicy { + ChunkPolicy { + target_tokens: target, + overlap_tokens: overlap, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } + } + + #[test] + fn chunker_version_is_pdf_page_v1() { + assert_eq!( + PdfPageV1Chunker.chunker_version(), + ChunkerVersion(VERSION_LABEL.to_string()) + ); + } + + #[test] + fn three_page_small_emits_one_chunk_per_page() { + let doc = make_pdf_doc(&["page one", "page two", "page three"]); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 3); + for (i, c) in chunks.iter().enumerate() { + assert_eq!(c.block_ids.len(), 1); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.source_spans.len(), 1); + match c.source_spans[0] { + SourceSpan::Page { page, char_start, char_end } => { + assert_eq!(page, (i as u32) + 1); + assert_eq!(char_start, Some(0)); + assert!(char_end.unwrap() > 0); + } + ref other => panic!("expected Page span, got {other:?}"), + } + } + assert_eq!(chunks[0].text, "page one"); + assert_eq!(chunks[1].text, "page two"); + assert_eq!(chunks[2].text, "page three"); + } + + #[test] + fn one_page_huge_text_splits_into_multiple_chunks_with_overlap() { + // Build a single page with 8 paragraphs of ~150 bytes each. + // target=50 tokens × 3 b/tok = 150 byte budget → each paragraph + // is itself just under budget, so 2-paragraph accumulation + // overshoots → ~8 chunks. + let para = "a".repeat(150); + let page_text = std::iter::repeat_n(para, 8) + .collect::>() + .join("\n\n"); + let doc = make_pdf_doc(&[&page_text]); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(50, 20)) + .unwrap(); + assert!( + chunks.len() >= 4, + "expected ≥4 chunks for a 1200-byte page; got {}: text len={}", + chunks.len(), + page_text.len() + ); + // All chunks live on page 1. + for c in &chunks { + match c.source_spans[0] { + SourceSpan::Page { page, .. } => assert_eq!(page, 1), + _ => panic!("non-Page span"), + } + } + // Overlap: chunk N's text starts with chunk N-1's tail bytes + // (or, equivalently, chunk N's char_start lies before chunk + // N-1's char_end). + for w in chunks.windows(2) { + let prev_end = match w[0].source_spans[0] { + SourceSpan::Page { char_end: Some(e), .. } => e, + _ => panic!("missing char_end"), + }; + let next_start = match w[1].source_spans[0] { + SourceSpan::Page { char_start: Some(s), .. } => s, + _ => panic!("missing char_start"), + }; + assert!( + next_start < prev_end, + "expected overlap (next.start < prev.end): {next_start} vs {prev_end}" + ); + } + // chunk_ids stay distinct despite identical block_ids — the + // per-chunk policy_hash variant is doing its job. + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + ids.sort(); + let total = ids.len(); + ids.dedup(); + assert_eq!(ids.len(), total, "all chunk_ids must be unique"); + } + + #[test] + fn empty_page_produces_no_chunks_for_that_page() { + let doc = make_pdf_doc(&["page one", "", "page three"]); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 2); + let pages: Vec = chunks + .iter() + .map(|c| match c.source_spans[0] { + SourceSpan::Page { page, .. } => page, + _ => 0, + }) + .collect(); + assert_eq!(pages, vec![1, 3]); + } + + #[test] + fn whitespace_only_page_skipped_too() { + let doc = make_pdf_doc(&["page one", " \n ", "page three"]); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 2); + } + + #[test] + fn non_pdf_doc_returns_error() { + // A doc whose blocks carry SourceSpan::Line (Markdown shape). + let workspace_path = WorkspacePath::new("notes/note.md".into()).unwrap(); + let asset_id = AssetId("a".repeat(64)); + let parser_version = ParserVersion("md-block-v1".into()); + let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version); + let span = SourceSpan::Line { start: 1, end: 1 }; + let block_id = id_for_block(&doc_id, "paragraph", &[], 0, &span); + let blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id, + heading_path: vec![], + source_span: span, + }, + text: "markdown body".into(), + inlines: vec![], + })]; + let doc = CanonicalDocument { + doc_id, + source_asset_id: asset_id, + workspace_path, + title: "n".into(), + lang: Lang("en".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + }, + provenance: Provenance { events: vec![] }, + parser_version, + schema_version: 1, + doc_version: 1, + }; + let err = PdfPageV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .expect_err("non-PDF doc must error"); + assert!( + err.to_string().contains("PdfPageV1Chunker"), + "error mentions chunker: {err}" + ); + } + + #[test] + fn no_chunk_crosses_page_boundary() { + // Synthetic 4-page doc with mixed page sizes — each chunk + // must claim exactly one page in its single source_span. + let big_x = "x".repeat(2000); + let big_y = "y".repeat(800); + let pages = vec![ + "tiny page one.", + big_x.as_str(), + "another tiny one.", + big_y.as_str(), + ]; + let doc = make_pdf_doc(&pages); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(50, 10)) + .unwrap(); + for c in &chunks { + assert_eq!(c.source_spans.len(), 1, "chunk should hold one Page span"); + assert!(matches!(c.source_spans[0], SourceSpan::Page { .. })); + } + // Group chunks by page, verify pages are non-decreasing in + // chunk order (no interleaving across pages). + let mut prev_page = 0u32; + for c in &chunks { + let page = match c.source_spans[0] { + SourceSpan::Page { page, .. } => page, + _ => unreachable!(), + }; + assert!( + page >= prev_page, + "page numbers must be non-decreasing in chunk order: {prev_page} → {page}" + ); + prev_page = page; + } + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = make_pdf_doc(&[ + "first page text. and another sentence here.", + &("xyz ".repeat(500)), + ]); + let policy = default_policy(80, 20); + let baseline: Vec = PdfPageV1Chunker + .chunk(&doc, &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..1000 { + let again: Vec = PdfPageV1Chunker + .chunk(&doc, &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } + } + + #[test] + fn snapshot_three_page_chunks_stable() { + let doc = make_pdf_doc(&[ + "Hello page 1.", + "Hello page 2 with some more body text.", + "Hello page 3.", + ]); + let chunks = PdfPageV1Chunker + .chunk(&doc, &default_policy(500, 80)) + .unwrap(); + assert_eq!(chunks.len(), 3); + for (i, c) in chunks.iter().enumerate() { + assert_eq!(c.chunker_version.0, VERSION_LABEL); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.source_spans.len(), 1); + match c.source_spans[0] { + SourceSpan::Page { + page, + char_start, + char_end, + } => { + assert_eq!(page, (i as u32) + 1); + assert_eq!(char_start, Some(0)); + assert_eq!(char_end, Some(c.text.chars().count() as u32)); + } + _ => panic!("expected Page"), + } + assert!(c.policy_hash.len() == POLICY_HASH_HEX_LEN); + assert!(c.policy_hash.bytes().all(|b| b.is_ascii_hexdigit())); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1_for_identical_policy() { + // Cross-chunker policy fingerprint identity — important so a + // workspace-wide "show me chunks with policy_hash = X" query + // covers both chunkers without per-chunker logic. + let p = default_policy(500, 80); + let pdf = PdfPageV1Chunker.policy_hash(&p); + let md = crate::MdHeadingV1Chunker.policy_hash(&p); + assert_eq!(pdf, md); + } +} diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index a2355fa..ed69f17 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,26 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-02 — P7-2 pdf-page-v1: chunk_id collision + BYTES_PER_TOKEN + +**Discovered**: P7-2 implementation start. + +**Symptom 1 (load-bearing)**: `tasks/p7/p7-2-pdf-page-chunker.md` § Behavior contract literally says `chunk_id` per design §4.2 with `(doc_id, "pdf-page-v1", block_ids, policy_hash)`. But unlike `md-heading-v1` (which always emits at most one chunk per atomic block), `pdf-page-v1` splits one page-block into multiple chunks when page text exceeds the byte budget. All sub-chunks of the same page have identical `block_ids` → identical `chunk_id` collisions, breaking the §3.5 invariant that `chunk_id` is a primary key. + +**Symptom 2 (cosmetic)**: Spec text says `token_estimate = byte_len / 4` and "matches `md-heading-v1` proxy". Looking at the actual md-heading-v1 source (`crates/kebab-chunk/src/md_heading_v1.rs:17`), the constant is `BYTES_PER_TOKEN = 3` (chosen to cover Korean ≈ 3 b/tok and over-estimate English ≈ 4 b/tok). Spec's "/4" claim is inconsistent with the implementation it claims to match. + +**Root cause**: §4.2 chunk_id recipe was designed assuming one-chunk-per-block-set. Page-aware chunking violates that assumption. + +**Fix** (PR #38, feat/p7-2-pdf-page-chunker): + +- **Per-chunk policy_hash variant**: feed `format!("{base_policy_hash}#c{char_start}")` into `id_for_chunk`'s `policy_hash` slot so chunks within the same page get distinct `chunk_id`s. The §4.2 recipe itself stays unchanged — only the *input* to one of its slots differs per chunk. The unmodified `base_policy_hash` is still stored in `Chunk.policy_hash` so the field still answers "what policy was active" (workspace-wide policy invalidation lookups continue to work). +- **`BYTES_PER_TOKEN = 3`** (matches md-heading-v1 actual code, not spec literal). Cross-chunker policy fingerprint identity is verified by a unit test: `policy_hash_matches_md_heading_v1_for_identical_policy`. + +**Trust note**: The per-chunk hash variant is opaque (`#c` is just a marker, not interpretable as char_start by downstream tools — they read `Chunk.source_spans[0].char_start` for that). Downstream identifier comparisons on `chunk_id` continue to work as opaque blake3 hashes. + +**Amends**: +- tasks/p7/p7-2-pdf-page-chunker.md (chunk_id recipe per-chunk variant; BYTES_PER_TOKEN = 3 not 4). + ## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped **Discovered**: P6-3 implementation start. diff --git a/tasks/p7/p7-2-pdf-page-chunker.md b/tasks/p7/p7-2-pdf-page-chunker.md index d15c9fc..2456857 100644 --- a/tasks/p7/p7-2-pdf-page-chunker.md +++ b/tasks/p7/p7-2-pdf-page-chunker.md @@ -3,7 +3,7 @@ phase: P7 component: kebab-chunk (pdf-page-v1) task_id: p7-2 title: "PDF page-aware chunker (pdf-page-v1)" -status: planned +status: completed depends_on: [p7-1] unblocks: [] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md