From a58ee10dfbef260442f2f7df33b201b6d423bcef Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 15:42:59 +0000 Subject: [PATCH] fix(parse-pdf): strip Identity-H Unimplemented marker + dominance heuristic in compute_valid_char_ratio (Bug #6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why: metro-korea.pdf (Identity-H CID font without ToUnicode CMap) 의 ingest 가 pdf_ocr_pages=0 으로 잘못 종료. lopdf 0.32.0 의 emit `?Identity-H Unimplemented?` marker 26 ASCII char 가 is_valid_text_char() 의 0x0020..=0x007E range 통과 → ratio=1.0 → OCR fallback 0.5 threshold bypass. Change: MOJIBAKE_MARKERS const + compute_valid_char_ratio() 4-단계 (strip → trim-empty zero → dominance cap-0.3 → 기존 ratio). marker list extensible. is_valid_text_char() 본체 변경 0. Tests: +2 unit (dominance + minority) on top of 기존 8. parser_version / wire schema 변경 0. Refs: docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix2-spec.md §4.1 / §4.2 / §6 R-1. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-parse-pdf/src/text_quality.rs | 81 +++++++++++++++++++++- tasks/HOTFIXES.md | 7 ++ 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/crates/kebab-parse-pdf/src/text_quality.rs b/crates/kebab-parse-pdf/src/text_quality.rs index 6db900a..4996b08 100644 --- a/crates/kebab-parse-pdf/src/text_quality.rs +++ b/crates/kebab-parse-pdf/src/text_quality.rs @@ -4,16 +4,71 @@ // 의 empty vs mojibake (ToUnicode CMap 누락 PUA codepoint) 구분. // caller (kebab-app::pdf_ocr_apply) 가 threshold 와 비교. +// Source of truth: lopdf-0.32.0/src/document.rs:523 (Document::decode_text). +// Only one Unimplemented marker is emitted by lopdf 0.32.0; other CMap +// encodings fall through to `String::from_utf8_lossy(bytes)`, which yields +// PUA / replacement-char territory already covered by `pure_pua_zero`. +// Re-verify on lopdf dependency upgrade. +const MOJIBAKE_MARKERS: &[&str] = &[ + "?Identity-H Unimplemented?", +]; + /// Valid char ratio (0.0..=1.0). 
빈 string → 0.0. /// valid := ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation. pub fn compute_valid_char_ratio(s: &str) -> f32 { + // 1) Strip known mojibake markers before counting valid chars. + // Identity-H CID fonts without ToUnicode CMap emit ASCII-only marker + // substrings (bypassing PUA detection). + let mut cleaned: String = s.to_string(); + // `had_marker` guard preserves prior behavior for whitespace-only input + // (returns ratio of whitespace validity, not 0.0) when no markers found. + // With markers stripped, the guard enables the trim-empty check. + let mut had_marker = false; + for marker in MOJIBAKE_MARKERS { + if cleaned.contains(marker) { + had_marker = true; + cleaned = cleaned.replace(marker, ""); + } + } + // 2) Whitespace-only cleaned text → 0.0 (marker-only page). + if had_marker && cleaned.trim().is_empty() { + return 0.0; + } + // 3) Marker-dominance heuristic — when stripped chars exceed remaining + // chars (i.e. marker > 50% of original), the page is "mostly mojibake + // with some decodeable page-furniture" (e.g. metro-korea.pdf has + // header text in a separate font + body that is Identity-H CID). + // Force ratio downward to trigger OCR fallback (parent spec §1.3 intent). + if had_marker { + let stripped_chars = s.len().saturating_sub(cleaned.len()); + if stripped_chars > cleaned.len() { + // Marker dominates — cap ratio at 0.3 (below 0.5 OCR threshold). + // The 0.3 cap (not 0.0) preserves a small signal that some text + // WAS decodeable, useful for downstream metrics if ever exposed. + let mut total = 0u32; + let mut valid = 0u32; + for c in cleaned.chars() { + total += 1; + if is_valid_text_char(c) { + valid += 1; + } + } + let raw_ratio = if total == 0 { 0.0 } else { valid as f32 / total as f32 }; + return raw_ratio.min(0.3); + } + } + // 4) Otherwise compute ratio on cleaned text (existing logic). 
let mut total = 0u32; let mut valid = 0u32; - for c in s.chars() { + for c in cleaned.chars() { total += 1; - if is_valid_text_char(c) { valid += 1; } + if is_valid_text_char(c) { + valid += 1; + } + } + if total == 0 { + return 0.0; } - if total == 0 { return 0.0; } valid as f32 / total as f32 } @@ -100,4 +155,24 @@ mod tests { let r = compute_valid_char_ratio(&text); assert!(r < 0.5, "F4 mojibake fixture 의 valid_ratio < 0.5 (production OCR trigger threshold — got {r})"); } + + #[test] + fn identity_h_marker_dominance_caps_ratio_below_threshold() { + // metro-korea.pdf-class: 20× marker (560 char) + 11 char ASCII header. + // Without dominance heuristic: ratio = 11/11 = 1.0 (bypasses OCR). + // With dominance heuristic: ratio ≤ 0.3 (triggers OCR fallback). + let s = format!("Page 1 of 5 {}", "?Identity-H Unimplemented?".repeat(20)); + let r = compute_valid_char_ratio(&s); + assert!(r <= 0.3, "marker-dominant mixed page → ratio ≤ 0.3 (OCR fallback); got {r}"); + } + + #[test] + fn identity_h_marker_minority_with_long_valid_text_keeps_high_ratio() { + // Inverse case: short marker noise + long valid text → ratio stays high + // (no false OCR trigger on otherwise-good pages). + let header = "x".repeat(200); // 200 char valid ASCII + let s = format!("{header} ?Identity-H Unimplemented?"); // 1× marker = 26 char + let r = compute_valid_char_ratio(&s); + assert!(r > 0.9, "marker-minority page keeps high ratio; got {r}"); + } } diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index aa5c145..36648c4 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,13 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-27 — Identity-H mojibake marker bypassed OCR fallback (Bug #6) + +- **Symptom**: `metro-korea.pdf` (Identity-H CID font without ToUnicode CMap) 의 ingest 가 `pdf_ocr_pages=0` 으로 종료. 
text 전체가 `?Identity-H Unimplemented?` marker 1154회 반복 (lopdf 0.32.0 emit). text-detect ratio = 1.0 → OCR fallback threshold 0.5 bypass. +- **Root cause**: `crates/kebab-parse-pdf/src/text_quality.rs::compute_valid_char_ratio()` 의 `is_valid_text_char()` 가 ASCII printable range (0x0020..=0x007E) 를 unconditional valid 처리. marker (26 ASCII char) 는 valid 로 count. +- **Fix**: `MOJIBAKE_MARKERS` const 도입 + marker strip + after-strip 의 trim-empty → 0.0 + dominance heuristic (strip > 잔여 일 때 cap 0.3). spec ACCEPT: `docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix2-spec.md` §4.1. parser_version/wire schema 영향 0. +- **User action**: 이미 `metro-korea.pdf` class 의 mojibake-heavy PDF 를 v0.20.0 pre-bugfix2 binary 로 indexed 한 경우, `kebab ingest --force-reingest ` 로 cached skip 무효화 필요 (release notes 동등 안내). + ## 2026-05-27 — v0.20.0 sub-item 1: chunk_id `#c{char_start}` workaround collapses under aggressive overlap (Bug #3 second-iteration patch) **Symptom**: F2 (1580 chars OCR, scanned_page2.pdf) ingest 시