// crates/kebab-parse-pdf/src/text_quality.rs (신규) // // Per-page text quality metric — vector PDF 의 valid text vs scanned PDF // 의 empty vs mojibake (ToUnicode CMap 누락 PUA codepoint) 구분. // caller (kebab-app::pdf_ocr_apply) 가 threshold 와 비교. // Source of truth: lopdf-0.32.0/src/document.rs:523 (Document::decode_text). // Only one Unimplemented marker is emitted by lopdf 0.32.0; other CMap // encodings fall through to `String::from_utf8_lossy(bytes)`, which yields // PUA / replacement-char territory already covered by `pure_pua_zero`. // Re-verify on lopdf dependency upgrade. const MOJIBAKE_MARKERS: &[&str] = &[ "?Identity-H Unimplemented?", ]; /// Valid char ratio (0.0..=1.0). 빈 string → 0.0. /// valid := ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation. pub fn compute_valid_char_ratio(s: &str) -> f32 { // 1) Strip known mojibake markers before counting valid chars. // Identity-H CID fonts without ToUnicode CMap emit ASCII-only marker // substrings (bypassing PUA detection). let mut cleaned: String = s.to_string(); // `had_marker` guard preserves prior behavior for whitespace-only input // (returns ratio of whitespace validity, not 0.0) when no markers found. // With markers stripped, the guard enables the trim-empty check. let mut had_marker = false; for marker in MOJIBAKE_MARKERS { if cleaned.contains(marker) { had_marker = true; cleaned = cleaned.replace(marker, ""); } } // 2) Whitespace-only cleaned text → 0.0 (marker-only page). if had_marker && cleaned.trim().is_empty() { return 0.0; } // 3) Marker-dominance heuristic — when stripped chars exceed remaining // chars (i.e. marker > 50% of original), the page is "mostly mojibake // with some decodeable page-furniture" (e.g. metro-korea.pdf has // header text in a separate font + body that is Identity-H CID). // Force ratio downward to trigger OCR fallback (parent spec §1.3 intent). if had_marker { let stripped_chars = s.len().saturating_sub(cleaned.len()); if stripped_chars > cleaned.len() { // Marker dominates — cap ratio at 0.3 (below 0.5 OCR threshold). // The 0.3 cap (not 0.0) preserves a small signal that some text // WAS decodeable, useful for downstream metrics if ever exposed. let mut total = 0u32; let mut valid = 0u32; for c in cleaned.chars() { total += 1; if is_valid_text_char(c) { valid += 1; } } let raw_ratio = if total == 0 { 0.0 } else { valid as f32 / total as f32 }; return raw_ratio.min(0.3); } } // 4) Otherwise compute ratio on cleaned text (existing logic). let mut total = 0u32; let mut valid = 0u32; for c in cleaned.chars() { total += 1; if is_valid_text_char(c) { valid += 1; } } if total == 0 { return 0.0; } valid as f32 / total as f32 } fn is_valid_text_char(c: char) -> bool { let cp = c as u32; match cp { 0x0009 | 0x000A | 0x000D => true, // tab / LF / CR 0x0020..=0x007E => true, // ASCII printable 0x00A0..=0x024F => true, // Latin-1 Supplement + Latin Extended-A/B 0x1100..=0x11FF => true, // Hangul Jamo 0x3130..=0x318F => true, // Hangul Compatibility Jamo 0x4E00..=0x9FFF => true, // CJK Unified Ideographs 0xAC00..=0xD7A3 => true, // Hangul Syllables 0x2010..=0x205F => matches!(c, '\u{2010}' | '\u{2013}' | '\u{2014}' | '\u{2015}' | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{2026}' | '\u{2027}' | '\u{2032}' | '\u{2033}' | '\u{00B7}'), _ => false, } } #[cfg(test)] mod tests { use super::*; #[test] fn empty_string_zero() { assert_eq!(compute_valid_char_ratio(""), 0.0); } #[test] fn pure_ascii_one() { let r = compute_valid_char_ratio("Hello, World! 12345."); assert!((r - 1.0).abs() < 1e-6, "got {r}"); } #[test] fn pure_hangul_syllables_one() { let r = compute_valid_char_ratio("안녕하세요 한글 테스트"); assert!((r - 1.0).abs() < 1e-6, "got {r}"); } #[test] fn pure_pua_zero() { // Private Use Area codepoints — mojibake 의 patten. // U+E000..U+F8FF 가 valid char list 에 없음. let s: String = (0xE000u32..0xE010).map(|c| char::from_u32(c).unwrap()).collect(); let r = compute_valid_char_ratio(&s); assert_eq!(r, 0.0); } #[test] fn mixed_half() { // 5 valid ASCII + 5 PUA → 0.5 let mut s = String::from("ABCDE"); for c in 0xE000u32..0xE005 { s.push(char::from_u32(c).unwrap()); } let r = compute_valid_char_ratio(&s); assert!((r - 0.5).abs() < 1e-6, "got {r}"); } #[test] fn cjk_ideograph_valid() { let r = compute_valid_char_ratio("漢字大韓民國"); assert!((r - 1.0).abs() < 1e-6, "got {r}"); } #[test] fn hangul_jamo_valid() { let r = compute_valid_char_ratio("\u{1100}\u{1161}"); // Jamo ㄱㅏ assert!((r - 1.0).abs() < 1e-6, "got {r}"); } // F4 measurement: pikepdf-fixed fixture (Bug #4). Pages tree 복원 후 lopdf 가 // page 1 을 로드하고 CID 2-byte code 를 fallback decode → 일부 Latin 범위 // codepoint 와 충돌 → ratio ≈ 0.375 (non-zero 이지만 production // valid_ratio_threshold=0.5 미만). OCR trigger 조건 valid. #[test] fn f4_fixture_ratio_under_threshold() { use lopdf::Document; let bytes = include_bytes!("../tests/fixtures/mojibake.pdf"); let doc = Document::load_mem(bytes).unwrap(); let text = doc.extract_text(&[1]).unwrap_or_default(); let r = compute_valid_char_ratio(&text); assert!(r < 0.5, "F4 mojibake fixture 의 valid_ratio < 0.5 (production OCR trigger threshold — got {r})"); } #[test] fn identity_h_marker_dominance_caps_ratio_below_threshold() { // metro-korea.pdf-class: 20× marker (560 char) + 11 char ASCII header. // Without dominance heuristic: ratio = 11/11 = 1.0 (bypasses OCR). // With dominance heuristic: ratio ≤ 0.3 (triggers OCR fallback). let s = format!("Page 1 of 5 {}", "?Identity-H Unimplemented?".repeat(20)); let r = compute_valid_char_ratio(&s); assert!(r <= 0.3, "marker-dominant mixed page → ratio ≤ 0.3 (OCR fallback); got {r}"); } #[test] fn identity_h_marker_minority_with_long_valid_text_keeps_high_ratio() { // Inverse case: short marker noise + long valid text → ratio stays high // (no false OCR trigger on otherwise-good pages). let header = "x".repeat(200); // 200 char valid ASCII let s = format!("{header} ?Identity-H Unimplemented?"); // 1× marker = 26 char let r = compute_valid_char_ratio(&s); assert!(r > 0.9, "marker-minority page keeps high ratio; got {r}"); } }