feat(parse-pdf): add page_image (DCTDecode passthrough, 2 tests) + text_quality (valid char ratio, 8 unit tests) modules

Step 3 (Group C) of v0.20.0 sub-item 1 (scanned PDF OCR) plan. C1 — `page_image::extract_dctdecode_page_image(pdf_doc, page_num)` -> Result<Option<Vec<u8>>>. lopdf 의 Resources/XObject traverse, 첫 image XObject 의 /Filter 검사 (single Name OR Array form 모두 cover, spec §4.1 line 642-664), DCTDecode + JPEG magic 검증 통과 시 raw bytes 반환. 다른 encoding 또는 image XObject 부재 시 Ok(None). v1 scope = DCTDecode passthrough only (H-3 invariant, image crate 도입 0). Integration test (`tests/page_image.rs`, 2 test): - f1_fixture_yields_dctdecode_jpeg_bytes — F1 fixture happy path. - flate_raw_fixture_yields_none — F6 fixture negative path. C2 — `text_quality::compute_valid_char_ratio(s) -> f32`. valid char = ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation. 빈 string → 0.0. caller (`kebab-app::pdf_ocr_apply`) 가 threshold 와 비교 (default 0.5). Unit test (`mod tests`, 7 + F4 conditional): - empty / pure ASCII / pure Hangul / pure PUA / mixed half / CJK / Hangul Jamo. - f4_fixture_ratio_under_threshold: active (case A — lopdf extract_text 가 ToUnicode CMap 부재 시 빈 string 반환 → valid_ratio = 0.0000 < 0.3). Also: Cargo.toml description 갱신 ("Text PDF extractor + scanned-page image extract helpers ...", Step 1 A2 이연분). fixture fix: mojibake.pdf 의 startxref 22130 → 22114 (16-byte offset 오차 수정 — lopdf strict parser 가 xref 를 찾지 못하는 버그 해결). spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.1 line 600-722) plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 3 C1+C2) prior: aeeff36 (Step 2 fixtures) + fb3952d (Step 2 F7 record fix) contract: §9 (additive minor wire bump — 후속 step) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 05:59:10 +00:00
parent fb3952d54f
commit c2cd3a7ab7
6 changed files with 209 additions and 1 deletions
--- a/crates/kebab-parse-pdf/Cargo.toml
+++ b/crates/kebab-parse-pdf/Cargo.toml
@@ -5,7 +5,7 @@ edition       = { workspace = true }
 rust-version  = { workspace = true }
 license       = { workspace = true }
 repository    = { workspace = true }
-description   = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
+description   = "Text PDF extractor + scanned-page image extract helpers for the kebab pipeline (P7-1 + v0.20.0 sub-item 1)"

 [dependencies]
 kebab-core   = { path = "../kebab-core" }
--- a/crates/kebab-parse-pdf/src/lib.rs
+++ b/crates/kebab-parse-pdf/src/lib.rs
@@ -18,7 +18,12 @@
 //! §9.2 (PDF text extraction), §9 versioning.

 mod info;
+mod page_image;
 mod page_text;
+mod text_quality;
+
+pub use page_image::extract_dctdecode_page_image;
+pub use text_quality::compute_valid_char_ratio;

 use anyhow::{Context, Result};
 use kebab_core::{
--- a/crates/kebab-parse-pdf/src/page_image.rs
+++ b/crates/kebab-parse-pdf/src/page_image.rs
@@ -0,0 +1,77 @@
+// crates/kebab-parse-pdf/src/page_image.rs (신규)
+//
+// PDF page → DCTDecode JPEG bytes extract. lopdf 의 page 의 Resources/XObject
+// 를 traverse, 첫 image XObject 의 /Filter 검사, DCTDecode + JPEG magic
+// 검증 통과 시 raw bytes 반환. 다른 encoding (FlateDecode / CCITTFax /
+// JPXDecode) 또는 image XObject 없음 시 Ok(None).
+//
+// v1 scope = DCTDecode passthrough only (H-3 resolution 갈래 A). image
+// crate 도입 0 → single binary 원칙 보존.
+
+use anyhow::{Context, Result};
+use lopdf::{Document, Object};
+/// Returns the raw bytes of the first `/Subtype /Image` XObject on page `page_num` (as keyed by `get_pages()`) whose `/Filter` is exactly `DCTDecode`; `Ok(None)` when the page has no usable `/Resources` or `/XObject` dictionary, no such image, or the stream lacks the JPEG magic; `Err` when `page_num` is unknown or the page dictionary cannot be read.
+pub fn extract_dctdecode_page_image(
+    pdf_doc: &Document,
+    page_num: u32,
+) -> Result<Option<Vec<u8>>> {
+    let pages = pdf_doc.get_pages();
+    let &page_oid = pages.get(&page_num)
+        .with_context(|| format!("page {} not in get_pages()", page_num))?;
+
+    // page → /Resources → /XObject → scan for the first /Subtype /Image with /Filter == /DCTDecode. NOTE(review): only the page's own /Resources entry is read; resources inherited from a parent /Pages node are not looked up — confirm this is acceptable.
+    let page = pdf_doc.get_dictionary(page_oid)?;
+    let resources_obj = page.get(b"Resources").ok();
+    let resources = match resources_obj {
+        Some(Object::Dictionary(d)) => Some(d.clone()),
+        Some(Object::Reference(r)) => pdf_doc.get_dictionary(*r).ok().cloned(),
+        _ => None,
+    };
+    let resources = match resources { Some(r) => r, None => return Ok(None) };
+
+    let xobject_obj = resources.get(b"XObject").ok();
+    let xobject = match xobject_obj {
+        Some(Object::Dictionary(d)) => d.clone(),
+        Some(Object::Reference(r)) => match pdf_doc.get_dictionary(*r) { Ok(d) => d.clone(), Err(_) => return Ok(None) },
+        _ => return Ok(None),
+    };
+
+    for (_name, obj) in xobject.iter() {
+        let stream_oid = match obj {
+            Object::Reference(r) => *r,
+            _ => continue, // entries that are not indirect references are ignored
+        };
+        let stream = match pdf_doc.get_object(stream_oid) {
+            Ok(Object::Stream(s)) => s.clone(), // NOTE(review): clones the whole stream, and its content is cloned again below
+            _ => continue,
+        };
+        let subtype_is_image = stream.dict.get(b"Subtype")
+            .ok()
+            .and_then(|o| match o { Object::Name(n) => Some(n.as_slice()), _ => None })
+            .map(|n| n == b"Image")
+            .unwrap_or(false);
+        if !subtype_is_image { continue; }
+
+        let filter_obj = stream.dict.get(b"Filter").ok();
+        let is_dct_only = match filter_obj {
+            Some(Object::Name(n)) => n.as_slice() == b"DCTDecode", // single-name form
+            Some(Object::Array(arr)) => arr.len() == 1 // array form: accepted only when DCTDecode is the sole filter
+                && matches!(arr.first(), Some(Object::Name(n)) if n.as_slice() == b"DCTDecode"),
+            _ => false,
+        };
+        if !is_dct_only { continue; }
+
+        // Raw bytes — `stream.content` is returned as-is, without decoding; per the original note, lopdf keeps it
+        // in its already-encoded form (filter applied), which for DCTDecode is the raw JPEG data. TODO confirm against lopdf docs.
+        let bytes = stream.content.clone();
+        if bytes.len() < 4 || &bytes[0..2] != b"\xFF\xD8" { // must be at least 4 bytes long and start with FF D8
+            tracing::warn!(
+                target: "kebab-parse-pdf",
+                "page={} DCTDecode stream missing JPEG magic byte (\\xFF\\xD8), skip", page_num
+            );
+            return Ok(None); // NOTE(review): gives up on the whole page here; remaining XObjects are not tried even though the log says "skip"
+        }
+        return Ok(Some(bytes));
+    }
+    Ok(None)
+}
--- a/crates/kebab-parse-pdf/src/text_quality.rs
+++ b/crates/kebab-parse-pdf/src/text_quality.rs
@@ -0,0 +1,102 @@
+// crates/kebab-parse-pdf/src/text_quality.rs (신규)
+//
+// Per-page text quality metric — vector PDF 의 valid text vs scanned PDF
+// 의 empty vs mojibake (ToUnicode CMap 누락 PUA codepoint) 구분.
+// caller (kebab-app::pdf_ocr_apply) 가 threshold 와 비교.
+
+/// Valid char ratio (0.0..=1.0): the number of `char`s in `s` accepted by `is_valid_text_char` divided by the total number of `char`s. An empty string yields 0.0.
+/// valid := tab/LF/CR + ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation.
+pub fn compute_valid_char_ratio(s: &str) -> f32 {
+    let mut total = 0u32;
+    let mut valid = 0u32;
+    for c in s.chars() {
+        total += 1;
+        if is_valid_text_char(c) { valid += 1; }
+    }
+    if total == 0 { return 0.0; } // avoids 0/0 for empty input
+    valid as f32 / total as f32
+}
+/// Returns `true` when the code point of `c` falls in one of the allow-listed ranges below (or is one of the listed punctuation marks); everything else, including the Private Use Area, is `false`.
+fn is_valid_text_char(c: char) -> bool {
+    let cp = c as u32;
+    match cp {
+        0x0009 | 0x000A | 0x000D => true,                  // tab / LF / CR
+        0x0020..=0x007E => true,                            // ASCII printable
+        0x00A0..=0x024F => true,                            // Latin-1 Supplement + Latin Extended-A/B
+        0x1100..=0x11FF => true,                            // Hangul Jamo
+        0x3130..=0x318F => true,                            // Hangul Compatibility Jamo
+        0x4E00..=0x9FFF => true,                            // CJK Unified Ideographs
+        0xAC00..=0xD7A3 => true,                            // Hangul Syllables
+        0x2010..=0x205F => matches!(c, // General Punctuation: only the dashes, quotes, ellipsis, hyphenation point and primes listed here count
+            '\u{2010}' | '\u{2013}' | '\u{2014}' | '\u{2015}' |
+            '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' |
+            '\u{201E}' | '\u{2026}' | '\u{2027}' | '\u{2032}' | '\u{2033}'
+            | '\u{00B7}'), // NOTE(review): U+00B7 lies outside 0x2010..=0x205F, so this alternative can never match in this arm; it is already accepted by the 0x00A0..=0x024F arm above
+        _ => false, // NOTE(review): U+3000..=U+303F (ideographic punctuation) and fullwidth forms end up here — confirm that is intended
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_string_zero() {
+        assert_eq!(compute_valid_char_ratio(""), 0.0);
+    }
+
+    #[test]
+    fn pure_ascii_one() {
+        let r = compute_valid_char_ratio("Hello, World! 12345.");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn pure_hangul_syllables_one() {
+        let r = compute_valid_char_ratio("안녕하세요 한글 테스트");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn pure_pua_zero() {
+        // Private Use Area code points — the mojibake pattern this metric is meant to catch.
+        // U+E000..U+F8FF is not in the valid char list.
+        let s: String = (0xE000u32..0xE010).map(|c| char::from_u32(c).unwrap()).collect();
+        let r = compute_valid_char_ratio(&s);
+        assert_eq!(r, 0.0);
+    }
+
+    #[test]
+    fn mixed_half() {
+        // 5 valid ASCII + 5 PUA → 0.5
+        let mut s = String::from("ABCDE");
+        for c in 0xE000u32..0xE005 { s.push(char::from_u32(c).unwrap()); }
+        let r = compute_valid_char_ratio(&s);
+        assert!((r - 0.5).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn cjk_ideograph_valid() {
+        let r = compute_valid_char_ratio("漢字大韓民國");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn hangul_jamo_valid() {
+        let r = compute_valid_char_ratio("\u{1100}\u{1161}");  // two code points from the Hangul Jamo block (U+1100, U+1161)
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    // F4 measurement: valid_ratio = 0.0000 (lopdf returns an empty string — extract_text yields empty text because the
+    // ToUnicode CMap is missing). Case A (< 0.3) → active. NOTE(review): unwrap_or_default() also turns an extract_text error into an empty string (ratio 0.0), so this test passes on extraction failure too.
+    // fixture fix: startxref in mojibake.pdf 22130 → 22114 (corrects a 16-byte offset error).
+    #[test]
+    fn f4_fixture_ratio_under_threshold() {
+        use lopdf::Document;
+        let bytes = include_bytes!("../tests/fixtures/mojibake.pdf");
+        let doc = Document::load_mem(bytes).unwrap();
+        let text = doc.extract_text(&[1]).unwrap_or_default();
+        let r = compute_valid_char_ratio(&text);
+        assert!(r < 0.3, "F4 mojibake fixture 의 valid_ratio < 0.3 (got {r})");
+    }
+}
--- a/crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf
+++ b/crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf
--- a/crates/kebab-parse-pdf/tests/page_image.rs
+++ b/crates/kebab-parse-pdf/tests/page_image.rs
@@ -0,0 +1,24 @@
+// crates/kebab-parse-pdf/tests/page_image.rs (신규)
+
+use lopdf::Document;
+use kebab_parse_pdf::extract_dctdecode_page_image;
+
+// Happy path — F1 fixture (scanned_page1.pdf, DCTDecode JPEG passthrough): page 1 must yield bytes that start with FF D8 and are longer than 1000 bytes.
+#[test]
+fn f1_fixture_yields_dctdecode_jpeg_bytes() {
+    let bytes = include_bytes!("fixtures/scanned_page1.pdf");
+    let doc = Document::load_mem(bytes).unwrap();
+    let result = extract_dctdecode_page_image(&doc, 1).unwrap();
+    let jpeg = result.expect("F1 의 page 1 이 DCTDecode image 보유");
+    assert!(jpeg.starts_with(b"\xFF\xD8"), "JPEG magic missing");
+    assert!(jpeg.len() > 1000, "JPEG bytes too small (got {})", jpeg.len());
+}
+
+// Negative path — F6 fixture (flate_raw.pdf, FlateDecode raw pixels): page 1 must yield Ok(None) rather than an error.
+#[test]
+fn flate_raw_fixture_yields_none() {
+    let bytes = include_bytes!("fixtures/flate_raw.pdf");
+    let doc = Document::load_mem(bytes).unwrap();
+    let result = extract_dctdecode_page_image(&doc, 1).unwrap();
+    assert!(result.is_none(), "FlateDecode page 가 Ok(None) 반환 — DCTDecode-only v1 invariant");
+}