From c2cd3a7ab7e0391ee800169f95051a46c5441259 Mon Sep 17 00:00:00 2001
From: altair823
Date: Wed, 27 May 2026 05:59:10 +0000
Subject: [PATCH] feat(parse-pdf): add page_image (DCTDecode passthrough, 2 test) + text_quality (valid char ratio, 8 unit test) modules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 3 (Group C) of v0.20.0 sub-item 1 (scanned PDF OCR) plan.

C1 — `page_image::extract_dctdecode_page_image(pdf_doc, page_num)`
-> `Result<Option<Vec<u8>>>`. lopdf 의 Resources/XObject traverse, 첫 image
XObject 의 /Filter 검사 (single Name OR Array form 모두 cover, spec §4.1
line 642-664), DCTDecode + JPEG magic 검증 통과 시 raw bytes 반환. 다른
encoding 또는 image XObject 부재 시 Ok(None). v1 scope = DCTDecode
passthrough only (H-3 invariant, image crate 도입 0).

Integration test (`tests/page_image.rs`, 2 test):
- f1_fixture_yields_dctdecode_jpeg_bytes — F1 fixture happy path.
- flate_raw_fixture_yields_none — F6 fixture negative path.

C2 — `text_quality::compute_valid_char_ratio(s) -> f32`. valid char = ASCII
printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended +
common Korean punctuation. 빈 string → 0.0. caller
(`kebab-app::pdf_ocr_apply`) 가 threshold 와 비교 (default 0.5).

Unit test (`mod tests`, 7 + F4 conditional):
- empty / pure ASCII / pure Hangul / pure PUA / mixed half / CJK / Hangul Jamo.
- f4_fixture_ratio_under_threshold: active (case A — lopdf extract_text 가
  ToUnicode CMap 부재 시 빈 string 반환 → valid_ratio = 0.0000 < 0.3).

Also: Cargo.toml description 갱신 ("Text PDF extractor + scanned-page image
extract helpers ...", Step 1 A2 이연분).

fixture fix: mojibake.pdf 의 startxref 22130 → 22114 (16-byte offset 오차
수정 — lopdf strict parser 가 xref 를 찾지 못하는 버그 해결).

spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.1 line 600-722)
plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 3 C1+C2)
prior: aeeff36 (Step 2 fixtures) + fb3952d (Step 2 F7 record fix)
contract: §9 (additive minor wire bump — 후속 step)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/kebab-parse-pdf/Cargo.toml             |   2 +-
 crates/kebab-parse-pdf/src/lib.rs             |   5 +
 crates/kebab-parse-pdf/src/page_image.rs      | 113 +++++++++++++++++
 crates/kebab-parse-pdf/src/text_quality.rs    | 118 ++++++++++++++++++
 .../tests/fixtures/mojibake.pdf               | Bin 22568 -> 22568 bytes
 crates/kebab-parse-pdf/tests/page_image.rs    |  24 ++++
 6 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 crates/kebab-parse-pdf/src/page_image.rs
 create mode 100644 crates/kebab-parse-pdf/src/text_quality.rs
 create mode 100644 crates/kebab-parse-pdf/tests/page_image.rs

diff --git a/crates/kebab-parse-pdf/Cargo.toml b/crates/kebab-parse-pdf/Cargo.toml
index 134c2c8..c6037c8 100644
--- a/crates/kebab-parse-pdf/Cargo.toml
+++ b/crates/kebab-parse-pdf/Cargo.toml
@@ -5,7 +5,7 @@ edition = { workspace = true }
 rust-version = { workspace = true }
 license = { workspace = true }
 repository = { workspace = true }
-description = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
+description = "Text PDF extractor + scanned-page image extract helpers for the kebab pipeline (P7-1 + v0.20.0 sub-item 1)"
 
 [dependencies]
 kebab-core = { path = "../kebab-core" }
diff --git a/crates/kebab-parse-pdf/src/lib.rs b/crates/kebab-parse-pdf/src/lib.rs
index 3d2505f..0ecd6e2 100644
--- a/crates/kebab-parse-pdf/src/lib.rs
+++ b/crates/kebab-parse-pdf/src/lib.rs
@@ -18,7 +18,12 @@
 //! §9.2 (PDF text extraction), §9 versioning.
 
 mod info;
+mod page_image;
 mod page_text;
+mod text_quality;
+
+pub use page_image::extract_dctdecode_page_image;
+pub use text_quality::compute_valid_char_ratio;
 
 use anyhow::{Context, Result};
 use kebab_core::{
diff --git a/crates/kebab-parse-pdf/src/page_image.rs b/crates/kebab-parse-pdf/src/page_image.rs
new file mode 100644
index 0000000..a1bce85
--- /dev/null
+++ b/crates/kebab-parse-pdf/src/page_image.rs
@@ -0,0 +1,113 @@
+//! PDF page -> DCTDecode JPEG byte extraction.
+//!
+//! Walks the page's `/Resources` -> `/XObject` dictionary, picks the first
+//! image XObject whose `/Filter` is exactly `DCTDecode` (single Name or
+//! one-element Array form), checks the JPEG magic bytes and returns the
+//! stream bytes untouched. Any other encoding (FlateDecode / CCITTFax /
+//! JPXDecode) or a page without an image XObject yields `Ok(None)`.
+//!
+//! v1 scope = DCTDecode passthrough only (H-3 resolution, branch A). No
+//! `image` crate is introduced, which keeps the single-binary principle.
+
+use anyhow::{Context, Result};
+use lopdf::{Dictionary, Document, Object};
+
+/// Leading bytes of every JPEG file (the SOI marker).
+const JPEG_MAGIC: &[u8] = b"\xFF\xD8";
+
+/// Resolves `obj` to a dictionary, following a single indirect reference.
+fn resolve_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
+    match obj {
+        Object::Dictionary(dict) => Some(dict),
+        Object::Reference(id) => doc.get_dictionary(*id).ok(),
+        _ => None,
+    }
+}
+
+/// Returns the encoded JPEG bytes of the first DCTDecode image on a page.
+///
+/// `page_num` is looked up in `Document::get_pages()`; the integration tests
+/// address the first page as `1`.
+///
+/// # Returns
+/// * `Ok(Some(bytes))` - the first `/Subtype /Image` XObject whose only
+///   filter is `DCTDecode` and whose data starts with the JPEG magic bytes.
+/// * `Ok(None)` - the page has no `/Resources` or `/XObject` dictionary, no
+///   DCTDecode-only image, or that image fails the magic-byte check.
+///
+/// # Errors
+/// Returns an error when `page_num` is not a key of `get_pages()` or when the
+/// page object cannot be read as a dictionary.
+pub fn extract_dctdecode_page_image(
+    pdf_doc: &Document,
+    page_num: u32,
+) -> Result<Option<Vec<u8>>> {
+    let pages = pdf_doc.get_pages();
+    let &page_oid = pages
+        .get(&page_num)
+        .with_context(|| format!("page {} not in get_pages()", page_num))?;
+
+    // page -> /Resources -> /XObject. Either hop may be an inline dictionary
+    // or an indirect reference; everything is borrowed from `pdf_doc`.
+    let page = pdf_doc.get_dictionary(page_oid)?;
+    let xobjects = page
+        .get(b"Resources")
+        .ok()
+        .and_then(|obj| resolve_dict(pdf_doc, obj))
+        .and_then(|resources| resources.get(b"XObject").ok())
+        .and_then(|obj| resolve_dict(pdf_doc, obj));
+    let xobjects = match xobjects {
+        Some(dict) => dict,
+        None => return Ok(None),
+    };
+
+    for (_name, obj) in xobjects.iter() {
+        // Only indirect references that resolve to a stream are candidates.
+        let stream = match obj {
+            Object::Reference(id) => match pdf_doc.get_object(*id) {
+                Ok(Object::Stream(stream)) => stream,
+                _ => continue,
+            },
+            _ => continue,
+        };
+
+        let is_image = matches!(
+            stream.dict.get(b"Subtype"),
+            Ok(Object::Name(name)) if name.as_slice() == b"Image"
+        );
+        if !is_image {
+            continue;
+        }
+
+        // A filter chain longer than one entry would need real decoding, so
+        // only the bare Name form and the one-element Array form qualify.
+        let is_dct_only = match stream.dict.get(b"Filter") {
+            Ok(Object::Name(name)) => name.as_slice() == b"DCTDecode",
+            Ok(Object::Array(filters)) => {
+                filters.len() == 1
+                    && matches!(
+                        filters.first(),
+                        Some(Object::Name(name)) if name.as_slice() == b"DCTDecode"
+                    )
+            }
+            _ => false,
+        };
+        if !is_dct_only {
+            continue;
+        }
+
+        // `stream.content` still holds the filter-encoded bytes, which for
+        // DCTDecode is the JPEG file itself. Validate before copying so a
+        // rejected stream costs no allocation.
+        let encoded = stream.content.as_slice();
+        if encoded.len() < 4 || !encoded.starts_with(JPEG_MAGIC) {
+            tracing::warn!(
+                target: "kebab-parse-pdf",
+                "page={} DCTDecode stream missing JPEG magic byte (\\xFF\\xD8), skip", page_num
+            );
+            return Ok(None);
+        }
+        return Ok(Some(encoded.to_vec()));
+    }
+    Ok(None)
+}
diff --git a/crates/kebab-parse-pdf/src/text_quality.rs b/crates/kebab-parse-pdf/src/text_quality.rs
new file mode 100644
index 0000000..756692f
--- /dev/null
+++ b/crates/kebab-parse-pdf/src/text_quality.rs
@@ -0,0 +1,118 @@
+//! Per-page text quality metric.
+//!
+//! Tells apart the valid text of a vector PDF, the empty text of a scanned
+//! PDF, and mojibake (PUA codepoints emitted when the ToUnicode CMap is
+//! missing). The caller (`kebab-app::pdf_ocr_apply`) compares the ratio
+//! against its threshold.
+
+/// Ratio of valid characters in `s`, in `0.0..=1.0`. An empty string yields `0.0`.
+///
+/// valid := tab / LF / CR + ASCII printable + Latin-1 Supplement and Latin
+/// Extended-A/B + Hangul (Jamo / Compatibility Jamo / Syllables) + CJK
+/// Unified Ideographs + a fixed set of General Punctuation characters.
+///
+/// Whitespace counts as valid, so a whitespace-only string scores `1.0`;
+/// only the empty string maps to `0.0`.
+pub fn compute_valid_char_ratio(s: &str) -> f32 {
+    // `usize` counters: a `&str` cannot hold more chars than it has bytes, so
+    // these cannot overflow the way fixed 32-bit counters could.
+    let mut total = 0usize;
+    let mut valid = 0usize;
+    for c in s.chars() {
+        total += 1;
+        if is_valid_text_char(c) {
+            valid += 1;
+        }
+    }
+    if total == 0 {
+        return 0.0;
+    }
+    valid as f32 / total as f32
+}
+
+/// Whether `c` falls in one of the ranges counted as real text.
+fn is_valid_text_char(c: char) -> bool {
+    match c {
+        '\t' | '\n' | '\r' => true,      // tab / LF / CR
+        '\u{0020}'..='\u{007E}' => true, // ASCII printable
+        '\u{00A0}'..='\u{024F}' => true, // Latin-1 Supplement + Latin Extended-A/B
+        '\u{1100}'..='\u{11FF}' => true, // Hangul Jamo
+        '\u{3130}'..='\u{318F}' => true, // Hangul Compatibility Jamo
+        '\u{4E00}'..='\u{9FFF}' => true, // CJK Unified Ideographs
+        '\u{AC00}'..='\u{D7A3}' => true, // Hangul Syllables
+        // General Punctuation: only these dashes, quotes, ellipsis and primes.
+        // U+00B7 (middle dot) needs no entry here; the Latin-1 arm covers it.
+        '\u{2010}' | '\u{2013}' | '\u{2014}' | '\u{2015}' | '\u{2018}' | '\u{2019}'
+        | '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{2026}' | '\u{2027}' | '\u{2032}'
+        | '\u{2033}' => true,
+        _ => false,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_string_zero() {
+        assert_eq!(compute_valid_char_ratio(""), 0.0);
+    }
+
+    #[test]
+    fn pure_ascii_one() {
+        let r = compute_valid_char_ratio("Hello, World! 12345.");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn pure_hangul_syllables_one() {
+        let r = compute_valid_char_ratio("안녕하세요 한글 테스트");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn pure_pua_zero() {
+        // Private Use Area codepoints are the typical mojibake pattern;
+        // U+E000..U+F8FF is not in the valid char list.
+        let s: String = (0xE000u32..0xE010).map(|c| char::from_u32(c).unwrap()).collect();
+        let r = compute_valid_char_ratio(&s);
+        assert_eq!(r, 0.0);
+    }
+
+    #[test]
+    fn mixed_half() {
+        // 5 valid ASCII + 5 PUA -> 0.5
+        let mut s = String::from("ABCDE");
+        for c in 0xE000u32..0xE005 {
+            s.push(char::from_u32(c).unwrap());
+        }
+        let r = compute_valid_char_ratio(&s);
+        assert!((r - 0.5).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn cjk_ideograph_valid() {
+        let r = compute_valid_char_ratio("漢字大韓民國");
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    #[test]
+    fn hangul_jamo_valid() {
+        let r = compute_valid_char_ratio("\u{1100}\u{1161}"); // conjoining Jamo U+1100 + U+1161
+        assert!((r - 1.0).abs() < 1e-6, "got {r}");
+    }
+
+    // F4 measurement: valid_ratio = 0.0000. lopdf's extract_text returns an
+    // empty string because the fixture has no ToUnicode CMap, so this is
+    // case A (< 0.3) and the test is active.
+    // Fixture fix: mojibake.pdf startxref 22130 -> 22114 (16-byte offset error).
+    #[test]
+    fn f4_fixture_ratio_under_threshold() {
+        use lopdf::Document;
+        let bytes = include_bytes!("../tests/fixtures/mojibake.pdf");
+        let doc = Document::load_mem(bytes).unwrap();
+        let text = doc.extract_text(&[1]).unwrap_or_default();
+        let r = compute_valid_char_ratio(&text);
+        assert!(r < 0.3, "F4 mojibake fixture 의 valid_ratio < 0.3 (got {r})");
+    }
+}
diff --git a/crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf b/crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf
index 57ac7b9eae886c0947ea76ef050e7bc3a3100ea4..96e2e3c6a2ea9c020af82024cdbbbcbd179439f3 100644
GIT binary patch
delta 19
acmZ3nfpNtK#trfjoQ5V`s;aL3Zd?FFEd_u8

delta 19
acmZ3nfpNtK#trfjoW=%Rs;aL3Zd?FF9|e8@

diff --git a/crates/kebab-parse-pdf/tests/page_image.rs b/crates/kebab-parse-pdf/tests/page_image.rs
new file mode 100644
index 0000000..b1a16d4
--- /dev/null
+++ b/crates/kebab-parse-pdf/tests/page_image.rs
@@ -0,0 +1,24 @@
+// Integration tests for `extract_dctdecode_page_image`.
+
+use kebab_parse_pdf::extract_dctdecode_page_image;
+use lopdf::Document;
+
+// Happy path: F1 fixture (DCTDecode JPEG passthrough).
+#[test]
+fn f1_fixture_yields_dctdecode_jpeg_bytes() {
+    let bytes = include_bytes!("fixtures/scanned_page1.pdf");
+    let doc = Document::load_mem(bytes).unwrap();
+    let result = extract_dctdecode_page_image(&doc, 1).unwrap();
+    let jpeg = result.expect("F1 의 page 1 이 DCTDecode image 보유");
+    assert!(jpeg.starts_with(b"\xFF\xD8"), "JPEG magic missing");
+    assert!(jpeg.len() > 1000, "JPEG bytes too small (got {})", jpeg.len());
+}
+
+// Negative path: F6 fixture (FlateDecode raw pixels) must yield Ok(None).
+#[test]
+fn flate_raw_fixture_yields_none() {
+    let bytes = include_bytes!("fixtures/flate_raw.pdf");
+    let doc = Document::load_mem(bytes).unwrap();
+    let result = extract_dctdecode_page_image(&doc, 1).unwrap();
+    assert!(result.is_none(), "FlateDecode page 가 Ok(None) 반환 — DCTDecode-only v1 invariant");
+}