feat(parse-pdf): add page_image (DCTDecode passthrough, 2 tests) + text_quality (valid char ratio, 8 unit tests) modules
Step 3 (Group C) of v0.20.0 sub-item 1 (scanned PDF OCR) plan.
C1 — `page_image::extract_dctdecode_page_image(pdf_doc, page_num)` ->
Result<Option<Vec<u8>>>. lopdf 의 Resources/XObject traverse, 첫 image
XObject 의 /Filter 검사 (single Name OR Array form 모두 cover, spec §4.1
line 642-664), DCTDecode + JPEG magic 검증 통과 시 raw bytes 반환. 다른
encoding 또는 image XObject 부재 시 Ok(None). v1 scope = DCTDecode
passthrough only (H-3 invariant, image crate 도입 0).
Integration tests (`tests/page_image.rs`, 2 tests):
- f1_fixture_yields_dctdecode_jpeg_bytes — F1 fixture happy path.
- flate_raw_fixture_yields_none — F6 fixture negative path.
C2 — `text_quality::compute_valid_char_ratio(s) -> f32`. valid char =
ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin
Extended + common Korean punctuation. 빈 string → 0.0. caller
(`kebab-app::pdf_ocr_apply`) 가 threshold 와 비교 (default 0.5).
Unit tests (`mod tests`, 7 + F4 conditional):
- empty / pure ASCII / pure Hangul / pure PUA / mixed half / CJK / Hangul Jamo.
- f4_fixture_ratio_under_threshold: active (case A — lopdf extract_text 가
ToUnicode CMap 부재 시 빈 string 반환 → valid_ratio = 0.0000 < 0.3).
Also: Cargo.toml description 갱신 ("Text PDF extractor + scanned-page
image extract helpers ...", Step 1 A2 이연분).
fixture fix: mojibake.pdf 의 startxref 22130 → 22114 (16-byte offset 오차
수정 — lopdf strict parser 가 xref 를 찾지 못하는 버그 해결).
spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.1 line 600-722)
plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 3 C1+C2)
prior: aeeff36 (Step 2 fixtures) + fb3952d (Step 2 F7 record fix)
contract: §9 (additive minor wire bump — 후속 step)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,7 @@ edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
|
||||
description = "Text PDF extractor + scanned-page image extract helpers for the kebab pipeline (P7-1 + v0.20.0 sub-item 1)"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
|
||||
@@ -18,7 +18,12 @@
|
||||
//! §9.2 (PDF text extraction), §9 versioning.
|
||||
|
||||
mod info;
|
||||
mod page_image;
|
||||
mod page_text;
|
||||
mod text_quality;
|
||||
|
||||
pub use page_image::extract_dctdecode_page_image;
|
||||
pub use text_quality::compute_valid_char_ratio;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
|
||||
77
crates/kebab-parse-pdf/src/page_image.rs
Normal file
77
crates/kebab-parse-pdf/src/page_image.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
// crates/kebab-parse-pdf/src/page_image.rs (신규)
|
||||
//
|
||||
// PDF page → DCTDecode JPEG bytes extract. lopdf 의 page 의 Resources/XObject
|
||||
// 를 traverse, 첫 image XObject 의 /Filter 검사, DCTDecode + JPEG magic
|
||||
// 검증 통과 시 raw bytes 반환. 다른 encoding (FlateDecode / CCITTFax /
|
||||
// JPXDecode) 또는 image XObject 없음 시 Ok(None).
|
||||
//
|
||||
// v1 scope = DCTDecode passthrough only (H-3 resolution 갈래 A). image
|
||||
// crate 도입 0 → single binary 원칙 보존.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lopdf::{Document, Object};
|
||||
|
||||
pub fn extract_dctdecode_page_image(
|
||||
pdf_doc: &Document,
|
||||
page_num: u32,
|
||||
) -> Result<Option<Vec<u8>>> {
|
||||
let pages = pdf_doc.get_pages();
|
||||
let &page_oid = pages.get(&page_num)
|
||||
.with_context(|| format!("page {} not in get_pages()", page_num))?;
|
||||
|
||||
// page → /Resources → /XObject → traverse for first /Subtype /Image with /Filter == /DCTDecode.
|
||||
let page = pdf_doc.get_dictionary(page_oid)?;
|
||||
let resources_obj = page.get(b"Resources").ok();
|
||||
let resources = match resources_obj {
|
||||
Some(Object::Dictionary(d)) => Some(d.clone()),
|
||||
Some(Object::Reference(r)) => pdf_doc.get_dictionary(*r).ok().cloned(),
|
||||
_ => None,
|
||||
};
|
||||
let resources = match resources { Some(r) => r, None => return Ok(None) };
|
||||
|
||||
let xobject_obj = resources.get(b"XObject").ok();
|
||||
let xobject = match xobject_obj {
|
||||
Some(Object::Dictionary(d)) => d.clone(),
|
||||
Some(Object::Reference(r)) => match pdf_doc.get_dictionary(*r) { Ok(d) => d.clone(), Err(_) => return Ok(None) },
|
||||
_ => return Ok(None),
|
||||
};
|
||||
|
||||
for (_name, obj) in xobject.iter() {
|
||||
let stream_oid = match obj {
|
||||
Object::Reference(r) => *r,
|
||||
_ => continue,
|
||||
};
|
||||
let stream = match pdf_doc.get_object(stream_oid) {
|
||||
Ok(Object::Stream(s)) => s.clone(),
|
||||
_ => continue,
|
||||
};
|
||||
let subtype_is_image = stream.dict.get(b"Subtype")
|
||||
.ok()
|
||||
.and_then(|o| match o { Object::Name(n) => Some(n.as_slice()), _ => None })
|
||||
.map(|n| n == b"Image")
|
||||
.unwrap_or(false);
|
||||
if !subtype_is_image { continue; }
|
||||
|
||||
let filter_obj = stream.dict.get(b"Filter").ok();
|
||||
let is_dct_only = match filter_obj {
|
||||
Some(Object::Name(n)) => n.as_slice() == b"DCTDecode",
|
||||
Some(Object::Array(arr)) => arr.len() == 1
|
||||
&& matches!(arr.first(), Some(Object::Name(n)) if n.as_slice() == b"DCTDecode"),
|
||||
_ => false,
|
||||
};
|
||||
if !is_dct_only { continue; }
|
||||
|
||||
// raw bytes — lopdf 의 stream.content 는 already-encoded (filter 적용
|
||||
// 후). DCTDecode 의 경우 raw JPEG bytes.
|
||||
let bytes = stream.content.clone();
|
||||
if bytes.len() < 4 || &bytes[0..2] != b"\xFF\xD8" {
|
||||
tracing::warn!(
|
||||
target: "kebab-parse-pdf",
|
||||
"page={} DCTDecode stream missing JPEG magic byte (\\xFF\\xD8), skip", page_num
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
return Ok(Some(bytes));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
102
crates/kebab-parse-pdf/src/text_quality.rs
Normal file
102
crates/kebab-parse-pdf/src/text_quality.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
// crates/kebab-parse-pdf/src/text_quality.rs (신규)
|
||||
//
|
||||
// Per-page text quality metric — vector PDF 의 valid text vs scanned PDF
|
||||
// 의 empty vs mojibake (ToUnicode CMap 누락 PUA codepoint) 구분.
|
||||
// caller (kebab-app::pdf_ocr_apply) 가 threshold 와 비교.
|
||||
|
||||
/// Valid char ratio (0.0..=1.0). 빈 string → 0.0.
|
||||
/// valid := ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation.
|
||||
pub fn compute_valid_char_ratio(s: &str) -> f32 {
|
||||
let mut total = 0u32;
|
||||
let mut valid = 0u32;
|
||||
for c in s.chars() {
|
||||
total += 1;
|
||||
if is_valid_text_char(c) { valid += 1; }
|
||||
}
|
||||
if total == 0 { return 0.0; }
|
||||
valid as f32 / total as f32
|
||||
}
|
||||
|
||||
/// Reports whether `c` counts as legitimate text for the valid-character ratio.
///
/// Accepted characters:
/// * tab, line feed, carriage return;
/// * printable ASCII (`U+0020..=U+007E`);
/// * Latin-1 Supplement through Latin Extended-B (`U+00A0..=U+024F`), which
///   includes the middle dot `U+00B7`;
/// * Hangul Jamo, Hangul Compatibility Jamo and Hangul Syllables;
/// * CJK Unified Ideographs (`U+4E00..=U+9FFF`);
/// * a fixed allow-list of General Punctuation marks (hyphen, dashes, curly
///   quotes, ellipsis, hyphenation point, prime and double prime).
///
/// Everything else is rejected, including other control characters, the
/// Private Use Area and General Punctuation marks not on the allow-list.
fn is_valid_text_char(c: char) -> bool {
    matches!(
        c,
        // tab / LF / CR
        '\u{0009}' | '\u{000A}' | '\u{000D}'
        // ASCII printable
        | '\u{0020}'..='\u{007E}'
        // Latin-1 Supplement + Latin Extended-A/B
        | '\u{00A0}'..='\u{024F}'
        // Hangul Jamo
        | '\u{1100}'..='\u{11FF}'
        // Hangul Compatibility Jamo
        | '\u{3130}'..='\u{318F}'
        // CJK Unified Ideographs
        | '\u{4E00}'..='\u{9FFF}'
        // Hangul Syllables
        | '\u{AC00}'..='\u{D7A3}'
        // General Punctuation allow-list
        | '\u{2010}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
        | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}'
        | '\u{201E}' | '\u{2026}' | '\u{2027}' | '\u{2032}' | '\u{2033}'
    )
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the ratio computed for `input` is within 1e-6 of `expected`.
    fn assert_ratio_near(input: &str, expected: f32) {
        let r = compute_valid_char_ratio(input);
        assert!((r - expected).abs() < 1e-6, "got {r}");
    }

    /// Builds a string of `count` consecutive Private Use Area characters
    /// starting at U+E000.
    fn pua_run(count: u32) -> String {
        (0xE000u32..0xE000 + count)
            .map(|cp| char::from_u32(cp).unwrap())
            .collect()
    }

    #[test]
    fn empty_string_zero() {
        assert_eq!(compute_valid_char_ratio(""), 0.0);
    }

    #[test]
    fn pure_ascii_one() {
        assert_ratio_near("Hello, World! 12345.", 1.0);
    }

    #[test]
    fn pure_hangul_syllables_one() {
        assert_ratio_near("안녕하세요 한글 테스트", 1.0);
    }

    #[test]
    fn pure_pua_zero() {
        // Private Use Area code points are the typical mojibake pattern;
        // U+E000..U+F8FF is not part of the valid character set.
        let s = pua_run(16);
        assert_eq!(compute_valid_char_ratio(&s), 0.0);
    }

    #[test]
    fn mixed_half() {
        // 5 valid ASCII characters + 5 PUA characters → 0.5
        let mut s = String::from("ABCDE");
        s.push_str(&pua_run(5));
        assert_ratio_near(&s, 0.5);
    }

    #[test]
    fn cjk_ideograph_valid() {
        assert_ratio_near("漢字大韓民國", 1.0);
    }

    #[test]
    fn hangul_jamo_valid() {
        // Two code points from the Hangul Jamo block.
        assert_ratio_near("\u{1100}\u{1161}", 1.0);
    }

    // F4 measurement: valid_ratio = 0.0000. lopdf returns an empty string for
    // this fixture because it has no ToUnicode CMap, so case A (< 0.3) applies
    // and the test is active.
    // Fixture fix: the startxref of mojibake.pdf was corrected from 22130 to
    // 22114 (a 16-byte offset error).
    #[test]
    fn f4_fixture_ratio_under_threshold() {
        let pdf = include_bytes!("../tests/fixtures/mojibake.pdf");
        let doc = lopdf::Document::load_mem(pdf).unwrap();
        // An extraction error is treated the same as empty text.
        let text = doc.extract_text(&[1]).unwrap_or_default();
        let r = compute_valid_char_ratio(&text);
        assert!(r < 0.3, "F4 mojibake fixture 의 valid_ratio < 0.3 (got {r})");
    }
}
|
||||
BIN
crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf
vendored
BIN
crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf
vendored
Binary file not shown.
24
crates/kebab-parse-pdf/tests/page_image.rs
Normal file
24
crates/kebab-parse-pdf/tests/page_image.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
// crates/kebab-parse-pdf/tests/page_image.rs (신규)
|
||||
|
||||
use lopdf::Document;
|
||||
use kebab_parse_pdf::extract_dctdecode_page_image;
|
||||
|
||||
// Happy path: page 1 of the F1 fixture carries a DCTDecode image, so the raw
// JPEG bytes must be passed through unchanged.
#[test]
fn f1_fixture_yields_dctdecode_jpeg_bytes() {
    let doc = Document::load_mem(include_bytes!("fixtures/scanned_page1.pdf")).unwrap();

    let jpeg = extract_dctdecode_page_image(&doc, 1)
        .unwrap()
        .expect("F1 의 page 1 이 DCTDecode image 보유");

    assert!(jpeg.starts_with(b"\xFF\xD8"), "JPEG magic missing");
    assert!(jpeg.len() > 1000, "JPEG bytes too small (got {})", jpeg.len());
}
|
||||
|
||||
// Negative path: the F6 fixture stores raw pixels behind FlateDecode, which the
// DCTDecode-only extractor must report as `Ok(None)`.
#[test]
fn flate_raw_fixture_yields_none() {
    let doc = Document::load_mem(include_bytes!("fixtures/flate_raw.pdf")).unwrap();

    let extracted = extract_dctdecode_page_image(&doc, 1).unwrap();

    assert!(extracted.is_none(), "FlateDecode page 가 Ok(None) 반환 — DCTDecode-only v1 invariant");
}
|
||||
Reference in New Issue
Block a user