From 5a158d734334397b0222a0773236daf14da489d7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 08:34:55 +0000 Subject: [PATCH 1/2] =?UTF-8?q?feat(kebab-parse-pdf):=20P7-1=20text=20PDF?= =?UTF-8?q?=20extractor=20=E2=80=94=20per-page=20CanonicalDocument?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `PdfTextExtractor`(MediaType::Pdf) lopdf 기반 per-page 텍스트 추출. 페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }` emit. 본문이 비거나 추출 panic 인 페이지는 빈 paragraph + `Provenance::Warning` ("scanned candidate") 로 표시 — 이후 OCR fallback (별도 task) 의 입력. 핵심 동작: - `lopdf::Document::load_mem` + `is_encrypted()` → 암호화 PDF 는 명시 에러 (`qpdf --decrypt` 안내). - 페이지 단위 `extract_text(&[page])` 를 `catch_unwind` 로 감싸 malformed page panic 을 recoverable warning 으로 변환. - `/Info` dict 에서 Title/Producer/Creator best-effort 추출. UTF-16BE BOM prefixed 문자열도 디코드 (한국어 등 non-ASCII Title 정상 처리). - 9개 통합 테스트: 3-page emit, scanned-mixed warning, encrypted refuse, corrupt header error, page_count 메타, UTF-16BE Title, filename fallback, determinism, snapshot. `parser_version = "pdf-text-v1"`. Allowed deps: `lopdf 0.32` + `pdf-extract 0.7` (원본 spec 그대로). 본문 다국어 OCR fallback 은 §9.2 후속 task (out of scope). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 121 ++++++++++ Cargo.toml | 1 + crates/kebab-parse-pdf/Cargo.toml | 30 +++ crates/kebab-parse-pdf/src/info.rs | 70 ++++++ crates/kebab-parse-pdf/src/lib.rs | 228 +++++++++++++++++++ crates/kebab-parse-pdf/src/page_text.rs | 13 ++ crates/kebab-parse-pdf/tests/common/mod.rs | 224 +++++++++++++++++++ crates/kebab-parse-pdf/tests/extractor.rs | 248 +++++++++++++++++++++ tasks/p7/p7-1-pdf-text-extractor.md | 2 +- 9 files changed, 936 insertions(+), 1 deletion(-) create mode 100644 crates/kebab-parse-pdf/Cargo.toml create mode 100644 crates/kebab-parse-pdf/src/info.rs create mode 100644 crates/kebab-parse-pdf/src/lib.rs create mode 100644 crates/kebab-parse-pdf/src/page_text.rs create mode 100644 crates/kebab-parse-pdf/tests/common/mod.rs create mode 100644 crates/kebab-parse-pdf/tests/extractor.rs diff --git a/Cargo.lock b/Cargo.lock index 219f346..04973ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "adobe-cmap-parser" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" +dependencies = [ + "pom", +] + [[package]] name = "ahash" version = "0.8.12" @@ -2114,6 +2123,15 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f" +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + [[package]] name = "event-listener" version = "5.4.1" @@ -3609,6 +3627,24 @@ dependencies = [ "tracing", ] +[[package]] +name = "kebab-parse-pdf" 
+version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "kebab-config", + "kebab-core", + "lopdf 0.32.0", + "pdf-extract", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.18", + "time", + "tracing", +] + [[package]] name = "kebab-parse-types" version = "0.1.0" @@ -4466,6 +4502,12 @@ dependencies = [ "include_dir", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -4521,6 +4563,43 @@ dependencies = [ "imgref", ] +[[package]] +name = "lopdf" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "itoa", + "linked-hash-map", + "log", + "md5", + "nom 7.1.3", + "rayon", + "time", + "weezl", +] + +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "encoding_rs", + "flate2", + "indexmap 2.14.0", + "itoa", + "log", + "md-5", + "nom 7.1.3", + "rangemap", + "time", + "weezl", +] + [[package]] name = "lru" version = "0.12.5" @@ -4639,6 +4718,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "measure_time" version = "0.9.0" @@ -5265,6 +5350,21 @@ dependencies = [ "stfu8", ] +[[package]] +name = "pdf-extract" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" +dependencies = [ + "adobe-cmap-parser", + "encoding_rs", + 
"euclid", + "lopdf 0.34.0", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -5368,6 +5468,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + [[package]] name = "portable-atomic" version = "1.13.1" @@ -5383,6 +5489,12 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "potential_utf" version = "0.1.5" @@ -7503,6 +7615,15 @@ dependencies = [ "rand 0.9.4", ] +[[package]] +name = "type1-encoding-parser" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749" +dependencies = [ + "pom", +] + [[package]] name = "typenum" version = "1.20.0" diff --git a/Cargo.toml b/Cargo.toml index 5b6cce7..440b084 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "crates/kebab-cli", "crates/kebab-eval", "crates/kebab-parse-image", + "crates/kebab-parse-pdf", ] [workspace.package] diff --git a/crates/kebab-parse-pdf/Cargo.toml b/crates/kebab-parse-pdf/Cargo.toml new file mode 100644 index 0000000..3e87673 --- /dev/null +++ b/crates/kebab-parse-pdf/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kebab-parse-pdf" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)" + +[dependencies] +kebab-core = { path = "../kebab-core" } +kebab-config = { path = "../kebab-config" } +anyhow = { 
workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +tracing = { workspace = true } +thiserror = { workspace = true } +# Per-page text extraction. `lopdf::Document::extract_text(&[page])` +# is the only stable per-page API across the pdf-extract / lopdf +# pair (pdf-extract 0.7 still exposes only whole-document calls). +lopdf = "0.32" +# Whole-document sanity-check call; covers a few format errors that +# lopdf swallows silently. Per-page text is sourced from lopdf only. +pdf-extract = "0.7" + +[dev-dependencies] +tempfile = { workspace = true } +blake3 = { workspace = true } +serde_json = { workspace = true } diff --git a/crates/kebab-parse-pdf/src/info.rs b/crates/kebab-parse-pdf/src/info.rs new file mode 100644 index 0000000..d1b1b86 --- /dev/null +++ b/crates/kebab-parse-pdf/src/info.rs @@ -0,0 +1,70 @@ +//! `/Info` dictionary extraction (best-effort). +//! +//! PDFs may carry a `/Info` trailer dictionary with `Title`, +//! `Producer`, `Creator`, etc. Strings are encoded as either +//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the +//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to +//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is +//! not an error. 
+ +#[derive(Default)] +pub(crate) struct InfoDict { + pub title: Option, + pub producer: Option, + pub creator: Option, +} + +pub(crate) fn extract_info(doc: &lopdf::Document) -> InfoDict { + let mut out = InfoDict::default(); + + let info_obj = match doc.trailer.get(b"Info") { + Ok(o) => o, + Err(_) => return out, + }; + + let dict = match info_obj { + lopdf::Object::Dictionary(d) => Some(d), + lopdf::Object::Reference(id) => doc + .get_object(*id) + .ok() + .and_then(|o| o.as_dict().ok()), + _ => None, + }; + + let Some(dict) = dict else { return out }; + + out.title = pdf_string(dict, b"Title"); + out.producer = pdf_string(dict, b"Producer"); + out.creator = pdf_string(dict, b"Creator"); + out +} + +fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option { + let raw = dict.get(key).ok()?; + let bytes: &[u8] = match raw { + lopdf::Object::String(s, _) => s.as_slice(), + _ => return None, + }; + + // UTF-16BE with BOM (very common for non-ASCII PDF titles). + if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { + let payload = &bytes[2..]; + if payload.len() % 2 == 0 { + let units: Vec = payload + .chunks_exact(2) + .map(|c| u16::from_be_bytes([c[0], c[1]])) + .collect(); + let s = String::from_utf16_lossy(&units); + if !s.is_empty() { + return Some(s); + } + } + } + + // PDFDocEncoding overlaps Latin-1 for the printable range we care + // about, and Latin-1 is byte-identical to UTF-8 only for ASCII; + // `from_utf8_lossy` is the conservative call here. ASCII-only + // PDFs (the common case) round-trip cleanly. + let s = String::from_utf8_lossy(bytes).into_owned(); + if s.is_empty() { None } else { Some(s) } +} diff --git a/crates/kebab-parse-pdf/src/lib.rs b/crates/kebab-parse-pdf/src/lib.rs new file mode 100644 index 0000000..07d0995 --- /dev/null +++ b/crates/kebab-parse-pdf/src/lib.rs @@ -0,0 +1,228 @@ +//! `kebab-parse-pdf` — text PDF extractor (P7-1). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Pdf`]. Extracts +//! 
text page-by-page via `lopdf`'s per-page API and emits one +//! [`Block::Paragraph`] per page with [`SourceSpan::Page`] (1-based page, +//! `char_start = 0`, `char_end = chars().count()`). +//! +//! Pages where text extraction fails or returns empty get an empty +//! `Block::Paragraph` plus a `Provenance::Warning` flagging the page as +//! a "scanned candidate" — out-of-scope OCR fallback can pick those up. +//! +//! Scope is intentionally narrow: page text + page numbers. Layout +//! reconstruction (multi-column reading order, tables, math), form +//! fields, bookmarks, and OCR for scanned PDFs are explicitly **not** +//! in this task. See `tasks/p7/p7-1-pdf-text-extractor.md`. +//! +//! Per design §3.4 (`SourceSpan::Page` / `Block::Paragraph`), +//! §9.2 (PDF text extraction), §9 versioning. + +mod info; +mod page_text; + +use anyhow::{Context, Result}; +use kebab_core::{ + Block, CanonicalDocument, CommonBlock, Extractor, Inline, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TextBlock, + TrustLevel, id_for_block, id_for_doc, +}; +use serde_json::{Map, Value}; +use time::OffsetDateTime; + +pub const PARSER_VERSION: &str = "pdf-text-v1"; + +/// Text-PDF extractor. Per-page text via `lopdf::Document::extract_text` +/// (the only stable per-page API in the lopdf / pdf-extract pair — +/// pdf-extract 0.7 only exposes whole-document calls). 
+pub struct PdfTextExtractor; + +impl PdfTextExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for PdfTextExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for PdfTextExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Pdf) + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-pdf: unsupported media_type for PdfTextExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + // Catastrophic-decode guard via lopdf. `pdf-extract` is intentionally + // not used for parsing here — it only exposes whole-doc text and + // would re-parse the bytes a second time. + let pdf_doc = lopdf::Document::load_mem(bytes) + .context("kebab-parse-pdf: failed to parse PDF (corrupt header or not a PDF)")?; + + if pdf_doc.is_encrypted() { + anyhow::bail!( + "kebab-parse-pdf: encrypted PDF; remove encryption (e.g. `qpdf --decrypt`) before ingest" + ); + } + + let info = info::extract_info(&pdf_doc); + // `get_pages()` returns BTreeMap with 1-based page + // numbers. We iterate keys in BTreeMap natural order so output is + // deterministic. 
+ let pages = pdf_doc.get_pages(); + let page_count = pages.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2 + pages.len()); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-pdf".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; page_count={}", + parser_version.0, page_count + )), + }); + + let mut blocks: Vec = Vec::with_capacity(pages.len()); + for (&page_num, _) in pages.iter() { + let (text, warning) = match page_text::extract_one(&pdf_doc, page_num) { + Ok(t) if !t.trim().is_empty() => (t, None), + Ok(_) => ( + String::new(), + Some(format!("page{page_num} empty (scanned candidate)")), + ), + Err(e) => ( + String::new(), + Some(format!( + "page{page_num} extract failed: {e} (scanned candidate)" + )), + ), + }; + let char_count = text.chars().count() as u32; + let span = SourceSpan::Page { + page: page_num, + char_start: Some(0), + char_end: Some(char_count), + }; + // ordinal = page - 1; saturating_sub guards the (shouldn't-happen) + // case where lopdf hands back a 0-indexed page key. 
+ let ordinal = page_num.saturating_sub(1); + let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span); + let common = CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }; + let inlines = if text.is_empty() { + Vec::new() + } else { + vec![Inline::Text { text: text.clone() }] + }; + blocks.push(Block::Paragraph(TextBlock { + common, + text, + inlines, + })); + if let Some(note) = warning { + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-pdf".to_string(), + kind: ProvenanceKind::Warning, + note: Some(note), + }); + } + } + + let title = info + .title + .clone() + .filter(|t| !t.trim().is_empty()) + .unwrap_or_else(|| { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }); + + let mut user = Map::new(); + let mut pdf_meta = Map::new(); + pdf_meta.insert("page_count".into(), Value::Number(page_count.into())); + if let Some(p) = &info.producer { + pdf_meta.insert("producer".into(), Value::String(p.clone())); + } + if let Some(c) = &info.creator { + pdf_meta.insert("creator".into(), Value::String(c.clone())); + } + user.insert("pdf".into(), Value::Object(pdf_meta)); + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Paper, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user, + }; + + tracing::debug!( + target: "kebab-parse-pdf", + "extracted PDF doc_id={} workspace_path={} pages={}", + doc_id.0, + asset.workspace_path.0, + page_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + }) + } +} + +fn filename_from_workspace_path(p: &str) -> String { + p.rsplit('/').next().unwrap_or(p).to_string() +} + 
+fn strip_extension(filename: &str) -> String { + match filename.rfind('.') { + Some(0) => filename.to_string(), + Some(idx) => filename[..idx].to_string(), + None => filename.to_string(), + } +} diff --git a/crates/kebab-parse-pdf/src/page_text.rs b/crates/kebab-parse-pdf/src/page_text.rs new file mode 100644 index 0000000..e9fc4fe --- /dev/null +++ b/crates/kebab-parse-pdf/src/page_text.rs @@ -0,0 +1,13 @@ +//! Per-page text extraction. `lopdf::Document::extract_text(&[page])` +//! is the call we lean on; it has a thin history of panicking on +//! malformed pages, so we wrap it in `catch_unwind` to convert the +//! panic into a recoverable `Err` (which the caller maps to an empty +//! page + Warning). + +use std::panic::{AssertUnwindSafe, catch_unwind}; + +pub(crate) fn extract_one(doc: &lopdf::Document, page: u32) -> anyhow::Result { + let result = catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page]))) + .map_err(|_| anyhow::anyhow!("panic during lopdf::Document::extract_text"))?; + result.map_err(|e| anyhow::anyhow!("lopdf extract_text error: {e}")) +} diff --git a/crates/kebab-parse-pdf/tests/common/mod.rs b/crates/kebab-parse-pdf/tests/common/mod.rs new file mode 100644 index 0000000..f1dd7e6 --- /dev/null +++ b/crates/kebab-parse-pdf/tests/common/mod.rs @@ -0,0 +1,224 @@ +//! Test fixture builders for `kebab-parse-pdf`. +//! +//! PDFs are constructed in-memory at test time via `lopdf` rather than +//! committed as binary fixtures. Same rationale as +//! `kebab-parse-image::tests::common`: fixture provenance is auditable +//! from source, no `include_bytes!` paths to keep in sync, and the test +//! binary stays self-contained. 
+ +#![allow(dead_code)] + +use std::path::PathBuf; + +use kebab_core::{ + AssetStorage, Checksum, ExtractConfig, ExtractContext, MediaType, RawAsset, SourceUri, + WorkspacePath, +}; +use lopdf::content::{Content, Operation}; +use lopdf::{Document, Object, Stream, dictionary}; +use time::OffsetDateTime; + +/// `/Info` dict fields a fixture wants to surface (all optional). +#[derive(Default, Clone)] +pub struct InfoDict { + pub title: Option>, // raw bytes — caller controls PDFDocEncoding vs UTF-16BE + pub producer: Option<&'static str>, + pub creator: Option<&'static str>, +} + +/// Build a Helvetica-text PDF. `pages` is one entry per page; `None` +/// means the page exists in `/Pages` but has no `/Contents` stream +/// (the "scanned candidate" shape — `extract_text` returns empty). +pub fn build_text_pdf(pages: &[Option<&str>]) -> Vec { + build_text_pdf_with_info(pages, &InfoDict::default()) +} + +pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec { + let mut doc = Document::with_version("1.5"); + let pages_id = doc.new_object_id(); + let font_id = doc.add_object(dictionary! { + "Type" => "Font", + "Subtype" => "Type1", + "BaseFont" => "Helvetica", + }); + let resources_id = doc.add_object(dictionary! { + "Font" => dictionary! { "F1" => font_id }, + }); + + let mut page_refs: Vec = Vec::new(); + for page in pages { + let mut page_dict = dictionary! { + "Type" => "Page", + "Parent" => pages_id, + }; + if let Some(text) = page { + let content = Content { + operations: vec![ + Operation::new("BT", vec![]), + Operation::new("Tf", vec!["F1".into(), 24.into()]), + Operation::new( + "Td", + vec![Object::Integer(100), Object::Integer(700)], + ), + Operation::new("Tj", vec![Object::string_literal(*text)]), + Operation::new("ET", vec![]), + ], + }; + let stream_data = content.encode().expect("content encode"); + let content_id = + doc.add_object(Stream::new(dictionary! 
{}, stream_data)); + page_dict.set("Contents", content_id); + } + let page_id = doc.add_object(page_dict); + page_refs.push(page_id.into()); + } + + let count = page_refs.len() as i64; + let pages_dict = dictionary! { + "Type" => "Pages", + "Kids" => page_refs, + "Count" => count, + "Resources" => resources_id, + "MediaBox" => vec![ + Object::Integer(0), + Object::Integer(0), + Object::Integer(595), + Object::Integer(842), + ], + }; + doc.objects + .insert(pages_id, Object::Dictionary(pages_dict)); + + let catalog_id = doc.add_object(dictionary! { + "Type" => "Catalog", + "Pages" => pages_id, + }); + doc.trailer.set("Root", catalog_id); + + if info.title.is_some() || info.producer.is_some() || info.creator.is_some() { + let mut info_dict = lopdf::Dictionary::new(); + if let Some(title) = &info.title { + info_dict.set( + "Title", + Object::String(title.clone(), lopdf::StringFormat::Literal), + ); + } + if let Some(p) = info.producer { + info_dict.set( + "Producer", + Object::String(p.as_bytes().to_vec(), lopdf::StringFormat::Literal), + ); + } + if let Some(c) = info.creator { + info_dict.set( + "Creator", + Object::String(c.as_bytes().to_vec(), lopdf::StringFormat::Literal), + ); + } + let info_id = doc.add_object(Object::Dictionary(info_dict)); + doc.trailer.set("Info", info_id); + } + + let mut out: Vec = Vec::new(); + doc.save_to(&mut out).expect("save PDF to memory"); + out +} + +/// Wrap any valid PDF byte buffer with a fake `/Encrypt` trailer entry +/// so `Document::is_encrypted()` flips to true. We don't actually +/// encrypt anything — the extractor refuses encrypted PDFs **before** +/// touching streams, so the marker is sufficient. +pub fn make_encrypted_pdf() -> Vec { + let bytes = build_text_pdf(&[Some("placeholder")]); + let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF"); + let enc_id = doc.add_object(dictionary! 
{ + "Filter" => "Standard", + "V" => 1, + "R" => 2, + "Length" => 40, + "P" => -4, + }); + doc.trailer.set("Encrypt", enc_id); + let mut out = Vec::new(); + doc.save_to(&mut out).expect("save encrypted PDF"); + out +} + +/// 27-byte garbage with no `%PDF-` header — `Document::load_mem` errors. +pub fn corrupt_pdf() -> Vec { + b"NOT A PDF; just plain bytes".to_vec() +} + +/// Encode a Rust `&str` as the PDF UTF-16BE-with-BOM string format. +/// Used to verify `info::pdf_string` decodes the multilingual Title +/// path correctly. +pub fn utf16be_bom(s: &str) -> Vec { + let mut out = Vec::with_capacity(2 + s.encode_utf16().count() * 2); + out.extend_from_slice(&[0xFE, 0xFF]); + for unit in s.encode_utf16() { + out.extend_from_slice(&unit.to_be_bytes()); + } + out +} + +/// Asset + ExtractContext fixture, mirroring `kebab-parse-image::tests::common`. +pub struct PdfFixture { + pub asset: RawAsset, + workspace_root: PathBuf, + config: ExtractConfig, +} + +impl PdfFixture { + pub fn ctx(&self) -> ExtractContext<'_> { + ExtractContext { + asset: &self.asset, + workspace_root: &self.workspace_root, + config: &self.config, + } + } +} + +pub fn fixture_for(workspace_path: &str, bytes: &[u8]) -> PdfFixture { + let blake = blake3::hash(bytes); + let full_hex = blake.to_hex().to_string(); + let asset_id = kebab_core::id_for_asset(&full_hex); + let workspace_path = WorkspacePath::new(workspace_path.to_string()).unwrap(); + let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(); + let asset = RawAsset { + asset_id, + source_uri: SourceUri::File(PathBuf::from(format!("/tmp/{}", workspace_path.0))), + workspace_path, + media_type: MediaType::Pdf, + byte_len: bytes.len() as u64, + checksum: Checksum(full_hex), + discovered_at, + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/fake"), + sha: Checksum("0".repeat(64)), + }, + }; + PdfFixture { + asset, + workspace_root: PathBuf::from("/tmp/fake-root"), + config: ExtractConfig::default(), + } +} + 
+/// Replace every provenance event timestamp after index 0 (Discovered) +/// with `` so determinism / snapshot tests can compare JSON +/// across runs. Same shape as `kebab-parse-image::tests::common::strip_dynamic_at`. +pub fn strip_dynamic_at(json: &mut serde_json::Value) { + if let Some(events) = json + .get_mut("provenance") + .and_then(|p| p.get_mut("events")) + .and_then(|e| e.as_array_mut()) + { + for (i, ev) in events.iter_mut().enumerate() { + if i > 0 + && let Some(obj) = ev.as_object_mut() + { + obj.insert("at".into(), serde_json::Value::String("".into())); + } + } + } +} diff --git a/crates/kebab-parse-pdf/tests/extractor.rs b/crates/kebab-parse-pdf/tests/extractor.rs new file mode 100644 index 0000000..2704ecb --- /dev/null +++ b/crates/kebab-parse-pdf/tests/extractor.rs @@ -0,0 +1,248 @@ +//! Integration tests for `kebab_parse_pdf::PdfTextExtractor` (P7-1). + +mod common; + +use kebab_core::{Block, Extractor, ProvenanceKind, SourceSpan}; +use kebab_parse_pdf::PdfTextExtractor; +use serde_json::Value; + +use crate::common::{ + InfoDict, build_text_pdf, build_text_pdf_with_info, corrupt_pdf, fixture_for, + make_encrypted_pdf, strip_dynamic_at, utf16be_bom, +}; + +fn paragraph_blocks(doc: &kebab_core::CanonicalDocument) -> Vec<&kebab_core::TextBlock> { + doc.blocks + .iter() + .map(|b| match b { + Block::Paragraph(t) => t, + other => panic!("expected Paragraph, got {other:?}"), + }) + .collect() +} + +#[test] +fn three_page_pdf_emits_one_paragraph_block_per_page() { + let bytes = build_text_pdf(&[ + Some("Hello page 1"), + Some("Hello page 2"), + Some("Hello page 3"), + ]); + let fx = fixture_for("docs/three.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("3-page extraction must succeed"); + + assert_eq!(doc.title, "three"); + assert_eq!(doc.lang.0, "und"); + assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION); + assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(3.into())); + + let 
blocks = paragraph_blocks(&doc); + assert_eq!(blocks.len(), 3); + for (i, b) in blocks.iter().enumerate() { + let want_page = (i as u32) + 1; + match b.common.source_span { + SourceSpan::Page { + page, + char_start, + char_end, + } => { + assert_eq!(page, want_page); + assert_eq!(char_start, Some(0)); + let chars = b.text.chars().count() as u32; + assert_eq!(char_end, Some(chars)); + } + ref other => panic!("expected Page span, got {other:?}"), + } + assert!( + b.text.contains(&format!("Hello page {want_page}")), + "page {want_page} text mismatch: {:?}", + b.text + ); + } +} + +#[test] +fn empty_page_emits_warning_and_empty_paragraph() { + let bytes = build_text_pdf(&[Some("page one text"), None, Some("page three text")]); + let fx = fixture_for("docs/scanned-mixed.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("scanned-mixed extraction must succeed"); + + let blocks = paragraph_blocks(&doc); + assert_eq!(blocks.len(), 3); + assert!(blocks[1].text.is_empty(), "page 2 should have empty text"); + assert!( + blocks[1].inlines.is_empty(), + "page 2 inlines should be empty" + ); + match blocks[1].common.source_span { + SourceSpan::Page { + page, + char_start, + char_end, + } => { + assert_eq!(page, 2); + assert_eq!(char_start, Some(0)); + assert_eq!(char_end, Some(0)); + } + ref other => panic!("expected Page, got {other:?}"), + } + + let warnings: Vec<_> = doc + .provenance + .events + .iter() + .filter(|e| e.kind == ProvenanceKind::Warning) + .collect(); + assert_eq!(warnings.len(), 1, "exactly one warning for the empty page"); + assert!( + warnings[0] + .note + .as_deref() + .unwrap_or("") + .contains("page2 empty (scanned candidate)"), + "warning note must mark page 2 as scanned candidate: {:?}", + warnings[0].note + ); +} + +#[test] +fn encrypted_pdf_returns_helpful_error() { + let bytes = make_encrypted_pdf(); + let fx = fixture_for("docs/encrypted.pdf", &bytes); + let err = PdfTextExtractor::new() + .extract(&fx.ctx(), 
&bytes) + .expect_err("encrypted PDF must be refused"); + let msg = format!("{err:#}"); + assert!( + msg.contains("encrypted"), + "error must mention encryption: {msg}" + ); + assert!( + msg.contains("qpdf") || msg.contains("decrypt"), + "error should point at remediation: {msg}" + ); +} + +#[test] +fn corrupt_header_returns_error() { + let bytes = corrupt_pdf(); + let fx = fixture_for("docs/corrupt.pdf", &bytes); + let err = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect_err("corrupt PDF must error"); + let msg = format!("{err:#}"); + assert!( + msg.to_lowercase().contains("pdf") || msg.contains("parse"), + "error must mention PDF parse failure: {msg}" + ); +} + +#[test] +fn page_count_matches_actual_count() { + let bytes = build_text_pdf(&[Some("a"), Some("b"), Some("c"), Some("d"), Some("e")]); + let fx = fixture_for("docs/five.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("5-page extraction must succeed"); + + assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(5.into())); + assert_eq!(doc.blocks.len(), 5); +} + +#[test] +fn info_dict_title_utf16be_bom_decoded() { + // Korean Title encoded as UTF-16BE with BOM is the standard PDF + // path for any non-ASCII metadata. We don't try to decode the + // body text in non-Latin scripts here (CID font support is out + // of scope for v1) — but the metadata path is in scope. 
+ let info = InfoDict { + title: Some(utf16be_bom("케밥 문서")), + producer: Some("kebab-test"), + creator: None, + }; + let bytes = build_text_pdf_with_info(&[Some("body")], &info); + let fx = fixture_for("docs/korean-title.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("PDF with UTF-16BE Title must extract"); + + assert_eq!(doc.title, "케밥 문서"); + assert_eq!( + doc.metadata.user["pdf"]["producer"], + Value::String("kebab-test".into()) + ); +} + +#[test] +fn info_dict_title_falls_back_to_filename_when_missing() { + let bytes = build_text_pdf(&[Some("body")]); + let fx = fixture_for("docs/no-info.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("no-info PDF must extract"); + assert_eq!(doc.title, "no-info"); +} + +#[test] +fn determinism_identical_bytes_produce_identical_documents() { + let bytes = build_text_pdf(&[Some("alpha"), Some("beta"), Some("gamma")]); + let fx = fixture_for("docs/det.pdf", &bytes); + + let mut a = serde_json::to_value( + PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("first extract"), + ) + .unwrap(); + let mut b = serde_json::to_value( + PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("second extract"), + ) + .unwrap(); + + strip_dynamic_at(&mut a); + strip_dynamic_at(&mut b); + assert_eq!(a, b, "two extracts of identical bytes must be byte-equal"); +} + +#[test] +fn snapshot_three_page_canonical_document_stable() { + let bytes = build_text_pdf(&[Some("p1"), Some("p2"), Some("p3")]); + let fx = fixture_for("docs/snapshot.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("snapshot extract"); + let mut json = serde_json::to_value(&doc).unwrap(); + strip_dynamic_at(&mut json); + + // Spot-check the load-bearing shape rather than committing a full + // golden file (the full JSON contains BLAKE3 ids that would + // change if `id_from(...)`'s tuple shape ever shifts — that would + // be a 
separate, intentional break). + assert_eq!(json["parser_version"], Value::String("pdf-text-v1".into())); + assert_eq!(json["lang"], Value::String("und".into())); + assert_eq!(json["schema_version"], Value::Number(1.into())); + assert_eq!(json["doc_version"], Value::Number(1.into())); + assert_eq!(json["blocks"].as_array().unwrap().len(), 3); + for (i, block) in json["blocks"].as_array().unwrap().iter().enumerate() { + assert_eq!(block["kind"], Value::String("paragraph".into())); + assert_eq!( + block["common"]["source_span"]["kind"], + Value::String("page".into()) + ); + assert_eq!( + block["common"]["source_span"]["page"], + Value::Number(((i as u64) + 1).into()) + ); + } + assert_eq!(json["metadata"]["source_type"], Value::String("paper".into())); + assert_eq!( + json["metadata"]["trust_level"], + Value::String("primary".into()) + ); +} diff --git a/tasks/p7/p7-1-pdf-text-extractor.md b/tasks/p7/p7-1-pdf-text-extractor.md index 518c2d0..b443f9d 100644 --- a/tasks/p7/p7-1-pdf-text-extractor.md +++ b/tasks/p7/p7-1-pdf-text-extractor.md @@ -3,7 +3,7 @@ phase: P7 component: kebab-parse-pdf (text extractor) task_id: p7-1 title: "Text PDF extractor → CanonicalDocument with page-level blocks" -status: planned +status: completed depends_on: [p0-1, p1-6] unblocks: [p7-2] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md -- 2.49.1 From 8de08cf38c468df9ef5ee989f073c54501617400 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 08:40:40 +0000 Subject: [PATCH 2/2] =?UTF-8?q?review(p7-1):=20=ED=9A=8C=EC=B0=A8=201=20?= =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Cargo.toml: 사용하지 않는 deps 제거 (`kebab-config`, `thiserror`, `pdf-extract`, dev `tempfile` / `serde_json` / `serde`). 특히 `pdf-extract` 가 끌어오던 transitive ~150 crate (pom, postscript, type1-encoding-parser, adobe-cmap-parser, euclid, chrono, md5, linked-hash-map …) 가 모두 사라짐. 
lopdf 만 남음. - info.rs: BOM 없는 PDFDocEncoded Title 디코드 버그 수정. `from_utf8_lossy` 는 0x80–0xFF 를 U+FFFD 로 치환해 "Café" 같은 레거시 타이틀을 망가뜨림. byte → `char` 직접 캐스팅 (Latin-1 디코더) 로 교체. 회귀 테스트 `info_dict_title_pdfdocencoding_latin1_high_bytes_decoded` 추가. - info.rs: 모듈 doc 의 "Latin-1 superset" 부정확 표현 정정 — PDFDocEncoding 은 0x18–0x1F / 0x80–0x9F 영역에서 Latin-1 과 다름. - lib.rs: `saturating_sub(1)` 가 page=0 케이스를 silent 흡수하던 부분에 `debug_assert!` 추가. release 는 saturating fallback 유지 (panic 보다 garbled order 가 운영에 유리). - tests: UTF-16 surrogate pair 커버리지 갭 보완 — 🥙 (U+1F959) 가 포함된 타이틀로 `String::from_utf16_lossy` 의 페어-결합 경로 검증. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 79 +---------------------- crates/kebab-parse-pdf/Cargo.toml | 12 ++-- crates/kebab-parse-pdf/src/info.rs | 24 ++++--- crates/kebab-parse-pdf/src/lib.rs | 9 ++- crates/kebab-parse-pdf/tests/extractor.rs | 37 +++++++++++ 5 files changed, 64 insertions(+), 97 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04973ac..cfc034c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,15 +24,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" -[[package]] -name = "adobe-cmap-parser" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" -dependencies = [ - "pom", -] - [[package]] name = "ahash" version = "0.8.12" @@ -2123,15 +2114,6 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f" -[[package]] -name = "euclid" -version = "0.20.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" -dependencies = [ - "num-traits", -] - [[package]] name = "event-listener" version = "5.4.1" @@ -3633,14 +3615,9 @@ 
version = "0.1.0" dependencies = [ "anyhow", "blake3", - "kebab-config", "kebab-core", - "lopdf 0.32.0", - "pdf-extract", - "serde", + "lopdf", "serde_json", - "tempfile", - "thiserror 2.0.18", "time", "tracing", ] @@ -4582,24 +4559,6 @@ dependencies = [ "weezl", ] -[[package]] -name = "lopdf" -version = "0.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" -dependencies = [ - "encoding_rs", - "flate2", - "indexmap 2.14.0", - "itoa", - "log", - "md-5", - "nom 7.1.3", - "rangemap", - "time", - "weezl", -] - [[package]] name = "lru" version = "0.12.5" @@ -5350,21 +5309,6 @@ dependencies = [ "stfu8", ] -[[package]] -name = "pdf-extract" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" -dependencies = [ - "adobe-cmap-parser", - "encoding_rs", - "euclid", - "lopdf 0.34.0", - "postscript", - "type1-encoding-parser", - "unicode-normalization", -] - [[package]] name = "percent-encoding" version = "2.3.2" @@ -5468,12 +5412,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "pom" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" - [[package]] name = "portable-atomic" version = "1.13.1" @@ -5489,12 +5427,6 @@ dependencies = [ "portable-atomic", ] -[[package]] -name = "postscript" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" - [[package]] name = "potential_utf" version = "0.1.5" @@ -7615,15 +7547,6 @@ dependencies = [ "rand 0.9.4", ] -[[package]] -name = "type1-encoding-parser" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749" -dependencies = [ - "pom", -] - [[package]] name = "typenum" version = "1.20.0" diff --git a/crates/kebab-parse-pdf/Cargo.toml b/crates/kebab-parse-pdf/Cargo.toml index 3e87673..d71532c 100644 --- a/crates/kebab-parse-pdf/Cargo.toml +++ b/crates/kebab-parse-pdf/Cargo.toml @@ -9,22 +9,18 @@ description = "Text PDF extractor (per-page text + page citation) for the keba [dependencies] kebab-core = { path = "../kebab-core" } -kebab-config = { path = "../kebab-config" } anyhow = { workspace = true } -serde = { workspace = true } serde_json = { workspace = true } time = { workspace = true } tracing = { workspace = true } -thiserror = { workspace = true } # Per-page text extraction. `lopdf::Document::extract_text(&[page])` # is the only stable per-page API across the pdf-extract / lopdf # pair (pdf-extract 0.7 still exposes only whole-document calls). +# pdf-extract is intentionally NOT pulled in here — its ~150 transitive +# crates (pom, postscript, type1-encoding-parser, …) buy us nothing +# at v1 (we don't call its whole-doc API), and the future scanned-PDF +# OCR fallback can re-add it when it actually needs it. lopdf = "0.32" -# Whole-document sanity-check call; covers a few format errors that -# lopdf swallows silently. Per-page text is sourced from lopdf only. -pdf-extract = "0.7" [dev-dependencies] -tempfile = { workspace = true } blake3 = { workspace = true } -serde_json = { workspace = true } diff --git a/crates/kebab-parse-pdf/src/info.rs b/crates/kebab-parse-pdf/src/info.rs index d1b1b86..4b3eb02 100644 --- a/crates/kebab-parse-pdf/src/info.rs +++ b/crates/kebab-parse-pdf/src/info.rs @@ -2,10 +2,15 @@ //! //! PDFs may carry a `/Info` trailer dictionary with `Title`, //! `Producer`, `Creator`, etc. Strings are encoded as either -//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the -//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to -//! UTF-8 lossy. 
All fields are optional — a missing `/Info` dict is -//! not an error. +//! UTF-16BE prefixed with the BOM `0xFE 0xFF` OR PDFDocEncoding +//! (which agrees with Latin-1 over `0x20–0x7E` + `0xA1–0xFF` and +//! diverges in the `0x18–0x1F` / `0x80–0xA0` ranges). We decode +//! BOM'd strings as proper UTF-16BE; non-BOM strings are decoded +//! as Latin-1 (byte → `char`), which is correct for the common +//! ASCII case and a best-effort approximation for the divergent +//! PDFDocEncoding ranges (full PDFDocEncoding tables aren't worth +//! the maintenance for what is effectively legacy metadata). All +//! fields are optional — a missing `/Info` dict is not an error. #[derive(Default)] pub(crate) struct InfoDict { @@ -61,10 +66,11 @@ fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option { } } - // PDFDocEncoding overlaps Latin-1 for the printable range we care - // about, and Latin-1 is byte-identical to UTF-8 only for ASCII; - // `from_utf8_lossy` is the conservative call here. ASCII-only - // PDFs (the common case) round-trip cleanly. - let s = String::from_utf8_lossy(bytes).into_owned(); + // PDFDocEncoding fallback (no BOM). Direct byte → char cast is + // a Latin-1 decoder: ASCII (0x00–0x7F) round-trips, and + // 0xA0–0xFF maps to the matching Unicode code point. `from_utf8_lossy` + // would have replaced 0x80–0xFF with U+FFFD, mangling legacy + // PDFDocEncoded titles like "Café". + let s: String = bytes.iter().map(|&b| b as char).collect(); if s.is_empty() { None } else { Some(s) } } diff --git a/crates/kebab-parse-pdf/src/lib.rs b/crates/kebab-parse-pdf/src/lib.rs index 07d0995..554b04a 100644 --- a/crates/kebab-parse-pdf/src/lib.rs +++ b/crates/kebab-parse-pdf/src/lib.rs @@ -131,8 +131,13 @@ impl Extractor for PdfTextExtractor { char_start: Some(0), char_end: Some(char_count), }; - // ordinal = page - 1; saturating_sub guards the (shouldn't-happen) - // case where lopdf hands back a 0-indexed page key. + // lopdf's `get_pages()` is 1-based by contract.
A 0-key would + // collapse two pages onto the same ordinal (silently breaking + // ordinal-based sorting downstream), so we assert the + // invariant in dev builds. The release fallback still uses + // saturating_sub so a future lopdf regression degrades to + // garbled order rather than panic. + debug_assert!(page_num >= 1, "lopdf get_pages() returned 0-based page key"); let ordinal = page_num.saturating_sub(1); let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span); let common = CommonBlock { diff --git a/crates/kebab-parse-pdf/tests/extractor.rs b/crates/kebab-parse-pdf/tests/extractor.rs index 2704ecb..32ff352 100644 --- a/crates/kebab-parse-pdf/tests/extractor.rs +++ b/crates/kebab-parse-pdf/tests/extractor.rs @@ -177,6 +177,43 @@ fn info_dict_title_utf16be_bom_decoded() { ); } +#[test] +fn info_dict_title_utf16be_surrogate_pair_decoded() { + // 🥙 (U+1F959 STUFFED FLATBREAD) sits in the supplementary plane, + // so encoding it as UTF-16BE produces a surrogate pair (D83E DD59). + // BMP-only inputs would never exercise the pair-joining path of + // `String::from_utf16_lossy` — this asserts that path round-trips. + let info = InfoDict { + title: Some(utf16be_bom("케밥 🥙 문서")), + producer: None, + creator: None, + }; + let bytes = build_text_pdf_with_info(&[Some("body")], &info); + let fx = fixture_for("docs/emoji-title.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("PDF with surrogate-pair Title must extract"); + assert_eq!(doc.title, "케밥 🥙 문서"); +} + +#[test] +fn info_dict_title_pdfdocencoding_latin1_high_bytes_decoded() { + // BOM-less PDFDocEncoded title with a high-byte char (0xE9 = 'é'). + // `from_utf8_lossy` would have replaced this with U+FFFD; the + // byte-as-char path keeps it intact. 
+ let info = InfoDict { + title: Some(b"Caf\xE9".to_vec()), + producer: None, + creator: None, + }; + let bytes = build_text_pdf_with_info(&[Some("body")], &info); + let fx = fixture_for("docs/cafe-title.pdf", &bytes); + let doc = PdfTextExtractor::new() + .extract(&fx.ctx(), &bytes) + .expect("PDF with Latin-1 Title must extract"); + assert_eq!(doc.title, "Café"); +} + #[test] fn info_dict_title_falls_back_to_filename_when_missing() { let bytes = build_text_pdf(&[Some("body")]); -- 2.49.1