feat(kebab-parse-pdf): P7-1 text PDF extractor — per-page CanonicalDocument
`PdfTextExtractor`(MediaType::Pdf) lopdf 기반 per-page 텍스트 추출.
페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }`
emit. 본문이 비거나 추출 panic 인 페이지는 빈 paragraph + `Provenance::Warning`
("scanned candidate") 로 표시 — 이후 OCR fallback (별도 task) 의 입력.
핵심 동작:
- `lopdf::Document::load_mem` + `is_encrypted()` → 암호화 PDF 는 명시 에러
(`qpdf --decrypt` 안내).
- 페이지 단위 `extract_text(&[page])` 를 `catch_unwind` 로 감싸 malformed
page panic 을 recoverable warning 으로 변환.
- `/Info` dict 에서 Title/Producer/Creator best-effort 추출. UTF-16BE BOM
prefixed 문자열도 디코드 (한국어 등 non-ASCII Title 정상 처리).
- 9개 통합 테스트: 3-page emit, scanned-mixed warning, encrypted refuse,
corrupt header error, page_count 메타, UTF-16BE Title, filename
fallback, determinism, snapshot.
`parser_version = "pdf-text-v1"`. Allowed deps: `lopdf 0.32` + `pdf-extract 0.7`
(원본 spec 그대로). 본문 다국어 OCR fallback 은 §9.2 후속 task (out of scope).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
121
Cargo.lock
generated
121
Cargo.lock
generated
@@ -24,6 +24,15 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "adobe-cmap-parser"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
@@ -2114,6 +2123,15 @@ version = "1.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"
|
||||
|
||||
[[package]]
|
||||
name = "euclid"
|
||||
version = "0.20.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.4.1"
|
||||
@@ -3609,6 +3627,24 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"lopdf 0.32.0",
|
||||
"pdf-extract",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-types"
|
||||
version = "0.1.0"
|
||||
@@ -4466,6 +4502,12 @@ dependencies = [
|
||||
"include_dir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linked-hash-map"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.15"
|
||||
@@ -4521,6 +4563,43 @@ dependencies = [
|
||||
"imgref",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.32.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"itoa",
|
||||
"linked-hash-map",
|
||||
"log",
|
||||
"md5",
|
||||
"nom 7.1.3",
|
||||
"rayon",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.14.0",
|
||||
"itoa",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom 7.1.3",
|
||||
"rangemap",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.5"
|
||||
@@ -4639,6 +4718,12 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "md5"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measure_time"
|
||||
version = "0.9.0"
|
||||
@@ -5265,6 +5350,21 @@ dependencies = [
|
||||
"stfu8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdf-extract"
|
||||
version = "0.7.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
|
||||
dependencies = [
|
||||
"adobe-cmap-parser",
|
||||
"encoding_rs",
|
||||
"euclid",
|
||||
"lopdf 0.34.0",
|
||||
"postscript",
|
||||
"type1-encoding-parser",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.2"
|
||||
@@ -5368,6 +5468,12 @@ dependencies = [
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pom"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.13.1"
|
||||
@@ -5383,6 +5489,12 @@ dependencies = [
|
||||
"portable-atomic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postscript"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.5"
|
||||
@@ -7503,6 +7615,15 @@ dependencies = [
|
||||
"rand 0.9.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "type1-encoding-parser"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.20.0"
|
||||
|
||||
@@ -20,6 +20,7 @@ members = [
|
||||
"crates/kebab-cli",
|
||||
"crates/kebab-eval",
|
||||
"crates/kebab-parse-image",
|
||||
"crates/kebab-parse-pdf",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
30
crates/kebab-parse-pdf/Cargo.toml
Normal file
30
crates/kebab-parse-pdf/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kebab-parse-pdf"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
|
||||
# is the only stable per-page API across the pdf-extract / lopdf
|
||||
# pair (pdf-extract 0.7 still exposes only whole-document calls).
|
||||
lopdf = "0.32"
|
||||
# Declared because the P7-1 spec lists it as an allowed dependency. The crate
# does not call it yet: `src/lib.rs` parses and extracts text with lopdf only.
|
||||
pdf-extract = "0.7"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
70
crates/kebab-parse-pdf/src/info.rs
Normal file
70
crates/kebab-parse-pdf/src/info.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
//! `/Info` dictionary extraction (best-effort).
|
||||
//!
|
||||
//! PDFs may carry a `/Info` trailer dictionary with `Title`,
|
||||
//! `Producer`, `Creator`, etc. Strings are encoded as either
|
||||
//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
|
||||
//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
|
||||
//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
|
||||
//! not an error.
|
||||
|
||||
/// Best-effort snapshot of the string entries read from a PDF `/Info`
/// dictionary.
///
/// Every field is optional; `Default` yields the "nothing found" value with
/// all fields set to `None`.
#[derive(Default)]
pub(crate) struct InfoDict {
    /// Decoded `/Title` entry, if one was found.
    pub title: Option<String>,
    /// Decoded `/Producer` entry, if one was found.
    pub producer: Option<String>,
    /// Decoded `/Creator` entry, if one was found.
    pub creator: Option<String>,
}
|
||||
|
||||
pub(crate) fn extract_info(doc: &lopdf::Document) -> InfoDict {
|
||||
let mut out = InfoDict::default();
|
||||
|
||||
let info_obj = match doc.trailer.get(b"Info") {
|
||||
Ok(o) => o,
|
||||
Err(_) => return out,
|
||||
};
|
||||
|
||||
let dict = match info_obj {
|
||||
lopdf::Object::Dictionary(d) => Some(d),
|
||||
lopdf::Object::Reference(id) => doc
|
||||
.get_object(*id)
|
||||
.ok()
|
||||
.and_then(|o| o.as_dict().ok()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let Some(dict) = dict else { return out };
|
||||
|
||||
out.title = pdf_string(dict, b"Title");
|
||||
out.producer = pdf_string(dict, b"Producer");
|
||||
out.creator = pdf_string(dict, b"Creator");
|
||||
out
|
||||
}
|
||||
|
||||
/// Looks up `key` in `dict` and decodes the value as a PDF text string.
///
/// Returns `None` when the key is absent, the value is not a string object,
/// or the decoded text is empty.
fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    let lopdf::Object::String(bytes, _) = dict.get(key).ok()? else {
        return None;
    };
    let text = decode_text_string(bytes);
    if text.is_empty() { None } else { Some(text) }
}

/// Decodes the raw bytes of a PDF text string.
///
/// Encodings are tried in this order:
/// 1. UTF-16BE announced by the BOM `FE FF`. Complete code units are decoded
///    lossily; a dangling trailing byte is ignored. A BOM with no payload
///    decodes to the empty string instead of falling through to the byte
///    fallback (which used to surface the BOM itself as `"\u{FFFD}\u{FFFD}"`).
/// 2. UTF-8 announced by the BOM `EF BB BF` (PDF 2.0); the BOM is dropped.
/// 3. No BOM: bytes that are valid UTF-8 (ASCII included) are taken verbatim,
///    so every input that previously decoded cleanly still decodes to the
///    same string. Anything else is treated as PDFDocEncoding, so Latin
///    titles such as `Caf\xE9` no longer degrade to U+FFFD.
fn decode_text_string(bytes: &[u8]) -> String {
    if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    if let Some(payload) = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]) {
        return String::from_utf8_lossy(payload).into_owned();
    }

    match std::str::from_utf8(bytes) {
        Ok(text) => text.to_owned(),
        Err(_) => bytes.iter().map(|&b| pdf_doc_char(b)).collect(),
    }
}

/// Maps one PDFDocEncoding byte to its Unicode scalar.
///
/// PDFDocEncoding matches Latin-1 except for 0x80..=0xA0, which carries
/// typographic characters, and the unassigned codes 0x9F and 0xAD, which map
/// to U+FFFD here. Bytes below 0x80 are passed through unchanged.
fn pdf_doc_char(byte: u8) -> char {
    // Code points for bytes 0x80..=0xA0, in byte order.
    const HIGH: [char; 33] = [
        '\u{2022}', '\u{2020}', '\u{2021}', '\u{2026}', '\u{2014}', '\u{2013}', '\u{0192}',
        '\u{2044}', '\u{2039}', '\u{203A}', '\u{2212}', '\u{2030}', '\u{201E}', '\u{201C}',
        '\u{201D}', '\u{2018}', '\u{2019}', '\u{201A}', '\u{2122}', '\u{FB01}', '\u{FB02}',
        '\u{0141}', '\u{0152}', '\u{0160}', '\u{0178}', '\u{017D}', '\u{0131}', '\u{0142}',
        '\u{0153}', '\u{0161}', '\u{017E}', '\u{FFFD}', '\u{20AC}',
    ];
    match byte {
        0x80..=0xA0 => HIGH[usize::from(byte - 0x80)],
        0xAD => char::REPLACEMENT_CHARACTER,
        _ => char::from(byte),
    }
}
|
||||
228
crates/kebab-parse-pdf/src/lib.rs
Normal file
228
crates/kebab-parse-pdf/src/lib.rs
Normal file
@@ -0,0 +1,228 @@
|
||||
//! `kebab-parse-pdf` — text PDF extractor (P7-1).
|
||||
//!
|
||||
//! Implements [`kebab_core::Extractor`] for [`MediaType::Pdf`]. Extracts
|
||||
//! text page-by-page via `lopdf`'s per-page API and emits one
|
||||
//! [`Block::Paragraph`] per page with [`SourceSpan::Page`] (1-based page,
|
||||
//! `char_start = 0`, `char_end = chars().count()`).
|
||||
//!
|
||||
//! Pages where text extraction fails or returns empty get an empty
|
||||
//! `Block::Paragraph` plus a `Provenance::Warning` flagging the page as
|
||||
//! a "scanned candidate" — out-of-scope OCR fallback can pick those up.
|
||||
//!
|
||||
//! Scope is intentionally narrow: page text + page numbers. Layout
|
||||
//! reconstruction (multi-column reading order, tables, math), form
|
||||
//! fields, bookmarks, and OCR for scanned PDFs are explicitly **not**
|
||||
//! in this task. See `tasks/p7/p7-1-pdf-text-extractor.md`.
|
||||
//!
|
||||
//! Per design §3.4 (`SourceSpan::Page` / `Block::Paragraph`),
|
||||
//! §9.2 (PDF text extraction), §9 versioning.
|
||||
|
||||
mod info;
|
||||
mod page_text;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Extractor, Inline, Lang, MediaType, Metadata,
|
||||
ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TextBlock,
|
||||
TrustLevel, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::{Map, Value};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
pub const PARSER_VERSION: &str = "pdf-text-v1";
|
||||
|
||||
/// Extractor for text-bearing PDFs.
///
/// Page text comes from `lopdf::Document::extract_text`, called one page at
/// a time; pdf-extract 0.7 only offers whole-document calls, so it is not
/// used for per-page text. The type carries no state.
#[derive(Default)]
pub struct PdfTextExtractor;

impl PdfTextExtractor {
    /// Creates the extractor; equivalent to [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
|
||||
|
||||
impl Extractor for PdfTextExtractor {
|
||||
fn supports(&self, m: &MediaType) -> bool {
|
||||
matches!(m, MediaType::Pdf)
|
||||
}
|
||||
|
||||
fn parser_version(&self) -> ParserVersion {
|
||||
ParserVersion(PARSER_VERSION.to_string())
|
||||
}
|
||||
|
||||
fn extract(
|
||||
&self,
|
||||
ctx: &kebab_core::ExtractContext<'_>,
|
||||
bytes: &[u8],
|
||||
) -> Result<CanonicalDocument> {
|
||||
let asset = ctx.asset;
|
||||
if !self.supports(&asset.media_type) {
|
||||
anyhow::bail!(
|
||||
"kebab-parse-pdf: unsupported media_type for PdfTextExtractor: {:?}",
|
||||
asset.media_type
|
||||
);
|
||||
}
|
||||
|
||||
let parser_version = self.parser_version();
|
||||
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version);
|
||||
|
||||
// Catastrophic-decode guard via lopdf. `pdf-extract` is intentionally
|
||||
// not used for parsing here — it only exposes whole-doc text and
|
||||
// would re-parse the bytes a second time.
|
||||
let pdf_doc = lopdf::Document::load_mem(bytes)
|
||||
.context("kebab-parse-pdf: failed to parse PDF (corrupt header or not a PDF)")?;
|
||||
|
||||
if pdf_doc.is_encrypted() {
|
||||
anyhow::bail!(
|
||||
"kebab-parse-pdf: encrypted PDF; remove encryption (e.g. `qpdf --decrypt`) before ingest"
|
||||
);
|
||||
}
|
||||
|
||||
let info = info::extract_info(&pdf_doc);
|
||||
// `get_pages()` returns BTreeMap<u32, ObjectId> with 1-based page
|
||||
// numbers. We iterate keys in BTreeMap natural order so output is
|
||||
// deterministic.
|
||||
let pages = pdf_doc.get_pages();
|
||||
let page_count = pages.len() as u32;
|
||||
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let mut events: Vec<ProvenanceEvent> = Vec::with_capacity(2 + pages.len());
|
||||
events.push(ProvenanceEvent {
|
||||
at: asset.discovered_at,
|
||||
agent: "kb-source-fs".to_string(),
|
||||
kind: ProvenanceKind::Discovered,
|
||||
note: None,
|
||||
});
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Parsed,
|
||||
note: Some(format!(
|
||||
"parser_version={}; page_count={}",
|
||||
parser_version.0, page_count
|
||||
)),
|
||||
});
|
||||
|
||||
let mut blocks: Vec<Block> = Vec::with_capacity(pages.len());
|
||||
for (&page_num, _) in pages.iter() {
|
||||
let (text, warning) = match page_text::extract_one(&pdf_doc, page_num) {
|
||||
Ok(t) if !t.trim().is_empty() => (t, None),
|
||||
Ok(_) => (
|
||||
String::new(),
|
||||
Some(format!("page{page_num} empty (scanned candidate)")),
|
||||
),
|
||||
Err(e) => (
|
||||
String::new(),
|
||||
Some(format!(
|
||||
"page{page_num} extract failed: {e} (scanned candidate)"
|
||||
)),
|
||||
),
|
||||
};
|
||||
let char_count = text.chars().count() as u32;
|
||||
let span = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(0),
|
||||
char_end: Some(char_count),
|
||||
};
|
||||
// ordinal = page - 1; saturating_sub guards the (shouldn't-happen)
|
||||
// case where lopdf hands back a 0-indexed page key.
|
||||
let ordinal = page_num.saturating_sub(1);
|
||||
let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span,
|
||||
};
|
||||
let inlines = if text.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text { text: text.clone() }]
|
||||
};
|
||||
blocks.push(Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text,
|
||||
inlines,
|
||||
}));
|
||||
if let Some(note) = warning {
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let title = info
|
||||
.title
|
||||
.clone()
|
||||
.filter(|t| !t.trim().is_empty())
|
||||
.unwrap_or_else(|| {
|
||||
let fname = filename_from_workspace_path(&asset.workspace_path.0);
|
||||
strip_extension(&fname)
|
||||
});
|
||||
|
||||
let mut user = Map::new();
|
||||
let mut pdf_meta = Map::new();
|
||||
pdf_meta.insert("page_count".into(), Value::Number(page_count.into()));
|
||||
if let Some(p) = &info.producer {
|
||||
pdf_meta.insert("producer".into(), Value::String(p.clone()));
|
||||
}
|
||||
if let Some(c) = &info.creator {
|
||||
pdf_meta.insert("creator".into(), Value::String(c.clone()));
|
||||
}
|
||||
user.insert("pdf".into(), Value::Object(pdf_meta));
|
||||
|
||||
let metadata = Metadata {
|
||||
aliases: Vec::new(),
|
||||
tags: Vec::new(),
|
||||
created_at: asset.discovered_at,
|
||||
updated_at: asset.discovered_at,
|
||||
source_type: SourceType::Paper,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-parse-pdf",
|
||||
"extracted PDF doc_id={} workspace_path={} pages={}",
|
||||
doc_id.0,
|
||||
asset.workspace_path.0,
|
||||
page_count
|
||||
);
|
||||
|
||||
Ok(CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: asset.asset_id.clone(),
|
||||
workspace_path: asset.workspace_path.clone(),
|
||||
title,
|
||||
lang: Lang("und".to_string()),
|
||||
blocks,
|
||||
metadata,
|
||||
provenance: Provenance { events },
|
||||
parser_version,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the text after the last `/` in `p`, or all of `p` when it
/// contains no `/`. A path ending in `/` yields an empty string.
fn filename_from_workspace_path(p: &str) -> String {
    let tail = match p.rfind('/') {
        Some(slash) => &p[slash + 1..],
        None => p,
    };
    tail.to_owned()
}
|
||||
|
||||
/// Drops the final `.suffix` from `filename`.
///
/// A name with no dot, or whose only dot is the first character (a dotfile
/// such as `.hidden`), is returned unchanged.
fn strip_extension(filename: &str) -> String {
    let stem = match filename.rfind('.') {
        Some(dot) if dot > 0 => &filename[..dot],
        _ => filename,
    };
    stem.to_owned()
}
|
||||
13
crates/kebab-parse-pdf/src/page_text.rs
Normal file
13
crates/kebab-parse-pdf/src/page_text.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
//! Per-page text extraction. `lopdf::Document::extract_text(&[page])`
|
||||
//! is the call we lean on; it has a thin history of panicking on
|
||||
//! malformed pages, so we wrap it in `catch_unwind` to convert the
|
||||
//! panic into a recoverable `Err` (which the caller maps to an empty
|
||||
//! page + Warning).
|
||||
|
||||
use std::panic::{AssertUnwindSafe, catch_unwind};
|
||||
|
||||
pub(crate) fn extract_one(doc: &lopdf::Document, page: u32) -> anyhow::Result<String> {
|
||||
let result = catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page])))
|
||||
.map_err(|_| anyhow::anyhow!("panic during lopdf::Document::extract_text"))?;
|
||||
result.map_err(|e| anyhow::anyhow!("lopdf extract_text error: {e}"))
|
||||
}
|
||||
224
crates/kebab-parse-pdf/tests/common/mod.rs
Normal file
224
crates/kebab-parse-pdf/tests/common/mod.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
//! Test fixture builders for `kebab-parse-pdf`.
|
||||
//!
|
||||
//! PDFs are constructed in-memory at test time via `lopdf` rather than
|
||||
//! committed as binary fixtures. Same rationale as
|
||||
//! `kebab-parse-image::tests::common`: fixture provenance is auditable
|
||||
//! from source, no `include_bytes!` paths to keep in sync, and the test
|
||||
//! binary stays self-contained.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ExtractConfig, ExtractContext, MediaType, RawAsset, SourceUri,
|
||||
WorkspacePath,
|
||||
};
|
||||
use lopdf::content::{Content, Operation};
|
||||
use lopdf::{Document, Object, Stream, dictionary};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Optional `/Info` entries a fixture PDF should carry. Fields left as
/// `None` are not written.
#[derive(Default, Clone)]
pub struct InfoDict {
    /// Raw `/Title` bytes, stored as-is so the caller chooses between
    /// PDFDocEncoding and UTF-16BE.
    pub title: Option<Vec<u8>>,
    /// `/Producer` text, written as its UTF-8 bytes.
    pub producer: Option<&'static str>,
    /// `/Creator` text, written as its UTF-8 bytes.
    pub creator: Option<&'static str>,
}
|
||||
|
||||
/// Build a Helvetica-text PDF. `pages` is one entry per page; `None`
|
||||
/// means the page exists in `/Pages` but has no `/Contents` stream
|
||||
/// (the "scanned candidate" shape — `extract_text` returns empty).
|
||||
pub fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
build_text_pdf_with_info(pages, &InfoDict::default())
|
||||
}
|
||||
|
||||
pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec<u8> {
|
||||
let mut doc = Document::with_version("1.5");
|
||||
let pages_id = doc.new_object_id();
|
||||
let font_id = doc.add_object(dictionary! {
|
||||
"Type" => "Font",
|
||||
"Subtype" => "Type1",
|
||||
"BaseFont" => "Helvetica",
|
||||
});
|
||||
let resources_id = doc.add_object(dictionary! {
|
||||
"Font" => dictionary! { "F1" => font_id },
|
||||
});
|
||||
|
||||
let mut page_refs: Vec<Object> = Vec::new();
|
||||
for page in pages {
|
||||
let mut page_dict = dictionary! {
|
||||
"Type" => "Page",
|
||||
"Parent" => pages_id,
|
||||
};
|
||||
if let Some(text) = page {
|
||||
let content = Content {
|
||||
operations: vec![
|
||||
Operation::new("BT", vec![]),
|
||||
Operation::new("Tf", vec!["F1".into(), 24.into()]),
|
||||
Operation::new(
|
||||
"Td",
|
||||
vec![Object::Integer(100), Object::Integer(700)],
|
||||
),
|
||||
Operation::new("Tj", vec![Object::string_literal(*text)]),
|
||||
Operation::new("ET", vec![]),
|
||||
],
|
||||
};
|
||||
let stream_data = content.encode().expect("content encode");
|
||||
let content_id =
|
||||
doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
page_dict.set("Contents", content_id);
|
||||
}
|
||||
let page_id = doc.add_object(page_dict);
|
||||
page_refs.push(page_id.into());
|
||||
}
|
||||
|
||||
let count = page_refs.len() as i64;
|
||||
let pages_dict = dictionary! {
|
||||
"Type" => "Pages",
|
||||
"Kids" => page_refs,
|
||||
"Count" => count,
|
||||
"Resources" => resources_id,
|
||||
"MediaBox" => vec![
|
||||
Object::Integer(0),
|
||||
Object::Integer(0),
|
||||
Object::Integer(595),
|
||||
Object::Integer(842),
|
||||
],
|
||||
};
|
||||
doc.objects
|
||||
.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
"Pages" => pages_id,
|
||||
});
|
||||
doc.trailer.set("Root", catalog_id);
|
||||
|
||||
if info.title.is_some() || info.producer.is_some() || info.creator.is_some() {
|
||||
let mut info_dict = lopdf::Dictionary::new();
|
||||
if let Some(title) = &info.title {
|
||||
info_dict.set(
|
||||
"Title",
|
||||
Object::String(title.clone(), lopdf::StringFormat::Literal),
|
||||
);
|
||||
}
|
||||
if let Some(p) = info.producer {
|
||||
info_dict.set(
|
||||
"Producer",
|
||||
Object::String(p.as_bytes().to_vec(), lopdf::StringFormat::Literal),
|
||||
);
|
||||
}
|
||||
if let Some(c) = info.creator {
|
||||
info_dict.set(
|
||||
"Creator",
|
||||
Object::String(c.as_bytes().to_vec(), lopdf::StringFormat::Literal),
|
||||
);
|
||||
}
|
||||
let info_id = doc.add_object(Object::Dictionary(info_dict));
|
||||
doc.trailer.set("Info", info_id);
|
||||
}
|
||||
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
doc.save_to(&mut out).expect("save PDF to memory");
|
||||
out
|
||||
}
|
||||
|
||||
/// Wrap any valid PDF byte buffer with a fake `/Encrypt` trailer entry
|
||||
/// so `Document::is_encrypted()` flips to true. We don't actually
|
||||
/// encrypt anything — the extractor refuses encrypted PDFs **before**
|
||||
/// touching streams, so the marker is sufficient.
|
||||
pub fn make_encrypted_pdf() -> Vec<u8> {
|
||||
let bytes = build_text_pdf(&[Some("placeholder")]);
|
||||
let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF");
|
||||
let enc_id = doc.add_object(dictionary! {
|
||||
"Filter" => "Standard",
|
||||
"V" => 1,
|
||||
"R" => 2,
|
||||
"Length" => 40,
|
||||
"P" => -4,
|
||||
});
|
||||
doc.trailer.set("Encrypt", enc_id);
|
||||
let mut out = Vec::new();
|
||||
doc.save_to(&mut out).expect("save encrypted PDF");
|
||||
out
|
||||
}
|
||||
|
||||
/// Returns 27 bytes of plain ASCII with no `%PDF-` header, used as input
/// that `Document::load_mem` is expected to reject.
pub fn corrupt_pdf() -> Vec<u8> {
    Vec::from(*b"NOT A PDF; just plain bytes")
}
|
||||
|
||||
/// Encodes `s` as a PDF UTF-16BE text string: the BOM `FE FF` followed by
/// each UTF-16 code unit in big-endian byte order.
///
/// Tests use it to feed a non-ASCII `/Title` to the extractor.
pub fn utf16be_bom(s: &str) -> Vec<u8> {
    let mut encoded: Vec<u8> = vec![0xFE, 0xFF];
    encoded.extend(s.encode_utf16().flat_map(u16::to_be_bytes));
    encoded
}
|
||||
|
||||
/// Asset + ExtractContext fixture, mirroring `kebab-parse-image::tests::common`.
|
||||
pub struct PdfFixture {
|
||||
pub asset: RawAsset,
|
||||
workspace_root: PathBuf,
|
||||
config: ExtractConfig,
|
||||
}
|
||||
|
||||
impl PdfFixture {
|
||||
pub fn ctx(&self) -> ExtractContext<'_> {
|
||||
ExtractContext {
|
||||
asset: &self.asset,
|
||||
workspace_root: &self.workspace_root,
|
||||
config: &self.config,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fixture_for(workspace_path: &str, bytes: &[u8]) -> PdfFixture {
|
||||
let blake = blake3::hash(bytes);
|
||||
let full_hex = blake.to_hex().to_string();
|
||||
let asset_id = kebab_core::id_for_asset(&full_hex);
|
||||
let workspace_path = WorkspacePath::new(workspace_path.to_string()).unwrap();
|
||||
let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
|
||||
let asset = RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(format!("/tmp/{}", workspace_path.0))),
|
||||
workspace_path,
|
||||
media_type: MediaType::Pdf,
|
||||
byte_len: bytes.len() as u64,
|
||||
checksum: Checksum(full_hex),
|
||||
discovered_at,
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("/tmp/fake"),
|
||||
sha: Checksum("0".repeat(64)),
|
||||
},
|
||||
};
|
||||
PdfFixture {
|
||||
asset,
|
||||
workspace_root: PathBuf::from("/tmp/fake-root"),
|
||||
config: ExtractConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Replace every provenance event timestamp after index 0 (Discovered)
|
||||
/// with `<stripped>` so determinism / snapshot tests can compare JSON
|
||||
/// across runs. Same shape as `kebab-parse-image::tests::common::strip_dynamic_at`.
|
||||
pub fn strip_dynamic_at(json: &mut serde_json::Value) {
|
||||
if let Some(events) = json
|
||||
.get_mut("provenance")
|
||||
.and_then(|p| p.get_mut("events"))
|
||||
.and_then(|e| e.as_array_mut())
|
||||
{
|
||||
for (i, ev) in events.iter_mut().enumerate() {
|
||||
if i > 0
|
||||
&& let Some(obj) = ev.as_object_mut()
|
||||
{
|
||||
obj.insert("at".into(), serde_json::Value::String("<stripped>".into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
248
crates/kebab-parse-pdf/tests/extractor.rs
Normal file
248
crates/kebab-parse-pdf/tests/extractor.rs
Normal file
@@ -0,0 +1,248 @@
|
||||
//! Integration tests for `kebab_parse_pdf::PdfTextExtractor` (P7-1).
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_core::{Block, Extractor, ProvenanceKind, SourceSpan};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::common::{
|
||||
InfoDict, build_text_pdf, build_text_pdf_with_info, corrupt_pdf, fixture_for,
|
||||
make_encrypted_pdf, strip_dynamic_at, utf16be_bom,
|
||||
};
|
||||
|
||||
fn paragraph_blocks(doc: &kebab_core::CanonicalDocument) -> Vec<&kebab_core::TextBlock> {
|
||||
doc.blocks
|
||||
.iter()
|
||||
.map(|b| match b {
|
||||
Block::Paragraph(t) => t,
|
||||
other => panic!("expected Paragraph, got {other:?}"),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// A three-page text PDF yields three paragraph blocks whose page spans run
/// 1..=3 and whose `char_end` equals the block text's `char` count.
#[test]
fn three_page_pdf_emits_one_paragraph_block_per_page() {
    let page_texts = [
        Some("Hello page 1"),
        Some("Hello page 2"),
        Some("Hello page 3"),
    ];
    let bytes = build_text_pdf(&page_texts);
    let fx = fixture_for("docs/three.pdf", &bytes);
    let doc = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect("3-page extraction must succeed");

    // No /Info dict in this fixture, so the title comes from the file name.
    assert_eq!(doc.title, "three");
    assert_eq!(doc.lang.0, "und");
    assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION);
    assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(3.into()));

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    for (want_page, block) in (1u32..).zip(blocks.iter()) {
        let SourceSpan::Page {
            page,
            char_start,
            char_end,
        } = block.common.source_span
        else {
            panic!("expected Page span, got {:?}", block.common.source_span);
        };
        assert_eq!(page, want_page);
        assert_eq!(char_start, Some(0));
        assert_eq!(char_end, Some(block.text.chars().count() as u32));

        assert!(
            block.text.contains(&format!("Hello page {want_page}")),
            "page {want_page} text mismatch: {:?}",
            block.text
        );
    }
}
|
||||
|
||||
/// A page without a content stream still gets a (blank) paragraph block and
/// produces exactly one "scanned candidate" warning naming that page.
#[test]
fn empty_page_emits_warning_and_empty_paragraph() {
    let bytes = build_text_pdf(&[Some("page one text"), None, Some("page three text")]);
    let fx = fixture_for("docs/scanned-mixed.pdf", &bytes);
    let doc = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect("scanned-mixed extraction must succeed");

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    let blank = blocks[1];
    assert!(blank.text.is_empty(), "page 2 should have empty text");
    assert!(blank.inlines.is_empty(), "page 2 inlines should be empty");
    match blank.common.source_span {
        SourceSpan::Page {
            page,
            char_start,
            char_end,
        } => {
            assert_eq!(page, 2);
            assert_eq!(char_start, Some(0));
            assert_eq!(char_end, Some(0));
        }
        ref other => panic!("expected Page, got {other:?}"),
    }

    let warning_notes: Vec<Option<&str>> = doc
        .provenance
        .events
        .iter()
        .filter(|event| event.kind == ProvenanceKind::Warning)
        .map(|event| event.note.as_deref())
        .collect();
    assert_eq!(
        warning_notes.len(),
        1,
        "exactly one warning for the empty page"
    );
    assert!(
        warning_notes[0]
            .unwrap_or("")
            .contains("page2 empty (scanned candidate)"),
        "warning note must mark page 2 as scanned candidate: {:?}",
        warning_notes[0]
    );
}
|
||||
|
||||
/// A PDF carrying an `/Encrypt` trailer entry is refused, and the error both
/// says so and hints at how to decrypt.
#[test]
fn encrypted_pdf_returns_helpful_error() {
    let bytes = make_encrypted_pdf();
    let fx = fixture_for("docs/encrypted.pdf", &bytes);
    let outcome = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
    let msg = format!("{:#}", outcome.expect_err("encrypted PDF must be refused"));

    assert!(
        msg.contains("encrypted"),
        "error must mention encryption: {msg}"
    );
    let names_remedy = ["qpdf", "decrypt"].iter().any(|hint| msg.contains(*hint));
    assert!(names_remedy, "error should point at remediation: {msg}");
}
|
||||
|
||||
/// Bytes without a `%PDF-` header must fail at the lopdf parse step.
///
/// The assertion pins the context string attached to `load_mem`. Every error
/// from this crate starts with "kebab-parse-pdf", so merely checking for
/// "pdf" or "parse" would pass for any failure, including unrelated ones.
#[test]
fn corrupt_header_returns_error() {
    let bytes = corrupt_pdf();
    let fx = fixture_for("docs/corrupt.pdf", &bytes);
    let err = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect_err("corrupt PDF must error");
    let msg = format!("{err:#}");
    assert!(
        msg.contains("failed to parse PDF"),
        "error must mention PDF parse failure: {msg}"
    );
}
|
||||
|
||||
#[test]
fn page_count_matches_actual_count() {
    // A five-page input must report `page_count == 5` in the `pdf` metadata
    // and produce exactly five blocks.
    let pages = [Some("a"), Some("b"), Some("c"), Some("d"), Some("e")];
    let raw = build_text_pdf(&pages);
    let fixture = fixture_for("docs/five.pdf", &raw);
    let document = PdfTextExtractor::new()
        .extract(&fixture.ctx(), &raw)
        .expect("5-page extraction must succeed");

    let expected_count = Value::Number(5.into());
    assert_eq!(document.metadata.user["pdf"]["page_count"], expected_count);
    assert_eq!(document.blocks.len(), 5);
}
|
||||
|
||||
#[test]
fn info_dict_title_utf16be_bom_decoded() {
    // Non-ASCII PDF metadata is conventionally stored as UTF-16BE with a
    // leading BOM, so a Korean Title exercises that decode path. Decoding
    // non-Latin *body* text (CID fonts) is out of scope for v1; only the
    // metadata path is checked here.
    let encoded_title = utf16be_bom("케밥 문서");
    let info = InfoDict {
        title: Some(encoded_title),
        producer: Some("kebab-test"),
        creator: None,
    };
    let raw = build_text_pdf_with_info(&[Some("body")], &info);
    let fixture = fixture_for("docs/korean-title.pdf", &raw);
    let document = PdfTextExtractor::new()
        .extract(&fixture.ctx(), &raw)
        .expect("PDF with UTF-16BE Title must extract");

    assert_eq!(document.title, "케밥 문서");

    let expected_producer = Value::String("kebab-test".into());
    assert_eq!(document.metadata.user["pdf"]["producer"], expected_producer);
}
|
||||
|
||||
#[test]
fn info_dict_title_falls_back_to_filename_when_missing() {
    // The PDF comes from `build_text_pdf` (no info argument), so the
    // expected title is the file stem of the fixture path.
    let raw = build_text_pdf(&[Some("body")]);
    let fixture = fixture_for("docs/no-info.pdf", &raw);
    let outcome = PdfTextExtractor::new().extract(&fixture.ctx(), &raw);
    let document = outcome.expect("no-info PDF must extract");

    assert_eq!(document.title, "no-info");
}
|
||||
|
||||
#[test]
fn determinism_identical_bytes_produce_identical_documents() {
    // Extracting the same bytes twice must give the same JSON once
    // `strip_dynamic_at` has been applied to both results.
    let raw = build_text_pdf(&[Some("alpha"), Some("beta"), Some("gamma")]);
    let fixture = fixture_for("docs/det.pdf", &raw);

    // One fresh extractor per run; `label` becomes the panic message if the
    // extraction fails.
    let extract_as_json = |label: &str| {
        let document = PdfTextExtractor::new()
            .extract(&fixture.ctx(), &raw)
            .expect(label);
        serde_json::to_value(document).unwrap()
    };

    let mut first = extract_as_json("first extract");
    let mut second = extract_as_json("second extract");

    strip_dynamic_at(&mut first);
    strip_dynamic_at(&mut second);
    assert_eq!(
        first, second,
        "two extracts of identical bytes must be byte-equal"
    );
}
|
||||
|
||||
#[test]
fn snapshot_three_page_canonical_document_stable() {
    let raw = build_text_pdf(&[Some("p1"), Some("p2"), Some("p3")]);
    let fixture = fixture_for("docs/snapshot.pdf", &raw);
    let document = PdfTextExtractor::new()
        .extract(&fixture.ctx(), &raw)
        .expect("snapshot extract");
    let mut snapshot = serde_json::to_value(&document).unwrap();
    strip_dynamic_at(&mut snapshot);

    // Only the load-bearing shape is checked instead of a full golden file:
    // the complete JSON carries BLAKE3 ids that would change if the tuple
    // shape fed to `id_from(...)` ever shifted, and that would be a
    // separate, intentional break.
    let text = |s: &str| Value::String(s.into());
    let int = |n: u64| Value::Number(n.into());

    assert_eq!(snapshot["parser_version"], text("pdf-text-v1"));
    assert_eq!(snapshot["lang"], text("und"));
    assert_eq!(snapshot["schema_version"], int(1));
    assert_eq!(snapshot["doc_version"], int(1));

    let blocks = snapshot["blocks"].as_array().unwrap();
    assert_eq!(blocks.len(), 3);

    // Every block must be a paragraph carrying a page span, with pages
    // numbered from 1 in block order.
    for (page_no, block) in (1u64..).zip(blocks) {
        assert_eq!(block["kind"], text("paragraph"));

        let span = &block["common"]["source_span"];
        assert_eq!(span["kind"], text("page"));
        assert_eq!(span["page"], int(page_no));
    }

    let metadata = &snapshot["metadata"];
    assert_eq!(metadata["source_type"], text("paper"));
    assert_eq!(metadata["trust_level"], text("primary"));
}
|
||||
@@ -3,7 +3,7 @@ phase: P7
|
||||
component: kebab-parse-pdf (text extractor)
|
||||
task_id: p7-1
|
||||
title: "Text PDF extractor → CanonicalDocument with page-level blocks"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p0-1, p1-6]
|
||||
unblocks: [p7-2]
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
|
||||
Reference in New Issue
Block a user