feat(kebab-parse-pdf): P7-1 text PDF extractor — per-page CanonicalDocument

`PdfTextExtractor`(MediaType::Pdf) lopdf 기반 per-page 텍스트 추출.
페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }`
emit. 본문이 비거나 추출 panic 인 페이지는 빈 paragraph + `Provenance::Warning`
("scanned candidate") 로 표시 — 이후 OCR fallback (별도 task) 의 입력.

핵심 동작:
- `lopdf::Document::load_mem` + `is_encrypted()` → 암호화 PDF 는 명시 에러
  (`qpdf --decrypt` 안내).
- 페이지 단위 `extract_text(&[page])` 를 `catch_unwind` 로 감싸 malformed
  page panic 을 recoverable warning 으로 변환.
- `/Info` dict 에서 Title/Producer/Creator best-effort 추출. UTF-16BE BOM
  prefixed 문자열도 디코드 (한국어 등 non-ASCII Title 정상 처리).
- 9개 통합 테스트: 3-page emit, scanned-mixed warning, encrypted refuse,
  corrupt header error, page_count 메타, UTF-16BE Title, filename
  fallback, determinism, snapshot.

`parser_version = "pdf-text-v1"`. Allowed deps: `lopdf 0.32` + `pdf-extract 0.7`
(원본 spec 그대로). 본문 다국어 OCR fallback 은 §9.2 후속 task (out of scope).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-02 08:34:55 +00:00
parent fd89777c83
commit 5a158d7343
9 changed files with 936 additions and 1 deletions

121
Cargo.lock generated
View File

@@ -24,6 +24,15 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "adobe-cmap-parser"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
dependencies = [
"pom",
]
[[package]]
name = "ahash"
version = "0.8.12"
@@ -2114,6 +2123,15 @@ version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"
[[package]]
name = "euclid"
version = "0.20.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
dependencies = [
"num-traits",
]
[[package]]
name = "event-listener"
version = "5.4.1"
@@ -3609,6 +3627,24 @@ dependencies = [
"tracing",
]
[[package]]
name = "kebab-parse-pdf"
version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"kebab-config",
"kebab-core",
"lopdf 0.32.0",
"pdf-extract",
"serde",
"serde_json",
"tempfile",
"thiserror 2.0.18",
"time",
"tracing",
]
[[package]]
name = "kebab-parse-types"
version = "0.1.0"
@@ -4466,6 +4502,12 @@ dependencies = [
"include_dir",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
@@ -4521,6 +4563,43 @@ dependencies = [
"imgref",
]
[[package]]
name = "lopdf"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b"
dependencies = [
"chrono",
"encoding_rs",
"flate2",
"itoa",
"linked-hash-map",
"log",
"md5",
"nom 7.1.3",
"rayon",
"time",
"weezl",
]
[[package]]
name = "lopdf"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
dependencies = [
"encoding_rs",
"flate2",
"indexmap 2.14.0",
"itoa",
"log",
"md-5",
"nom 7.1.3",
"rangemap",
"time",
"weezl",
]
[[package]]
name = "lru"
version = "0.12.5"
@@ -4639,6 +4718,12 @@ dependencies = [
"digest",
]
[[package]]
name = "md5"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measure_time"
version = "0.9.0"
@@ -5265,6 +5350,21 @@ dependencies = [
"stfu8",
]
[[package]]
name = "pdf-extract"
version = "0.7.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
dependencies = [
"adobe-cmap-parser",
"encoding_rs",
"euclid",
"lopdf 0.34.0",
"postscript",
"type1-encoding-parser",
"unicode-normalization",
]
[[package]]
name = "percent-encoding"
version = "2.3.2"
@@ -5368,6 +5468,12 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "pom"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
[[package]]
name = "portable-atomic"
version = "1.13.1"
@@ -5383,6 +5489,12 @@ dependencies = [
"portable-atomic",
]
[[package]]
name = "postscript"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
[[package]]
name = "potential_utf"
version = "0.1.5"
@@ -7503,6 +7615,15 @@ dependencies = [
"rand 0.9.4",
]
[[package]]
name = "type1-encoding-parser"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
dependencies = [
"pom",
]
[[package]]
name = "typenum"
version = "1.20.0"

View File

@@ -20,6 +20,7 @@ members = [
"crates/kebab-cli",
"crates/kebab-eval",
"crates/kebab-parse-image",
"crates/kebab-parse-pdf",
]
[workspace.package]

View File

@@ -0,0 +1,30 @@
[package]
name = "kebab-parse-pdf"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
[dependencies]
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }
# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
# is the only stable per-page API across the pdf-extract / lopdf
# pair (pdf-extract 0.7 still exposes only whole-document calls).
lopdf = "0.32"
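# NOTE: not called by the crate sources at present — `src/lib.rs` states
# that `pdf-extract` is intentionally not used and that all per-page text
# comes from lopdf. It is listed because the task spec names it as an
# allowed dependency; it pulls a second lopdf (0.34) into Cargo.lock, so
# drop it if no caller materialises.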
pdf-extract = "0.7"
[dev-dependencies]
tempfile = { workspace = true }
blake3 = { workspace = true }
serde_json = { workspace = true }

View File

@@ -0,0 +1,70 @@
//! `/Info` dictionary extraction (best-effort).
//!
//! PDFs may carry a `/Info` trailer dictionary with `Title`,
//! `Producer`, `Creator`, etc. Strings are encoded as either
//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
//! not an error.
/// Best-effort view of a PDF `/Info` dictionary.
///
/// Every field is optional: a field is `None` when the dictionary (or the
/// individual key) is absent or could not be decoded to non-empty text.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub(crate) struct InfoDict {
    /// Decoded `/Title` entry.
    pub title: Option<String>,
    /// Decoded `/Producer` entry.
    pub producer: Option<String>,
    /// Decoded `/Creator` entry.
    pub creator: Option<String>,
}
pub(crate) fn extract_info(doc: &lopdf::Document) -> InfoDict {
let mut out = InfoDict::default();
let info_obj = match doc.trailer.get(b"Info") {
Ok(o) => o,
Err(_) => return out,
};
let dict = match info_obj {
lopdf::Object::Dictionary(d) => Some(d),
lopdf::Object::Reference(id) => doc
.get_object(*id)
.ok()
.and_then(|o| o.as_dict().ok()),
_ => None,
};
let Some(dict) = dict else { return out };
out.title = pdf_string(dict, b"Title");
out.producer = pdf_string(dict, b"Producer");
out.creator = pdf_string(dict, b"Creator");
out
}
/// Look up `key` in `dict` and decode it as a PDF text string.
///
/// Returns `None` when the key is missing, the value is not a string
/// object, or the decoded text is empty.
fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    match dict.get(key).ok()? {
        lopdf::Object::String(bytes, _) => decode_text_string(bytes),
        _ => None,
    }
}

/// Decode the raw bytes of a PDF text string (best-effort).
///
/// Handled encodings, in order:
/// 1. UTF-16BE introduced by the BOM `FE FF`. A trailing odd byte is
///    ignored and unpaired surrogates become U+FFFD.
/// 2. UTF-8 introduced by the BOM `EF BB BF`; the BOM itself is not part
///    of the returned text.
/// 3. No BOM: bytes that are valid UTF-8 (which includes plain ASCII) are
///    taken as-is. Anything else is treated as a single-byte encoding and
///    mapped byte-for-byte to U+0000..=U+00FF (Latin-1). This approximates
///    PDFDocEncoding, which differs from Latin-1 for a handful of code
///    points, and keeps accented Latin titles readable instead of
///    collapsing every high byte to U+FFFD.
///
/// A string that consists only of a BOM decodes to nothing and yields
/// `None`; it must never surface as replacement-character garbage, because
/// callers use `None` to fall back to other title sources.
fn decode_text_string(bytes: &[u8]) -> Option<String> {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    let text = if let Some(payload) = bytes.strip_prefix(&UTF16_BE_BOM[..]) {
        // `chunks_exact` silently drops a dangling final byte.
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    } else if let Some(payload) = bytes.strip_prefix(&UTF8_BOM[..]) {
        String::from_utf8_lossy(payload).into_owned()
    } else {
        match std::str::from_utf8(bytes) {
            Ok(valid) => valid.to_owned(),
            Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
        }
    };

    if text.is_empty() { None } else { Some(text) }
}

View File

@@ -0,0 +1,228 @@
//! `kebab-parse-pdf` — text PDF extractor (P7-1).
//!
//! Implements [`kebab_core::Extractor`] for [`MediaType::Pdf`]. Extracts
//! text page-by-page via `lopdf`'s per-page API and emits one
//! [`Block::Paragraph`] per page with [`SourceSpan::Page`] (1-based page,
//! `char_start = 0`, `char_end = chars().count()`).
//!
//! Pages where text extraction fails or returns empty get an empty
//! `Block::Paragraph` plus a `Provenance::Warning` flagging the page as
//! a "scanned candidate" — out-of-scope OCR fallback can pick those up.
//!
//! Scope is intentionally narrow: page text + page numbers. Layout
//! reconstruction (multi-column reading order, tables, math), form
//! fields, bookmarks, and OCR for scanned PDFs are explicitly **not**
//! in this task. See `tasks/p7/p7-1-pdf-text-extractor.md`.
//!
//! Per design §3.4 (`SourceSpan::Page` / `Block::Paragraph`),
//! §9.2 (PDF text extraction), §9 versioning.
mod info;
mod page_text;
use anyhow::{Context, Result};
use kebab_core::{
Block, CanonicalDocument, CommonBlock, Extractor, Inline, Lang, MediaType, Metadata,
ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TextBlock,
TrustLevel, id_for_block, id_for_doc,
};
use serde_json::{Map, Value};
use time::OffsetDateTime;
/// Parser-version tag of this extractor. `PdfTextExtractor::parser_version`
/// returns it wrapped in a `ParserVersion`, and it is echoed in the note of
/// the `Parsed` provenance event.
pub const PARSER_VERSION: &str = "pdf-text-v1";
/// Text-PDF extractor. Per-page text via `lopdf::Document::extract_text`
/// (the only stable per-page API in the lopdf / pdf-extract pair —
/// pdf-extract 0.7 only exposes whole-document calls).
///
/// The type is a stateless unit struct, so it is `Copy` and its `Default`
/// value is identical to [`PdfTextExtractor::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfTextExtractor;

impl PdfTextExtractor {
    /// Create an extractor. Equivalent to `PdfTextExtractor::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
impl Extractor for PdfTextExtractor {
/// Returns `true` only for [`MediaType::Pdf`].
fn supports(&self, m: &MediaType) -> bool {
matches!(m, MediaType::Pdf)
}
/// Returns [`PARSER_VERSION`] wrapped in a [`ParserVersion`].
fn parser_version(&self) -> ParserVersion {
ParserVersion(PARSER_VERSION.to_string())
}
/// Parse `bytes` as a PDF and emit one `Block::Paragraph` per page.
///
/// # Errors
///
/// Fails when the asset's media type is not supported, when lopdf cannot
/// load the bytes, or when the document reports itself as encrypted.
/// A failure on an individual page is *not* an error: that page gets an
/// empty paragraph plus a `Warning` provenance event.
fn extract(
&self,
ctx: &kebab_core::ExtractContext<'_>,
bytes: &[u8],
) -> Result<CanonicalDocument> {
let asset = ctx.asset;
if !self.supports(&asset.media_type) {
anyhow::bail!(
"kebab-parse-pdf: unsupported media_type for PdfTextExtractor: {:?}",
asset.media_type
);
}
let parser_version = self.parser_version();
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version);
// Catastrophic-decode guard via lopdf. `pdf-extract` is intentionally
// not used for parsing here — it only exposes whole-doc text and
// would re-parse the bytes a second time.
let pdf_doc = lopdf::Document::load_mem(bytes)
.context("kebab-parse-pdf: failed to parse PDF (corrupt header or not a PDF)")?;
// Refuse encrypted input up front, before any page is touched.
if pdf_doc.is_encrypted() {
anyhow::bail!(
"kebab-parse-pdf: encrypted PDF; remove encryption (e.g. `qpdf --decrypt`) before ingest"
);
}
let info = info::extract_info(&pdf_doc);
// `get_pages()` returns BTreeMap<u32, ObjectId> with 1-based page
// numbers. We iterate keys in BTreeMap natural order so output is
// deterministic.
let pages = pdf_doc.get_pages();
// NOTE(review): `as` cast would truncate silently above u32::MAX pages;
// presumably unreachable because the map is keyed by u32 — confirm.
let page_count = pages.len() as u32;
// One wall-clock timestamp shared by the `Parsed` event and every
// per-page warning emitted below.
let now = OffsetDateTime::now_utc();
// Capacity: Discovered + Parsed + at most one warning per page.
let mut events: Vec<ProvenanceEvent> = Vec::with_capacity(2 + pages.len());
events.push(ProvenanceEvent {
at: asset.discovered_at,
agent: "kb-source-fs".to_string(),
kind: ProvenanceKind::Discovered,
note: None,
});
events.push(ProvenanceEvent {
at: now,
agent: "kb-parse-pdf".to_string(),
kind: ProvenanceKind::Parsed,
note: Some(format!(
"parser_version={}; page_count={}",
parser_version.0, page_count
)),
});
let mut blocks: Vec<Block> = Vec::with_capacity(pages.len());
for (&page_num, _) in pages.iter() {
// Three outcomes per page: usable text, whitespace-only/empty text,
// or an extraction error. The latter two both produce empty text and
// a warning note.
let (text, warning) = match page_text::extract_one(&pdf_doc, page_num) {
Ok(t) if !t.trim().is_empty() => (t, None),
Ok(_) => (
String::new(),
Some(format!("page{page_num} empty (scanned candidate)")),
),
Err(e) => (
String::new(),
Some(format!(
"page{page_num} extract failed: {e} (scanned candidate)"
)),
),
};
// Span length is measured in Unicode scalar values of the (untrimmed)
// page text, not in bytes.
let char_count = text.chars().count() as u32;
let span = SourceSpan::Page {
page: page_num,
char_start: Some(0),
char_end: Some(char_count),
};
// ordinal = page - 1; saturating_sub guards the (shouldn't-happen)
// case where lopdf hands back a 0-indexed page key.
let ordinal = page_num.saturating_sub(1);
let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
let common = CommonBlock {
block_id,
heading_path: Vec::new(),
source_span: span,
};
// An empty page carries no inline at all rather than an empty
// `Inline::Text`.
let inlines = if text.is_empty() {
Vec::new()
} else {
vec![Inline::Text { text: text.clone() }]
};
blocks.push(Block::Paragraph(TextBlock {
common,
text,
inlines,
}));
// Warnings are appended in page order, after the page's block.
if let Some(note) = warning {
events.push(ProvenanceEvent {
at: now,
agent: "kb-parse-pdf".to_string(),
kind: ProvenanceKind::Warning,
note: Some(note),
});
}
}
// Title: the `/Info` Title when it is not blank (used as-is, without
// trimming); otherwise the workspace file name minus its extension.
let title = info
.title
.clone()
.filter(|t| !t.trim().is_empty())
.unwrap_or_else(|| {
let fname = filename_from_workspace_path(&asset.workspace_path.0);
strip_extension(&fname)
});
// PDF-specific metadata lives under the `pdf` key of `metadata.user`;
// `producer` / `creator` are present only when `/Info` supplied them.
let mut user = Map::new();
let mut pdf_meta = Map::new();
pdf_meta.insert("page_count".into(), Value::Number(page_count.into()));
if let Some(p) = &info.producer {
pdf_meta.insert("producer".into(), Value::String(p.clone()));
}
if let Some(c) = &info.creator {
pdf_meta.insert("creator".into(), Value::String(c.clone()));
}
user.insert("pdf".into(), Value::Object(pdf_meta));
// Both timestamps come from the asset's discovery time; nothing is read
// from the PDF itself here.
let metadata = Metadata {
aliases: Vec::new(),
tags: Vec::new(),
created_at: asset.discovered_at,
updated_at: asset.discovered_at,
source_type: SourceType::Paper,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user,
};
tracing::debug!(
target: "kebab-parse-pdf",
"extracted PDF doc_id={} workspace_path={} pages={}",
doc_id.0,
asset.workspace_path.0,
page_count
);
// Language is always emitted as "und"; no detection happens in this
// function.
Ok(CanonicalDocument {
doc_id,
source_asset_id: asset.asset_id.clone(),
workspace_path: asset.workspace_path.clone(),
title,
lang: Lang("und".to_string()),
blocks,
metadata,
provenance: Provenance { events },
parser_version,
schema_version: 1,
doc_version: 1,
})
}
}
/// Return the text after the last `/` in `p`, or all of `p` when it
/// contains no `/`. A trailing slash therefore yields an empty string.
fn filename_from_workspace_path(p: &str) -> String {
    let tail = match p.rfind('/') {
        // '/' is one byte wide, so `slash + 1` is always a char boundary.
        Some(slash) => &p[slash + 1..],
        None => p,
    };
    tail.to_owned()
}
/// Drop the final `.suffix` from `filename`.
///
/// Names without a dot, and names whose only dot is the leading character
/// (dotfiles such as `.hidden`), are returned unchanged.
fn strip_extension(filename: &str) -> String {
    let stem = match filename.rsplit_once('.') {
        Some((head, _suffix)) if !head.is_empty() => head,
        _ => filename,
    };
    stem.to_owned()
}

View File

@@ -0,0 +1,13 @@
//! Per-page text extraction. `lopdf::Document::extract_text(&[page])`
//! is the call we lean on; it has a history of panicking on malformed
//! pages, so we wrap it in `catch_unwind` to convert the panic into a
//! recoverable `Err` (which the caller maps to an empty page + Warning).
//! Note that `catch_unwind` only intercepts unwinding panics: in a build
//! with `panic = "abort"` a panicking page still terminates the process.
use std::panic::{AssertUnwindSafe, catch_unwind};
pub(crate) fn extract_one(doc: &lopdf::Document, page: u32) -> anyhow::Result<String> {
let result = catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page])))
.map_err(|_| anyhow::anyhow!("panic during lopdf::Document::extract_text"))?;
result.map_err(|e| anyhow::anyhow!("lopdf extract_text error: {e}"))
}

View File

@@ -0,0 +1,224 @@
//! Test fixture builders for `kebab-parse-pdf`.
//!
//! PDFs are constructed in-memory at test time via `lopdf` rather than
//! committed as binary fixtures. Same rationale as
//! `kebab-parse-image::tests::common`: fixture provenance is auditable
//! from source, no `include_bytes!` paths to keep in sync, and the test
//! binary stays self-contained.
#![allow(dead_code)]
use std::path::PathBuf;
use kebab_core::{
AssetStorage, Checksum, ExtractConfig, ExtractContext, MediaType, RawAsset, SourceUri,
WorkspacePath,
};
use lopdf::content::{Content, Operation};
use lopdf::{Document, Object, Stream, dictionary};
use time::OffsetDateTime;
/// Optional `/Info` entries a fixture PDF should carry. Leaving every field
/// `None` (the `Default`) means no `/Info` dictionary is written at all.
#[derive(Default, Clone)]
pub struct InfoDict {
    /// Raw `Title` bytes, written verbatim — the caller picks the encoding
    /// (PDFDocEncoding vs UTF-16BE with BOM).
    pub title: Option<Vec<u8>>,
    /// `Producer` text, written as its UTF-8 bytes.
    pub producer: Option<&'static str>,
    /// `Creator` text, written as its UTF-8 bytes.
    pub creator: Option<&'static str>,
}
/// Build a Helvetica-text PDF without an `/Info` dictionary.
///
/// `pages` holds one entry per page. `Some(text)` draws `text` on the page;
/// `None` creates a page that is listed in `/Pages` but has no `/Contents`
/// stream (the "scanned candidate" shape — `extract_text` returns empty).
pub fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
    let no_info = InfoDict::default();
    build_text_pdf_with_info(pages, &no_info)
}
/// Build a Helvetica-text PDF in memory and return its serialized bytes.
///
/// `pages` has one entry per page: `Some(text)` gets a content stream that
/// shows `text`, `None` gets a page dictionary without `/Contents`. An
/// `/Info` dictionary is added only when at least one field of `info` is
/// set; `Title` bytes are written verbatim.
pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec<u8> {
    let mut pdf = Document::with_version("1.5");

    // Reserve the `/Pages` id first so each page can name it as `/Parent`.
    let tree_id = pdf.new_object_id();
    let helvetica_id = pdf.add_object(dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    });
    let shared_resources_id = pdf.add_object(dictionary! {
        "Font" => dictionary! { "F1" => helvetica_id },
    });

    let mut kids: Vec<Object> = Vec::new();
    for maybe_text in pages.iter().copied() {
        let mut leaf = dictionary! {
            "Type" => "Page",
            "Parent" => tree_id,
        };
        if let Some(text) = maybe_text {
            let operations = vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 24.into()]),
                Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
                Operation::new("Tj", vec![Object::string_literal(text)]),
                Operation::new("ET", vec![]),
            ];
            let encoded = Content { operations }.encode().expect("content encode");
            let stream_id = pdf.add_object(Stream::new(dictionary! {}, encoded));
            leaf.set("Contents", stream_id);
        }
        let leaf_id = pdf.add_object(leaf);
        kids.push(leaf_id.into());
    }

    // Take the count before `kids` is moved into the dictionary.
    let kid_count = kids.len() as i64;
    let tree = dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => kid_count,
        "Resources" => shared_resources_id,
        "MediaBox" => vec![
            Object::Integer(0),
            Object::Integer(0),
            Object::Integer(595),
            Object::Integer(842),
        ],
    };
    pdf.objects.insert(tree_id, Object::Dictionary(tree));

    let root_id = pdf.add_object(dictionary! {
        "Type" => "Catalog",
        "Pages" => tree_id,
    });
    pdf.trailer.set("Root", root_id);

    let nothing_to_write =
        info.title.is_none() && info.producer.is_none() && info.creator.is_none();
    if !nothing_to_write {
        let literal = |raw: Vec<u8>| Object::String(raw, lopdf::StringFormat::Literal);
        let mut meta = lopdf::Dictionary::new();
        if let Some(raw_title) = &info.title {
            meta.set("Title", literal(raw_title.clone()));
        }
        if let Some(producer) = info.producer {
            meta.set("Producer", literal(producer.as_bytes().to_vec()));
        }
        if let Some(creator) = info.creator {
            meta.set("Creator", literal(creator.as_bytes().to_vec()));
        }
        let meta_id = pdf.add_object(Object::Dictionary(meta));
        pdf.trailer.set("Info", meta_id);
    }

    let mut serialized: Vec<u8> = Vec::new();
    pdf.save_to(&mut serialized).expect("save PDF to memory");
    serialized
}
/// Produce a one-page PDF whose trailer carries a fake `/Encrypt` entry so
/// that `Document::is_encrypted()` reports true. Nothing is actually
/// encrypted — the extractor refuses encrypted PDFs **before** touching
/// streams, so the marker alone is sufficient.
pub fn make_encrypted_pdf() -> Vec<u8> {
    let plain = build_text_pdf(&[Some("placeholder")]);
    let mut pdf = Document::load_mem(&plain).expect("load round-tripped PDF");

    let marker = dictionary! {
        "Filter" => "Standard",
        "V" => 1,
        "R" => 2,
        "Length" => 40,
        "P" => -4,
    };
    let marker_id = pdf.add_object(marker);
    pdf.trailer.set("Encrypt", marker_id);

    let mut serialized = Vec::new();
    pdf.save_to(&mut serialized).expect("save encrypted PDF");
    serialized
}
/// 27 bytes of plain text with no `%PDF-` header — `Document::load_mem`
/// errors on it.
pub fn corrupt_pdf() -> Vec<u8> {
    const GARBAGE: &[u8] = b"NOT A PDF; just plain bytes";
    GARBAGE.to_owned()
}
/// Encode `s` in the PDF UTF-16BE-with-BOM string format: the two BOM bytes
/// `FE FF` followed by every UTF-16 code unit in big-endian order.
/// Used to verify that the multilingual Title path decodes correctly.
pub fn utf16be_bom(s: &str) -> Vec<u8> {
    let unit_count = s.encode_utf16().count();
    let mut encoded = Vec::with_capacity(2 * (unit_count + 1));
    encoded.push(0xFE);
    encoded.push(0xFF);
    encoded.extend(s.encode_utf16().flat_map(|unit| unit.to_be_bytes()));
    encoded
}
/// Owns everything an `ExtractContext` borrows (asset, workspace root,
/// config), mirroring `kebab-parse-image::tests::common`.
pub struct PdfFixture {
    pub asset: RawAsset,
    workspace_root: PathBuf,
    config: ExtractConfig,
}

impl PdfFixture {
    /// Borrow this fixture as an [`ExtractContext`].
    pub fn ctx(&self) -> ExtractContext<'_> {
        let Self {
            asset,
            workspace_root,
            config,
        } = self;
        ExtractContext {
            asset,
            workspace_root,
            config,
        }
    }
}
/// Build a [`PdfFixture`] for `bytes` as if found at `workspace_path`.
///
/// The asset id and checksum are derived from the BLAKE3 hash of `bytes`;
/// the discovery timestamp is fixed so repeated runs agree. Panics if
/// `workspace_path` is rejected by `WorkspacePath::new`.
pub fn fixture_for(workspace_path: &str, bytes: &[u8]) -> PdfFixture {
    let checksum_hex = blake3::hash(bytes).to_hex().to_string();
    let asset_id = kebab_core::id_for_asset(&checksum_hex);
    let rel_path = WorkspacePath::new(workspace_path.to_string()).unwrap();
    let discovered_at = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();

    // Computed up front because `rel_path` is moved into the asset below.
    let source_uri = SourceUri::File(PathBuf::from(format!("/tmp/{}", rel_path.0)));
    let stored = AssetStorage::Reference {
        path: PathBuf::from("/tmp/fake"),
        sha: Checksum("0".repeat(64)),
    };

    PdfFixture {
        asset: RawAsset {
            asset_id,
            source_uri,
            workspace_path: rel_path,
            media_type: MediaType::Pdf,
            byte_len: bytes.len() as u64,
            checksum: Checksum(checksum_hex),
            discovered_at,
            stored,
        },
        workspace_root: PathBuf::from("/tmp/fake-root"),
        config: ExtractConfig::default(),
    }
}
/// Replace every provenance event timestamp after index 0 (Discovered)
/// with `<stripped>` so determinism / snapshot tests can compare JSON
/// across runs. Same shape as `kebab-parse-image::tests::common::strip_dynamic_at`.
pub fn strip_dynamic_at(json: &mut serde_json::Value) {
if let Some(events) = json
.get_mut("provenance")
.and_then(|p| p.get_mut("events"))
.and_then(|e| e.as_array_mut())
{
for (i, ev) in events.iter_mut().enumerate() {
if i > 0
&& let Some(obj) = ev.as_object_mut()
{
obj.insert("at".into(), serde_json::Value::String("<stripped>".into()));
}
}
}
}

View File

@@ -0,0 +1,248 @@
//! Integration tests for `kebab_parse_pdf::PdfTextExtractor` (P7-1).
mod common;
use kebab_core::{Block, Extractor, ProvenanceKind, SourceSpan};
use kebab_parse_pdf::PdfTextExtractor;
use serde_json::Value;
use crate::common::{
InfoDict, build_text_pdf, build_text_pdf_with_info, corrupt_pdf, fixture_for,
make_encrypted_pdf, strip_dynamic_at, utf16be_bom,
};
/// View every block of `doc` as a `TextBlock`, panicking on the first block
/// that is not a `Block::Paragraph`.
fn paragraph_blocks(doc: &kebab_core::CanonicalDocument) -> Vec<&kebab_core::TextBlock> {
    let mut paragraphs = Vec::with_capacity(doc.blocks.len());
    for block in &doc.blocks {
        let Block::Paragraph(text_block) = block else {
            panic!("expected Paragraph, got {block:?}");
        };
        paragraphs.push(text_block);
    }
    paragraphs
}
/// Three text pages yield three paragraph blocks with 1-based page spans,
/// and the title falls back to the file stem.
#[test]
fn three_page_pdf_emits_one_paragraph_block_per_page() {
    let page_texts = [
        Some("Hello page 1"),
        Some("Hello page 2"),
        Some("Hello page 3"),
    ];
    let bytes = build_text_pdf(&page_texts);
    let fx = fixture_for("docs/three.pdf", &bytes);
    let extractor = PdfTextExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &bytes)
        .expect("3-page extraction must succeed");

    assert_eq!(doc.title, "three");
    assert_eq!(doc.lang.0, "und");
    assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION);
    assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(3.into()));

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    for (want_page, b) in (1u32..).zip(blocks.iter()) {
        match &b.common.source_span {
            SourceSpan::Page {
                page,
                char_start,
                char_end,
            } => {
                assert_eq!(*page, want_page);
                assert_eq!(*char_start, Some(0));
                let chars = b.text.chars().count() as u32;
                assert_eq!(*char_end, Some(chars));
            }
            other => panic!("expected Page span, got {other:?}"),
        }
        let needle = format!("Hello page {want_page}");
        assert!(
            b.text.contains(&needle),
            "page {want_page} text mismatch: {:?}",
            b.text
        );
    }
}
/// A page without a content stream still gets a block (empty text, no
/// inlines, zero-length span) and exactly one warning event.
#[test]
fn empty_page_emits_warning_and_empty_paragraph() {
    let bytes = build_text_pdf(&[Some("page one text"), None, Some("page three text")]);
    let fx = fixture_for("docs/scanned-mixed.pdf", &bytes);
    let doc = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect("scanned-mixed extraction must succeed");

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    let middle = blocks[1];
    assert!(middle.text.is_empty(), "page 2 should have empty text");
    assert!(middle.inlines.is_empty(), "page 2 inlines should be empty");
    match &middle.common.source_span {
        SourceSpan::Page {
            page,
            char_start,
            char_end,
        } => {
            assert_eq!(*page, 2);
            assert_eq!(*char_start, Some(0));
            assert_eq!(*char_end, Some(0));
        }
        other => panic!("expected Page, got {other:?}"),
    }

    let mut warnings = Vec::new();
    for event in &doc.provenance.events {
        if event.kind == ProvenanceKind::Warning {
            warnings.push(event);
        }
    }
    assert_eq!(warnings.len(), 1, "exactly one warning for the empty page");

    let note_text = warnings[0].note.as_deref().unwrap_or("");
    assert!(
        note_text.contains("page2 empty (scanned candidate)"),
        "warning note must mark page 2 as scanned candidate: {:?}",
        warnings[0].note
    );
}
/// An `/Encrypt` trailer entry makes extraction fail with a message that
/// names the problem and points at a remedy.
#[test]
fn encrypted_pdf_returns_helpful_error() {
    let bytes = make_encrypted_pdf();
    let fx = fixture_for("docs/encrypted.pdf", &bytes);
    let outcome = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
    let err = outcome.expect_err("encrypted PDF must be refused");
    let msg = format!("{err:#}");

    let names_problem = msg.contains("encrypted");
    assert!(names_problem, "error must mention encryption: {msg}");

    let names_remedy = ["qpdf", "decrypt"].iter().any(|hint| msg.contains(hint));
    assert!(names_remedy, "error should point at remediation: {msg}");
}
/// Bytes without a PDF header must fail in the load step, and the error
/// must carry the extractor's parse-failure context.
#[test]
fn corrupt_header_returns_error() {
    let bytes = corrupt_pdf();
    let fx = fixture_for("docs/corrupt.pdf", &bytes);
    let err = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect_err("corrupt PDF must error");
    let msg = format!("{err:#}");
    // Checking for a bare "pdf" / "parse" substring is vacuous: every error
    // raised by the extractor starts with the crate name `kebab-parse-pdf`,
    // so such a check would also accept e.g. the encrypted-PDF or
    // unsupported-media-type errors. Pin the load-failure context instead.
    assert!(
        msg.contains("failed to parse PDF"),
        "error must mention PDF parse failure: {msg}"
    );
}
/// `metadata.user.pdf.page_count` and the number of blocks both equal the
/// number of pages in the fixture.
#[test]
fn page_count_matches_actual_count() {
    let five_pages = [Some("a"), Some("b"), Some("c"), Some("d"), Some("e")];
    let bytes = build_text_pdf(&five_pages);
    let fx = fixture_for("docs/five.pdf", &bytes);
    let extractor = PdfTextExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &bytes)
        .expect("5-page extraction must succeed");

    let reported = &doc.metadata.user["pdf"]["page_count"];
    assert_eq!(*reported, Value::Number(5.into()));
    assert_eq!(doc.blocks.len(), 5);
}
/// A Korean `/Info` Title stored as UTF-16BE with BOM becomes the document
/// title, and `Producer` is surfaced in the PDF metadata.
#[test]
fn info_dict_title_utf16be_bom_decoded() {
    // UTF-16BE with BOM is the standard PDF encoding for non-ASCII
    // metadata. Body text in non-Latin scripts is not exercised here
    // (CID font support is out of scope for v1); only the metadata path
    // is under test.
    let encoded_title = utf16be_bom("케밥 문서");
    let info = InfoDict {
        title: Some(encoded_title),
        producer: Some("kebab-test"),
        ..InfoDict::default()
    };
    let bytes = build_text_pdf_with_info(&[Some("body")], &info);
    let fx = fixture_for("docs/korean-title.pdf", &bytes);
    let doc = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect("PDF with UTF-16BE Title must extract");

    assert_eq!(doc.title, "케밥 문서");
    let producer = &doc.metadata.user["pdf"]["producer"];
    assert_eq!(*producer, Value::String("kebab-test".into()));
}
/// Without an `/Info` dictionary the title is the file name minus its
/// extension.
#[test]
fn info_dict_title_falls_back_to_filename_when_missing() {
    let single_page = [Some("body")];
    let bytes = build_text_pdf(&single_page);
    let fx = fixture_for("docs/no-info.pdf", &bytes);
    let extractor = PdfTextExtractor::new();
    let doc = extractor
        .extract(&fx.ctx(), &bytes)
        .expect("no-info PDF must extract");
    assert_eq!(doc.title, "no-info");
}
/// Extracting the same bytes twice yields equal JSON once the run-time
/// provenance timestamps are masked.
#[test]
fn determinism_identical_bytes_produce_identical_documents() {
    let bytes = build_text_pdf(&[Some("alpha"), Some("beta"), Some("gamma")]);
    let fx = fixture_for("docs/det.pdf", &bytes);

    // One full extract → JSON → timestamp-strip pass.
    let run = |label: &str| {
        let doc = PdfTextExtractor::new()
            .extract(&fx.ctx(), &bytes)
            .expect(label);
        let mut json = serde_json::to_value(doc).unwrap();
        strip_dynamic_at(&mut json);
        json
    };

    let a = run("first extract");
    let b = run("second extract");
    assert_eq!(a, b, "two extracts of identical bytes must be byte-equal");
}
/// Spot-checks the serialized shape of a three-page document: version
/// fields, block kinds, page spans, and the fixed metadata enums.
#[test]
fn snapshot_three_page_canonical_document_stable() {
    let bytes = build_text_pdf(&[Some("p1"), Some("p2"), Some("p3")]);
    let fx = fixture_for("docs/snapshot.pdf", &bytes);
    let doc = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect("snapshot extract");
    let mut json = serde_json::to_value(&doc).unwrap();
    strip_dynamic_at(&mut json);

    // Only the load-bearing shape is checked; no full golden file is
    // committed because the JSON embeds BLAKE3-derived ids, which would
    // change whenever the id derivation changes (a separate, intentional
    // break).
    let text = |s: &str| Value::String(s.into());
    assert_eq!(json["parser_version"], text("pdf-text-v1"));
    assert_eq!(json["lang"], text("und"));
    assert_eq!(json["schema_version"], Value::Number(1.into()));
    assert_eq!(json["doc_version"], Value::Number(1.into()));

    let blocks = json["blocks"].as_array().unwrap();
    assert_eq!(blocks.len(), 3);
    for (page_no, block) in (1u64..).zip(blocks.iter()) {
        assert_eq!(block["kind"], text("paragraph"));
        let span = &block["common"]["source_span"];
        assert_eq!(span["kind"], text("page"));
        assert_eq!(span["page"], Value::Number(page_no.into()));
    }

    assert_eq!(json["metadata"]["source_type"], text("paper"));
    assert_eq!(json["metadata"]["trust_level"], text("primary"));
}

View File

@@ -3,7 +3,7 @@ phase: P7
component: kebab-parse-pdf (text extractor)
task_id: p7-1
title: "Text PDF extractor → CanonicalDocument with page-level blocks"
status: planned
status: completed
depends_on: [p0-1, p1-6]
unblocks: [p7-2]
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md