feat(kebab-parse-pdf): P7-1 text PDF extractor — per-page CanonicalDocument

`PdfTextExtractor`(MediaType::Pdf) lopdf 기반 per-page 텍스트 추출. 페이지마다 `Block::Paragraph` + `SourceSpan::Page { page, char_start, char_end }` emit. 본문이 비거나 추출 panic 인 페이지는 빈 paragraph + `Provenance::Warning` ("scanned candidate") 로 표시 — 이후 OCR fallback (별도 task) 의 입력. 핵심 동작: - `lopdf::Document::load_mem` + `is_encrypted()` → 암호화 PDF 는 명시 에러 (`qpdf --decrypt` 안내). - 페이지 단위 `extract_text(&[page])` 를 `catch_unwind` 로 감싸 malformed page panic 을 recoverable warning 으로 변환. - `/Info` dict 에서 Title/Producer/Creator best-effort 추출. UTF-16BE BOM prefixed 문자열도 디코드 (한국어 등 non-ASCII Title 정상 처리). - 9개 통합 테스트: 3-page emit, scanned-mixed warning, encrypted refuse, corrupt header error, page_count 메타, UTF-16BE Title, filename fallback, determinism, snapshot. `parser_version = "pdf-text-v1"`. Allowed deps: `lopdf 0.32` + `pdf-extract 0.7` (원본 spec 그대로). 본문 다국어 OCR fallback 은 §9.2 후속 task (out of scope). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 08:34:55 +00:00
parent fd89777c83
commit 5a158d7343
9 changed files with 936 additions and 1 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -24,6 +24,15 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"

+[[package]]
+name = "adobe-cmap-parser"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
+dependencies = [
+ "pom",
+]
+
 [[package]]
 name = "ahash"
 version = "0.8.12"
@@ -2114,6 +2123,15 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"

+[[package]]
+name = "euclid"
+version = "0.20.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "event-listener"
 version = "5.4.1"
@@ -3609,6 +3627,24 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kebab-parse-pdf"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "blake3",
+ "kebab-config",
+ "kebab-core",
+ "lopdf 0.32.0",
+ "pdf-extract",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "thiserror 2.0.18",
+ "time",
+ "tracing",
+]
+
 [[package]]
 name = "kebab-parse-types"
 version = "0.1.0"
@@ -4466,6 +4502,12 @@ dependencies = [
 "include_dir",
 ]

+[[package]]
+name = "linked-hash-map"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.4.15"
@@ -4521,6 +4563,43 @@ dependencies = [
 "imgref",
 ]

+[[package]]
+name = "lopdf"
+version = "0.32.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b"
+dependencies = [
+ "chrono",
+ "encoding_rs",
+ "flate2",
+ "itoa",
+ "linked-hash-map",
+ "log",
+ "md5",
+ "nom 7.1.3",
+ "rayon",
+ "time",
+ "weezl",
+]
+
+[[package]]
+name = "lopdf"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
+dependencies = [
+ "encoding_rs",
+ "flate2",
+ "indexmap 2.14.0",
+ "itoa",
+ "log",
+ "md-5",
+ "nom 7.1.3",
+ "rangemap",
+ "time",
+ "weezl",
+]
+
 [[package]]
 name = "lru"
 version = "0.12.5"
@@ -4639,6 +4718,12 @@ dependencies = [
 "digest",
 ]

+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "measure_time"
 version = "0.9.0"
@@ -5265,6 +5350,21 @@ dependencies = [
 "stfu8",
 ]

+[[package]]
+name = "pdf-extract"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
+dependencies = [
+ "adobe-cmap-parser",
+ "encoding_rs",
+ "euclid",
+ "lopdf 0.34.0",
+ "postscript",
+ "type1-encoding-parser",
+ "unicode-normalization",
+]
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -5368,6 +5468,12 @@ dependencies = [
 "miniz_oxide",
 ]

+[[package]]
+name = "pom"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
+
 [[package]]
 name = "portable-atomic"
 version = "1.13.1"
@@ -5383,6 +5489,12 @@ dependencies = [
 "portable-atomic",
 ]

+[[package]]
+name = "postscript"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
+
 [[package]]
 name = "potential_utf"
 version = "0.1.5"
@@ -7503,6 +7615,15 @@ dependencies = [
 "rand 0.9.4",
 ]

+[[package]]
+name = "type1-encoding-parser"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
+dependencies = [
+ "pom",
+]
+
 [[package]]
 name = "typenum"
 version = "1.20.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ members = [
    "crates/kebab-cli",
    "crates/kebab-eval",
    "crates/kebab-parse-image",
+    "crates/kebab-parse-pdf",
 ]

 [workspace.package]
--- a/crates/kebab-parse-pdf/Cargo.toml
+++ b/crates/kebab-parse-pdf/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "kebab-parse-pdf"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Text PDF extractor (per-page text + page citation) for the kebab pipeline (P7-1)"
+
+[dependencies]
+kebab-core   = { path = "../kebab-core" }
+kebab-config = { path = "../kebab-config" }
+anyhow       = { workspace = true }
+serde        = { workspace = true }
+serde_json   = { workspace = true }
+time         = { workspace = true }
+tracing      = { workspace = true }
+thiserror    = { workspace = true }
+# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
+# is the only stable per-page API across the pdf-extract / lopdf
+# pair (pdf-extract 0.7 still exposes only whole-document calls).
+lopdf        = "0.32"
+# Spec-allowed companion crate; the extractor does not call it yet and
+# it pulls in its own lopdf 0.34. Per-page text is sourced from lopdf only.
+pdf-extract  = "0.7"
+
+[dev-dependencies]
+tempfile     = { workspace = true }
+blake3       = { workspace = true }
+serde_json   = { workspace = true }
--- a/crates/kebab-parse-pdf/src/info.rs
+++ b/crates/kebab-parse-pdf/src/info.rs
@@ -0,0 +1,70 @@
+//! `/Info` dictionary extraction (best-effort).
+//!
+//! PDFs may carry a `/Info` trailer dictionary with `Title`,
+//! `Producer`, `Creator`, etc. Strings are encoded as either
+//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
+//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
+//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
+//! not an error.
+
+#[derive(Default)]
+pub(crate) struct InfoDict {
+    pub title: Option<String>, // `/Title`, decoded; `None` if absent or empty
+    pub producer: Option<String>, // `/Producer`, same rules as `title`
+    pub creator: Option<String>, // `/Creator`, same rules as `title`
+}
+
+/// Read `Title` / `Producer` / `Creator` from the trailer's `/Info` entry.
+/// Yields an all-`None` [`InfoDict`] when the entry is missing, is neither a
+/// dictionary nor a reference that resolves to one, or lacks those keys.
+pub(crate) fn extract_info(doc: &lopdf::Document) -> InfoDict {
+    let Ok(entry) = doc.trailer.get(b"Info") else {
+        return InfoDict::default();
+    };
+
+    // `/Info` is either an inline dictionary or an indirect reference to one.
+    let resolved = match entry {
+        lopdf::Object::Dictionary(inline) => Some(inline),
+        lopdf::Object::Reference(target) => match doc.get_object(*target) {
+            Ok(obj) => obj.as_dict().ok(),
+            Err(_) => None,
+        },
+        _ => None,
+    };
+
+    resolved.map_or_else(InfoDict::default, |fields| InfoDict {
+        title: pdf_string(fields, b"Title"),
+        producer: pdf_string(fields, b"Producer"),
+        creator: pdf_string(fields, b"Creator"),
+    })
+}
+
+/// Best-effort decode of the PDF string stored under `key` in `dict`.
+///
+/// Returns `None` when the key is absent, is not a string object, or
+/// decodes to an empty string. Tries, in order: UTF-16BE (BOM `FE FF`),
+/// strict UTF-8, then a byte-per-char Latin-1 mapping.
+fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
+    let lopdf::Object::String(raw, _) = dict.get(key).ok()? else {
+        return None;
+    };
+    let bytes = raw.as_slice();
+    let decoded = if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
+        // UTF-16BE with BOM (very common for non-ASCII PDF titles). A
+        // dangling odd byte is dropped by `chunks_exact`; unpaired
+        // surrogates become U+FFFD. The BOM never leaks into the output.
+        let units: Vec<u16> = payload
+            .chunks_exact(2)
+            .map(|c| u16::from_be_bytes([c[0], c[1]]))
+            .collect();
+        String::from_utf16_lossy(&units)
+    } else if let Ok(s) = std::str::from_utf8(bytes) {
+        // Valid UTF-8 (which includes plain ASCII) is kept as-is.
+        s.to_owned()
+    } else {
+        // PDFDocEncoding matches Latin-1 for 0xA1..=0xFF, so map each
+        // byte to the code point of the same value instead of U+FFFD.
+        bytes.iter().copied().map(char::from).collect::<String>()
+    };
+    (!decoded.is_empty()).then_some(decoded)
+}
--- a/crates/kebab-parse-pdf/src/lib.rs
+++ b/crates/kebab-parse-pdf/src/lib.rs
@@ -0,0 +1,228 @@
+//! `kebab-parse-pdf` — text PDF extractor (P7-1).
+//!
+//! Implements [`kebab_core::Extractor`] for [`MediaType::Pdf`]. Extracts
+//! text page-by-page via `lopdf`'s per-page API and emits one
+//! [`Block::Paragraph`] per page with [`SourceSpan::Page`] (1-based page,
+//! `char_start = 0`, `char_end = chars().count()`).
+//!
+//! Pages where text extraction fails or returns empty get an empty
+//! `Block::Paragraph` plus a `Provenance::Warning` flagging the page as
+//! a "scanned candidate" — out-of-scope OCR fallback can pick those up.
+//!
+//! Scope is intentionally narrow: page text + page numbers. Layout
+//! reconstruction (multi-column reading order, tables, math), form
+//! fields, bookmarks, and OCR for scanned PDFs are explicitly **not**
+//! in this task. See `tasks/p7/p7-1-pdf-text-extractor.md`.
+//!
+//! Per design §3.4 (`SourceSpan::Page` / `Block::Paragraph`),
+//! §9.2 (PDF text extraction), §9 versioning.
+
+mod info;
+mod page_text;
+
+use anyhow::{Context, Result};
+use kebab_core::{
+    Block, CanonicalDocument, CommonBlock, Extractor, Inline, Lang, MediaType, Metadata,
+    ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TextBlock,
+    TrustLevel, id_for_block, id_for_doc,
+};
+use serde_json::{Map, Value};
+use time::OffsetDateTime;
+
+pub const PARSER_VERSION: &str = "pdf-text-v1";
+
+/// Stateless extractor for text PDFs. Page text is pulled one page at a
+/// time through `lopdf::Document::extract_text`; pdf-extract 0.7 is not
+/// used for this because it only exposes whole-document calls.
+pub struct PdfTextExtractor;
+
+impl PdfTextExtractor {
+    /// Create an extractor; the type carries no configuration or state.
+    pub fn new() -> Self { Self }
+}
+
+impl Default for PdfTextExtractor {
+    /// Same value as [`PdfTextExtractor::new`].
+    fn default() -> Self {
+        Self
+    }
+}
+
+impl Extractor for PdfTextExtractor {
+    /// Only [`MediaType::Pdf`] assets are handled.
+    fn supports(&self, m: &MediaType) -> bool {
+        matches!(m, MediaType::Pdf)
+    }
+
+    /// Reports [`PARSER_VERSION`] wrapped in a [`ParserVersion`].
+    fn parser_version(&self) -> ParserVersion {
+        ParserVersion(PARSER_VERSION.to_string())
+    }
+
+    /// Parse `bytes` with lopdf and emit one paragraph block per page.
+    /// Fails on a non-PDF asset, unparseable bytes, or an encrypted file;
+    /// an empty or failing page only adds a `Warning` provenance event.
+    fn extract(
+        &self,
+        ctx: &kebab_core::ExtractContext<'_>,
+        bytes: &[u8],
+    ) -> Result<CanonicalDocument> {
+        let asset = ctx.asset;
+        if !self.supports(&asset.media_type) {
+            anyhow::bail!(
+                "kebab-parse-pdf: unsupported media_type for PdfTextExtractor: {:?}",
+                asset.media_type
+            );
+        }
+        let parser_version = self.parser_version();
+        let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version);
+
+        // lopdf is the only parser involved: `pdf-extract` offers nothing
+        // but whole-document text and would parse the bytes a second time.
+        let pdf_doc = lopdf::Document::load_mem(bytes)
+            .context("kebab-parse-pdf: failed to parse PDF (corrupt header or not a PDF)")?;
+        if pdf_doc.is_encrypted() {
+            anyhow::bail!(
+                "kebab-parse-pdf: encrypted PDF; remove encryption (e.g. `qpdf --decrypt`) before ingest"
+            );
+        }
+
+        let info = info::extract_info(&pdf_doc);
+        // `get_pages()` returns a BTreeMap keyed by 1-based page number;
+        // walking its keys in natural order keeps the output deterministic.
+        let pages = pdf_doc.get_pages();
+        let page_count = pages.len() as u32;
+        let now = OffsetDateTime::now_utc();
+
+        let mut events: Vec<ProvenanceEvent> = Vec::with_capacity(pages.len() + 2);
+        events.extend([
+            ProvenanceEvent {
+                at: asset.discovered_at,
+                agent: "kb-source-fs".to_string(),
+                kind: ProvenanceKind::Discovered,
+                note: None,
+            },
+            ProvenanceEvent {
+                at: now,
+                agent: "kb-parse-pdf".to_string(),
+                kind: ProvenanceKind::Parsed,
+                note: Some(format!(
+                    "parser_version={}; page_count={}",
+                    parser_version.0, page_count
+                )),
+            },
+        ]);
+
+        let mut blocks: Vec<Block> = Vec::with_capacity(pages.len());
+        for page_num in pages.keys().copied() {
+            // Blank text and extraction errors both collapse to an empty
+            // page that carries a "scanned candidate" warning note.
+            let (text, warning) = match page_text::extract_one(&pdf_doc, page_num) {
+                Ok(t) if t.trim().is_empty() => (
+                    String::new(),
+                    Some(format!("page{page_num} empty (scanned candidate)")),
+                ),
+                Ok(t) => (t, None),
+                Err(e) => (
+                    String::new(),
+                    Some(format!(
+                        "page{page_num} extract failed: {e} (scanned candidate)"
+                    )),
+                ),
+            };
+            let span = SourceSpan::Page {
+                page: page_num,
+                char_start: Some(0),
+                char_end: Some(text.chars().count() as u32),
+            };
+            // Ordinal is the zero-based page index; `saturating_sub` keeps a
+            // (shouldn't-happen) page key of 0 from underflowing.
+            let ordinal = page_num.saturating_sub(1);
+            let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
+            let inlines = match text.as_str() {
+                "" => Vec::new(),
+                body => vec![Inline::Text {
+                    text: body.to_string(),
+                }],
+            };
+            blocks.push(Block::Paragraph(TextBlock {
+                common: CommonBlock {
+                    block_id,
+                    heading_path: Vec::new(),
+                    source_span: span,
+                },
+                text,
+                inlines,
+            }));
+            // At most one warning event per page, stamped with the parse time.
+            events.extend(warning.map(|note| ProvenanceEvent {
+                at: now,
+                agent: "kb-parse-pdf".to_string(),
+                kind: ProvenanceKind::Warning,
+                note: Some(note),
+            }));
+        }
+
+        // Prefer a non-blank `/Info` Title, else the workspace file stem.
+        let title = match info.title.clone().filter(|t| !t.trim().is_empty()) {
+            Some(from_info) => from_info,
+            None => strip_extension(&filename_from_workspace_path(&asset.workspace_path.0)),
+        };
+
+        let mut pdf_meta = Map::new();
+        pdf_meta.insert("page_count".into(), Value::Number(page_count.into()));
+        for (key, value) in [("producer", &info.producer), ("creator", &info.creator)] {
+            if let Some(v) = value {
+                pdf_meta.insert(key.into(), Value::String(v.clone()));
+            }
+        }
+        let mut user = Map::new();
+        user.insert("pdf".into(), Value::Object(pdf_meta));
+
+        // Both timestamps mirror the asset's discovery time.
+        let metadata = Metadata {
+            aliases: Vec::new(),
+            tags: Vec::new(),
+            created_at: asset.discovered_at,
+            updated_at: asset.discovered_at,
+            source_type: SourceType::Paper,
+            trust_level: TrustLevel::Primary,
+            user_id_alias: None,
+            user,
+        };
+
+        tracing::debug!(
+            target: "kebab-parse-pdf",
+            "extracted PDF doc_id={} workspace_path={} pages={}",
+            doc_id.0,
+            asset.workspace_path.0,
+            page_count
+        );
+
+        Ok(CanonicalDocument {
+            doc_id,
+            source_asset_id: asset.asset_id.clone(),
+            workspace_path: asset.workspace_path.clone(),
+            title,
+            lang: Lang("und".to_string()),
+            blocks,
+            metadata,
+            provenance: Provenance { events },
+            parser_version,
+            schema_version: 1,
+            doc_version: 1,
+        })
+    }
+}
+
+/// Final `/`-separated segment of `p` (all of `p` when it contains no `/`).
+fn filename_from_workspace_path(p: &str) -> String {
+    p.rfind('/').map_or(p, |slash| &p[slash + 1..]).to_string()
+}
+
+/// Drop everything from the last `.` onward; a name with no dot, or whose
+/// only dot is the leading character (e.g. `.env`), is returned unchanged.
+fn strip_extension(filename: &str) -> String {
+    match filename.rfind('.').filter(|&dot| dot > 0) {
+        Some(dot) => filename[..dot].to_string(),
+        None => filename.to_string(),
+    }
+}
--- a/crates/kebab-parse-pdf/src/page_text.rs
+++ b/crates/kebab-parse-pdf/src/page_text.rs
@@ -0,0 +1,13 @@
+//! Per-page text extraction. `lopdf::Document::extract_text(&[page])`
+//! is the call we lean on; it has a thin history of panicking on
+//! malformed pages, so we wrap it in `catch_unwind` to convert the
+//! panic into a recoverable `Err` (which the caller maps to an empty
+//! page + Warning).
+
+use std::panic::{AssertUnwindSafe, catch_unwind};
+
+pub(crate) fn extract_one(doc: &lopdf::Document, page: u32) -> anyhow::Result<String> {
+    let result = catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page])))
+        .map_err(|_| anyhow::anyhow!("panic during lopdf::Document::extract_text"))?;
+    result.map_err(|e| anyhow::anyhow!("lopdf extract_text error: {e}"))
+}
--- a/crates/kebab-parse-pdf/tests/common/mod.rs
+++ b/crates/kebab-parse-pdf/tests/common/mod.rs
@@ -0,0 +1,224 @@
+//! Test fixture builders for `kebab-parse-pdf`.
+//!
+//! PDFs are constructed in-memory at test time via `lopdf` rather than
+//! committed as binary fixtures. Same rationale as
+//! `kebab-parse-image::tests::common`: fixture provenance is auditable
+//! from source, no `include_bytes!` paths to keep in sync, and the test
+//! binary stays self-contained.
+
+#![allow(dead_code)]
+
+use std::path::PathBuf;
+
+use kebab_core::{
+    AssetStorage, Checksum, ExtractConfig, ExtractContext, MediaType, RawAsset, SourceUri,
+    WorkspacePath,
+};
+use lopdf::content::{Content, Operation};
+use lopdf::{Document, Object, Stream, dictionary};
+use time::OffsetDateTime;
+
+/// `/Info` dict fields a fixture wants to surface (all optional).
+#[derive(Default, Clone)]
+pub struct InfoDict {
+    pub title: Option<Vec<u8>>, // raw bytes — caller controls PDFDocEncoding vs UTF-16BE
+    pub producer: Option<&'static str>, // written as the string's UTF-8 bytes
+    pub creator: Option<&'static str>, // written as the string's UTF-8 bytes
+}
+
+/// One-font (Helvetica) PDF without an `/Info` dictionary. Each element of
+/// `pages` becomes a page: `Some(text)` draws that text, `None` leaves the
+/// page without a `/Contents` stream (the "scanned candidate" shape).
+pub fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
+    let no_info = InfoDict::default();
+    build_text_pdf_with_info(pages, &no_info)
+}
+
+/// Serialize an in-memory PDF with one page per entry of `pages` and, when
+/// any field of `info` is set, an `/Info` dictionary holding those fields.
+///
+/// `Some(text)` pages get a content stream that shows `text` with font
+/// `F1` (Helvetica); `None` pages are created without `/Contents`.
+/// NOTE(review): statement order presumably fixes lopdf object ids and so
+/// the output bytes — keep the order when editing.
+pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec<u8> {
+    let mut doc = Document::with_version("1.5");
+    // Reserve the `/Pages` id up front so each page can name it as `Parent`
+    // before the `/Pages` dictionary itself is inserted further down.
+    let pages_id = doc.new_object_id();
+    let font_id = doc.add_object(dictionary! {
+        "Type" => "Font",
+        "Subtype" => "Type1",
+        "BaseFont" => "Helvetica",
+    });
+    let resources_id = doc.add_object(dictionary! {
+        "Font" => dictionary! { "F1" => font_id },
+    });
+
+    let mut page_refs: Vec<Object> = Vec::new();
+    for page in pages {
+        let mut page_dict = dictionary! {
+            "Type" => "Page",
+            "Parent" => pages_id,
+        };
+        if let Some(text) = page {
+            // Minimal text object: select F1 at size 24, move to (100, 700),
+            // show the string.
+            let content = Content {
+                operations: vec![
+                    Operation::new("BT", vec![]),
+                    Operation::new("Tf", vec!["F1".into(), 24.into()]),
+                    Operation::new(
+                        "Td",
+                        vec![Object::Integer(100), Object::Integer(700)],
+                    ),
+                    Operation::new("Tj", vec![Object::string_literal(*text)]),
+                    Operation::new("ET", vec![]),
+                ],
+            };
+            let stream_data = content.encode().expect("content encode");
+            let content_id =
+                doc.add_object(Stream::new(dictionary! {}, stream_data));
+            page_dict.set("Contents", content_id);
+        }
+        let page_id = doc.add_object(page_dict);
+        page_refs.push(page_id.into());
+    }
+
+    // Resources and MediaBox are set once on the shared `/Pages` node rather
+    // than on each page dictionary.
+    let count = page_refs.len() as i64;
+    let pages_dict = dictionary! {
+        "Type" => "Pages",
+        "Kids" => page_refs,
+        "Count" => count,
+        "Resources" => resources_id,
+        "MediaBox" => vec![
+            Object::Integer(0),
+            Object::Integer(0),
+            Object::Integer(595),
+            Object::Integer(842),
+        ],
+    };
+    doc.objects
+        .insert(pages_id, Object::Dictionary(pages_dict));
+
+    let catalog_id = doc.add_object(dictionary! {
+        "Type" => "Catalog",
+        "Pages" => pages_id,
+    });
+    doc.trailer.set("Root", catalog_id);
+
+    // The trailer only gets an `/Info` entry when at least one field is set.
+    if info.title.is_some() || info.producer.is_some() || info.creator.is_some() {
+        let mut info_dict = lopdf::Dictionary::new();
+        if let Some(title) = &info.title {
+            info_dict.set(
+                "Title",
+                Object::String(title.clone(), lopdf::StringFormat::Literal),
+            );
+        }
+        if let Some(p) = info.producer {
+            info_dict.set(
+                "Producer",
+                Object::String(p.as_bytes().to_vec(), lopdf::StringFormat::Literal),
+            );
+        }
+        if let Some(c) = info.creator {
+            info_dict.set(
+                "Creator",
+                Object::String(c.as_bytes().to_vec(), lopdf::StringFormat::Literal),
+            );
+        }
+        let info_id = doc.add_object(Object::Dictionary(info_dict));
+        doc.trailer.set("Info", info_id);
+    }
+
+    let mut out: Vec<u8> = Vec::new();
+    doc.save_to(&mut out).expect("save PDF to memory");
+    out
+}
+
+/// Build a one-page PDF and add a fake `/Encrypt` trailer entry so
+/// `Document::is_encrypted()` flips to true. We don't actually
+/// encrypt anything — the extractor refuses encrypted PDFs **before**
+/// touching streams, so the marker is sufficient.
+pub fn make_encrypted_pdf() -> Vec<u8> {
+    let bytes = build_text_pdf(&[Some("placeholder")]);
+    let mut doc = Document::load_mem(&bytes).expect("load round-tripped PDF");
+    let enc_id = doc.add_object(dictionary! {
+        "Filter" => "Standard",
+        "V" => 1,
+        "R" => 2,
+        "Length" => 40,
+        "P" => -4,
+    });
+    doc.trailer.set("Encrypt", enc_id);
+    let mut out = Vec::new();
+    doc.save_to(&mut out).expect("save encrypted PDF");
+    out
+}
+
+/// 27 bytes of plain ASCII with no `%PDF-` header; `Document::load_mem`
+/// is expected to reject them.
+pub fn corrupt_pdf() -> Vec<u8> {
+    Vec::from("NOT A PDF; just plain bytes")
+}
+
+/// PDF text-string form of `s`: the BOM `FE FF` followed by each UTF-16
+/// code unit in big-endian byte order. Used to check that `info::pdf_string`
+/// decodes the multilingual Title path correctly.
+pub fn utf16be_bom(s: &str) -> Vec<u8> {
+    let mut encoded = vec![0xFE_u8, 0xFF];
+    encoded.extend(s.encode_utf16().flat_map(u16::to_be_bytes));
+    encoded
+}
+
+/// Owns everything an `ExtractContext` borrows (asset, workspace root,
+/// config), mirroring `kebab-parse-image::tests::common`.
+pub struct PdfFixture {
+    pub asset: RawAsset,
+    workspace_root: PathBuf,
+    config: ExtractConfig,
+}
+
+impl PdfFixture {
+    /// Borrow the fixture's parts as an `ExtractContext`.
+    pub fn ctx(&self) -> ExtractContext<'_> {
+        let Self {
+            asset,
+            workspace_root,
+            config,
+        } = self;
+        ExtractContext {
+            asset,
+            workspace_root,
+            config,
+        }
+    }
+}
+
+/// Wrap `bytes` in a [`PdfFixture`] whose asset claims `workspace_path`.
+/// The asset id and checksum come from the BLAKE3 hash of `bytes`; the
+/// discovery time is pinned (unix 1_700_000_000) and storage paths are fake.
+pub fn fixture_for(workspace_path: &str, bytes: &[u8]) -> PdfFixture {
+    let digest_hex = blake3::hash(bytes).to_hex().to_string();
+    let ws_path = WorkspacePath::new(workspace_path.to_string()).unwrap();
+    let source_uri = SourceUri::File(PathBuf::from(format!("/tmp/{}", ws_path.0)));
+    let stored = AssetStorage::Reference {
+        path: PathBuf::from("/tmp/fake"),
+        sha: Checksum("0".repeat(64)),
+    };
+
+    PdfFixture {
+        asset: RawAsset {
+            asset_id: kebab_core::id_for_asset(&digest_hex),
+            source_uri,
+            workspace_path: ws_path,
+            media_type: MediaType::Pdf,
+            byte_len: bytes.len() as u64,
+            checksum: Checksum(digest_hex),
+            discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+            stored,
+        },
+        workspace_root: PathBuf::from("/tmp/fake-root"),
+        config: ExtractConfig::default(),
+    }
+}
+
+/// Overwrite the `at` field of every provenance event except the first
+/// (`Discovered`) with the literal `"<stripped>"`, so determinism and
+/// snapshot tests can compare JSON across runs. JSON without a
+/// `provenance.events` array is left untouched.
+pub fn strip_dynamic_at(json: &mut serde_json::Value) {
+    let Some(events) = json
+        .get_mut("provenance")
+        .and_then(|p| p.get_mut("events"))
+        .and_then(|e| e.as_array_mut())
+    else {
+        return;
+    };
+    let placeholder = serde_json::Value::String("<stripped>".into());
+    for obj in events.iter_mut().skip(1).filter_map(|ev| ev.as_object_mut()) {
+        obj.insert("at".into(), placeholder.clone());
+    }
+}
--- a/crates/kebab-parse-pdf/tests/extractor.rs
+++ b/crates/kebab-parse-pdf/tests/extractor.rs
@@ -0,0 +1,248 @@
+//! Integration tests for `kebab_parse_pdf::PdfTextExtractor` (P7-1).
+
+mod common;
+
+use kebab_core::{Block, Extractor, ProvenanceKind, SourceSpan};
+use kebab_parse_pdf::PdfTextExtractor;
+use serde_json::Value;
+
+use crate::common::{
+    InfoDict, build_text_pdf, build_text_pdf_with_info, corrupt_pdf, fixture_for,
+    make_encrypted_pdf, strip_dynamic_at, utf16be_bom,
+};
+
+/// Borrow every block of `doc` as a `TextBlock`, panicking on the first
+/// block that is not a `Block::Paragraph`.
+fn paragraph_blocks(doc: &kebab_core::CanonicalDocument) -> Vec<&kebab_core::TextBlock> {
+    let mut paragraphs = Vec::with_capacity(doc.blocks.len());
+    for block in &doc.blocks {
+        let Block::Paragraph(text_block) = block else {
+            panic!("expected Paragraph, got {block:?}");
+        };
+        paragraphs.push(text_block);
+    }
+    paragraphs
+}
+
+/// Three text pages yield three paragraph blocks in page order, each with a
+/// `Page` span that covers the whole page text.
+#[test]
+fn three_page_pdf_emits_one_paragraph_block_per_page() {
+    let page_texts = [
+        Some("Hello page 1"),
+        Some("Hello page 2"),
+        Some("Hello page 3"),
+    ];
+    let bytes = build_text_pdf(&page_texts);
+    let fx = fixture_for("docs/three.pdf", &bytes);
+    let extracted = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
+    let doc = extracted.expect("3-page extraction must succeed");
+
+    // Document-level fields: title falls back to the file stem here.
+    assert_eq!(doc.title, "three");
+    assert_eq!(doc.lang.0, "und");
+    assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION);
+    assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(3.into()));
+
+    let blocks = paragraph_blocks(&doc);
+    assert_eq!(blocks.len(), 3);
+    for (want_page, b) in (1u32..).zip(blocks.iter()) {
+        match &b.common.source_span {
+            SourceSpan::Page {
+                page,
+                char_start,
+                char_end,
+            } => {
+                assert_eq!(*page, want_page);
+                assert_eq!(*char_start, Some(0));
+                let chars = b.text.chars().count() as u32;
+                assert_eq!(*char_end, Some(chars));
+            }
+            other => panic!("expected Page span, got {other:?}"),
+        }
+        let expected = format!("Hello page {want_page}");
+        assert!(
+            b.text.contains(&expected),
+            "page {want_page} text mismatch: {:?}",
+            b.text
+        );
+    }
+}
+
+/// A page without a content stream still gets a (blank) paragraph block and
+/// is reported through exactly one `Warning` provenance event.
+#[test]
+fn empty_page_emits_warning_and_empty_paragraph() {
+    let bytes = build_text_pdf(&[Some("page one text"), None, Some("page three text")]);
+    let fx = fixture_for("docs/scanned-mixed.pdf", &bytes);
+    let doc = PdfTextExtractor::new()
+        .extract(&fx.ctx(), &bytes)
+        .expect("scanned-mixed extraction must succeed");
+
+    let blocks = paragraph_blocks(&doc);
+    assert_eq!(blocks.len(), 3);
+    let blank = blocks[1];
+    assert!(blank.text.is_empty(), "page 2 should have empty text");
+    assert!(blank.inlines.is_empty(), "page 2 inlines should be empty");
+    match &blank.common.source_span {
+        SourceSpan::Page {
+            page,
+            char_start,
+            char_end,
+        } => {
+            assert_eq!(*page, 2);
+            assert_eq!(*char_start, Some(0));
+            assert_eq!(*char_end, Some(0));
+        }
+        other => panic!("expected Page, got {other:?}"),
+    }
+
+    // Only the notes of `Warning` events matter for the remaining checks.
+    let warning_notes: Vec<Option<&str>> = doc
+        .provenance
+        .events
+        .iter()
+        .filter(|e| e.kind == ProvenanceKind::Warning)
+        .map(|e| e.note.as_deref())
+        .collect();
+    assert_eq!(warning_notes.len(), 1, "exactly one warning for the empty page");
+    assert!(
+        warning_notes[0]
+            .unwrap_or("")
+            .contains("page2 empty (scanned candidate)"),
+        "warning note must mark page 2 as scanned candidate: {:?}",
+        warning_notes[0]
+    );
+}
+
+#[test]
+fn encrypted_pdf_returns_helpful_error() {
+    let bytes = make_encrypted_pdf();
+    let fx = fixture_for("docs/encrypted.pdf", &bytes);
+    let err = PdfTextExtractor::new()
+        .extract(&fx.ctx(), &bytes)
+        .expect_err("encrypted PDF must be refused");
+    let msg = format!("{err:#}");
+    assert!(
+        msg.contains("encrypted"),
+        "error must mention encryption: {msg}"
+    );
+    assert!(
+        msg.contains("qpdf") || msg.contains("decrypt"),
+        "error should point at remediation: {msg}"
+    );
+}
+
+/// Bytes without a `%PDF-` header must fail in the parse step.
+#[test]
+fn corrupt_header_returns_error() {
+    let bytes = corrupt_pdf();
+    let fx = fixture_for("docs/corrupt.pdf", &bytes);
+    let err = PdfTextExtractor::new()
+        .extract(&fx.ctx(), &bytes)
+        .expect_err("corrupt PDF must error");
+    let msg = format!("{err:#}");
+    // Every error from this crate carries the "kebab-parse-pdf" prefix, so
+    // looking for "pdf" or "parse" alone would also accept the
+    // unsupported-media and encrypted-PDF errors. Pin the context string
+    // attached to the lopdf load failure instead.
+    assert!(
+        msg.contains("failed to parse PDF"),
+        "error must mention PDF parse failure: {msg}"
+    );
+}
+
+/// `metadata.user.pdf.page_count` and the number of emitted blocks both
+/// equal the number of pages in the fixture.
+#[test]
+fn page_count_matches_actual_count() {
+    let five_pages = ["a", "b", "c", "d", "e"].map(Some);
+    let bytes = build_text_pdf(&five_pages);
+    let fx = fixture_for("docs/five.pdf", &bytes);
+    let extracted = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
+    let doc = extracted.expect("5-page extraction must succeed");
+
+    let reported = &doc.metadata.user["pdf"]["page_count"];
+    assert_eq!(*reported, Value::Number(5.into()));
+    assert_eq!(doc.blocks.len(), 5);
+}
+
+/// A UTF-16BE (BOM-prefixed) `/Title` becomes the document title, and
+/// `/Producer` is surfaced under `metadata.user.pdf.producer`.
+#[test]
+fn info_dict_title_utf16be_bom_decoded() {
+    // UTF-16BE with BOM is the standard PDF path for non-ASCII metadata.
+    // Body text in non-Latin scripts is not exercised here (CID font
+    // support is out of scope for v1); the metadata path is in scope.
+    let korean_title = "케밥 문서";
+    let info = InfoDict {
+        title: Some(utf16be_bom(korean_title)),
+        producer: Some("kebab-test"),
+        creator: None,
+    };
+    let bytes = build_text_pdf_with_info(&[Some("body")], &info);
+    let fx = fixture_for("docs/korean-title.pdf", &bytes);
+    let extracted = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
+    let doc = extracted.expect("PDF with UTF-16BE Title must extract");
+
+    assert_eq!(doc.title, korean_title);
+    let producer = &doc.metadata.user["pdf"]["producer"];
+    assert_eq!(*producer, Value::String("kebab-test".into()));
+}
+
+/// Without an `/Info` dictionary the title is the workspace file name minus
+/// its extension.
+#[test]
+fn info_dict_title_falls_back_to_filename_when_missing() {
+    let bytes = build_text_pdf(&[Some("body")]);
+    let fx = fixture_for("docs/no-info.pdf", &bytes);
+    let extracted = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
+    let doc = extracted.expect("no-info PDF must extract");
+    assert_eq!(doc.title, "no-info");
+}
+
+/// Extracting the same bytes twice gives identical JSON once the run-time
+/// provenance timestamps are stripped.
+#[test]
+fn determinism_identical_bytes_produce_identical_documents() {
+    let bytes = build_text_pdf(&[Some("alpha"), Some("beta"), Some("gamma")]);
+    let fx = fixture_for("docs/det.pdf", &bytes);
+
+    // One full extract -> JSON -> strip pass; `label` is the panic message
+    // used if that extraction fails.
+    let run = |label: &str| {
+        let doc = PdfTextExtractor::new()
+            .extract(&fx.ctx(), &bytes)
+            .expect(label);
+        let mut json = serde_json::to_value(doc).unwrap();
+        strip_dynamic_at(&mut json);
+        json
+    };
+
+    let a = run("first extract");
+    let b = run("second extract");
+    assert_eq!(a, b, "two extracts of identical bytes must be byte-equal");
+}
+
+/// Pins the serialized shape of a three-page document: version fields,
+/// block kinds, page spans, and metadata enum spellings.
+#[test]
+fn snapshot_three_page_canonical_document_stable() {
+    let bytes = build_text_pdf(&[Some("p1"), Some("p2"), Some("p3")]);
+    let fx = fixture_for("docs/snapshot.pdf", &bytes);
+    let extracted = PdfTextExtractor::new().extract(&fx.ctx(), &bytes);
+    let doc = extracted.expect("snapshot extract");
+    let mut json = serde_json::to_value(&doc).unwrap();
+    strip_dynamic_at(&mut json);
+
+    // Spot-check the load-bearing shape instead of committing a full golden
+    // file: the full JSON embeds derived ids, which would churn on any
+    // (separate, intentional) change to how ids are computed.
+    let text = |s: &str| Value::String(s.into());
+    assert_eq!(json["parser_version"], text("pdf-text-v1"));
+    assert_eq!(json["lang"], text("und"));
+    assert_eq!(json["schema_version"], Value::Number(1.into()));
+    assert_eq!(json["doc_version"], Value::Number(1.into()));
+
+    let blocks = json["blocks"].as_array().unwrap();
+    assert_eq!(blocks.len(), 3);
+    for (page_no, block) in (1u64..).zip(blocks) {
+        let span = &block["common"]["source_span"];
+        assert_eq!(block["kind"], text("paragraph"));
+        assert_eq!(span["kind"], text("page"));
+        assert_eq!(span["page"], Value::Number(page_no.into()));
+    }
+
+    assert_eq!(json["metadata"]["source_type"], text("paper"));
+    assert_eq!(json["metadata"]["trust_level"], text("primary"));
+}
--- a/tasks/p7/p7-1-pdf-text-extractor.md
+++ b/tasks/p7/p7-1-pdf-text-extractor.md
@@ -3,7 +3,7 @@ phase: P7
 component: kebab-parse-pdf (text extractor)
 task_id: p7-1
 title: "Text PDF extractor → CanonicalDocument with page-level blocks"
-status: planned
+status: completed
 depends_on: [p0-1, p1-6]
 unblocks: [p7-2]
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md