Files
kebab/crates/kebab-parse-pdf/tests/extractor.rs
altair823 685007789a style: cargo fmt --all (round 4 ingest log feature follow-up)
Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이
test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all
적용. 모든 import alphabetical reorder + line wrapping 정합.

추가 untracked artifact 동시 commit:
- docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT)
- docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT)

workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 04:18:40 +00:00

295 lines
9.6 KiB
Rust

//! Integration tests for `kebab_parse_pdf::PdfTextExtractor` (P7-1).
mod common;
use kebab_core::{Block, Extractor, ProvenanceKind, SourceSpan};
use kebab_parse_pdf::PdfTextExtractor;
use serde_json::Value;
use crate::common::{
InfoDict, build_text_pdf, build_text_pdf_with_info, corrupt_pdf, fixture_for,
make_encrypted_pdf, strip_dynamic_at, utf16be_bom,
};
/// Returns every block of `doc` as a paragraph text block, in document order.
///
/// # Panics
///
/// Panics on the first block that is not a `Block::Paragraph`.
fn paragraph_blocks(doc: &kebab_core::CanonicalDocument) -> Vec<&kebab_core::TextBlock> {
    let mut paragraphs = Vec::new();
    for block in doc.blocks.iter() {
        match block {
            Block::Paragraph(text) => paragraphs.push(text),
            other => panic!("expected Paragraph, got {other:?}"),
        }
    }
    paragraphs
}
/// A three-page text PDF yields one paragraph block per page, each carrying a
/// `SourceSpan::Page` whose character range spans the block's whole text.
#[test]
fn three_page_pdf_emits_one_paragraph_block_per_page() {
    let pages = [
        Some("Hello page 1"),
        Some("Hello page 2"),
        Some("Hello page 3"),
    ];
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/three.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("3-page extraction must succeed");

    // Document-level fields.
    assert_eq!(doc.title, "three");
    assert_eq!(doc.lang.0, "und");
    assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION);
    assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::from(3));

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    // Page numbers are 1-based, so pair each block with a counter from 1.
    for (want_page, block) in (1u32..).zip(blocks.iter()) {
        match &block.common.source_span {
            SourceSpan::Page {
                page,
                char_start,
                char_end,
            } => {
                // The span is measured in chars, not bytes.
                let text_len = block.text.chars().count() as u32;
                assert_eq!(*page, want_page);
                assert_eq!(*char_start, Some(0));
                assert_eq!(*char_end, Some(text_len));
            }
            other => panic!("expected Page span, got {other:?}"),
        }

        let needle = format!("Hello page {want_page}");
        assert!(
            block.text.contains(needle.as_str()),
            "page {want_page} text mismatch: {:?}",
            block.text
        );
    }
}
/// A page without text still produces a (empty) paragraph block with a
/// zero-length page span, plus exactly one provenance warning naming the page.
#[test]
fn empty_page_emits_warning_and_empty_paragraph() {
    let pages = [Some("page one text"), None, Some("page three text")];
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/scanned-mixed.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("scanned-mixed extraction must succeed");

    let blocks = paragraph_blocks(&doc);
    assert_eq!(blocks.len(), 3);

    // Index 1 is the page built from `None`.
    let second = blocks[1];
    assert!(second.text.is_empty(), "page 2 should have empty text");
    assert!(second.inlines.is_empty(), "page 2 inlines should be empty");
    match &second.common.source_span {
        SourceSpan::Page {
            page,
            char_start,
            char_end,
        } => {
            assert_eq!(*page, 2);
            assert_eq!(*char_start, Some(0));
            assert_eq!(*char_end, Some(0));
        }
        other => panic!("expected Page, got {other:?}"),
    }

    let mut warnings = Vec::new();
    for event in doc.provenance.events.iter() {
        if event.kind == ProvenanceKind::Warning {
            warnings.push(event);
        }
    }
    assert_eq!(warnings.len(), 1, "exactly one warning for the empty page");

    let warning = warnings[0];
    // A missing note is treated as an empty one, so the check below fails.
    let note = warning.note.as_deref().unwrap_or("");
    assert!(
        note.contains("page2 empty (scanned candidate)"),
        "warning note must mark page 2 as scanned candidate: {:?}",
        warning.note
    );
}
/// Extracting an encrypted PDF must fail with an error that both names the
/// cause and hints at a remedy.
#[test]
fn encrypted_pdf_returns_helpful_error() {
    let bytes = make_encrypted_pdf();
    let fx = fixture_for("docs/encrypted.pdf", &bytes);
    let ctx = fx.ctx();
    let outcome = PdfTextExtractor::new().extract(&ctx, &bytes);
    let err = outcome.expect_err("encrypted PDF must be refused");

    // Alternate formatting, so the whole message is inspected.
    let msg = format!("{err:#}");
    let names_cause = msg.contains("encrypted");
    assert!(names_cause, "error must mention encryption: {msg}");

    let names_remedy = ["qpdf", "decrypt"].iter().any(|hint| msg.contains(*hint));
    assert!(names_remedy, "error should point at remediation: {msg}");
}
/// Extracting a PDF produced by `corrupt_pdf()` must fail with an error that
/// mentions either "pdf" or "parse".
///
/// Both keywords are matched case-insensitively. Previously only "pdf" was
/// compared against the lowercased message while "parse" was compared against
/// the raw one, so a message such as "Parse error" was handled inconsistently.
#[test]
fn corrupt_header_returns_error() {
    let bytes = corrupt_pdf();
    let fx = fixture_for("docs/corrupt.pdf", &bytes);
    let err = PdfTextExtractor::new()
        .extract(&fx.ctx(), &bytes)
        .expect_err("corrupt PDF must error");
    let msg = format!("{err:#}");
    // Lowercase once and test every keyword against the same normalized text;
    // the original `msg` is kept for the failure output.
    let lowered = msg.to_lowercase();
    assert!(
        lowered.contains("pdf") || lowered.contains("parse"),
        "error must mention PDF parse failure: {msg}"
    );
}
/// The `pdf.page_count` metadata entry and the number of emitted blocks both
/// equal the number of pages in the input.
#[test]
fn page_count_matches_actual_count() {
    let pages = ["a", "b", "c", "d", "e"].map(Some);
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/five.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("5-page extraction must succeed");

    let reported = &doc.metadata.user["pdf"]["page_count"];
    assert_eq!(*reported, Value::from(5));
    assert_eq!(doc.blocks.len(), 5);
}
/// A Korean Info-dict Title stored as UTF-16BE with a BOM becomes the document
/// title, and the Producer entry is surfaced under `pdf.producer`.
#[test]
fn info_dict_title_utf16be_bom_decoded() {
    // UTF-16BE with BOM is the standard PDF encoding for non-ASCII metadata.
    // Body text in non-Latin scripts is deliberately not exercised here (CID
    // font support is out of scope for v1); only the metadata path is.
    let title = utf16be_bom("케밥 문서");
    let info = InfoDict {
        title: Some(title),
        producer: Some("kebab-test"),
        creator: None,
    };
    let pages = [Some("body")];
    let bytes = build_text_pdf_with_info(&pages, &info);
    let fx = fixture_for("docs/korean-title.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("PDF with UTF-16BE Title must extract");

    assert_eq!(doc.title, "케밥 문서");
    let producer = &doc.metadata.user["pdf"]["producer"];
    assert_eq!(*producer, Value::from("kebab-test"));
}
/// A UTF-16BE Title containing a supplementary-plane character round-trips
/// into the document title.
#[test]
fn info_dict_title_utf16be_surrogate_pair_decoded() {
    // U+1F959 STUFFED FLATBREAD lies outside the BMP, so its UTF-16BE form is
    // the surrogate pair D83E DD59. A BMP-only title would never reach the
    // pair-joining path of `String::from_utf16_lossy`; this one does.
    let title = utf16be_bom("케밥 🥙 문서");
    let info = InfoDict {
        title: Some(title),
        producer: None,
        creator: None,
    };
    let pages = [Some("body")];
    let bytes = build_text_pdf_with_info(&pages, &info);
    let fx = fixture_for("docs/emoji-title.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("PDF with surrogate-pair Title must extract");

    assert_eq!(doc.title, "케밥 🥙 문서");
}
/// A BOM-less Title holding a high byte is decoded byte-as-char rather than
/// being replaced with U+FFFD.
#[test]
fn info_dict_title_pdfdocencoding_latin1_high_bytes_decoded() {
    // 0xE9 is 'é'. Decoding these bytes with `from_utf8_lossy` would yield
    // U+FFFD; the byte-as-char path keeps the character intact.
    let raw_title = b"Caf\xE9".to_vec();
    let info = InfoDict {
        title: Some(raw_title),
        producer: None,
        creator: None,
    };
    let pages = [Some("body")];
    let bytes = build_text_pdf_with_info(&pages, &info);
    let fx = fixture_for("docs/cafe-title.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("PDF with Latin-1 Title must extract");

    assert_eq!(doc.title, "Café");
}
/// A PDF built without an explicit Info dictionary takes its title from the
/// file stem of the fixture path.
#[test]
fn info_dict_title_falls_back_to_filename_when_missing() {
    let pages = [Some("body")];
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/no-info.pdf", &bytes);
    let ctx = fx.ctx();
    let outcome = PdfTextExtractor::new().extract(&ctx, &bytes);
    let doc = outcome.expect("no-info PDF must extract");

    // "docs/no-info.pdf" -> "no-info"
    assert_eq!(doc.title, "no-info");
}
/// Two extractions of the same bytes serialize to equal JSON once
/// `strip_dynamic_at` has been applied to both.
#[test]
fn determinism_identical_bytes_produce_identical_documents() {
    let pages = [Some("alpha"), Some("beta"), Some("gamma")];
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/det.pdf", &bytes);

    // Each run uses a fresh extractor and a fresh context.
    let extract_as_json = |label: &str| -> Value {
        let doc = PdfTextExtractor::new()
            .extract(&fx.ctx(), &bytes)
            .expect(label);
        serde_json::to_value(doc).unwrap()
    };

    let mut first = extract_as_json("first extract");
    let mut second = extract_as_json("second extract");
    strip_dynamic_at(&mut first);
    strip_dynamic_at(&mut second);

    assert_eq!(
        first, second,
        "two extracts of identical bytes must be byte-equal"
    );
}
/// Pins the load-bearing shape of the serialized document for a three-page
/// PDF: version fields, block kinds, page spans, and metadata defaults.
#[test]
fn snapshot_three_page_canonical_document_stable() {
    let pages = [Some("p1"), Some("p2"), Some("p3")];
    let bytes = build_text_pdf(&pages);
    let fx = fixture_for("docs/snapshot.pdf", &bytes);
    let ctx = fx.ctx();
    let doc = PdfTextExtractor::new()
        .extract(&ctx, &bytes)
        .expect("snapshot extract");
    let mut json = serde_json::to_value(&doc).unwrap();
    strip_dynamic_at(&mut json);

    // Only selected fields are checked instead of a full golden file: the
    // full JSON embeds BLAKE3 ids, which would change if the tuple shape
    // passed to `id_from(...)` ever shifted, and that would be a separate,
    // intentional break.
    assert_eq!(json["parser_version"], Value::from("pdf-text-v1"));
    assert_eq!(json["lang"], Value::from("und"));
    assert_eq!(json["schema_version"], Value::from(1));
    assert_eq!(json["doc_version"], Value::from(1));

    let blocks = json["blocks"].as_array().unwrap();
    assert_eq!(blocks.len(), 3);
    // Pages are 1-based, so pair each block with a counter starting at 1.
    for (page_no, block) in (1u64..).zip(blocks.iter()) {
        assert_eq!(block["kind"], Value::from("paragraph"));
        let span = &block["common"]["source_span"];
        assert_eq!(span["kind"], Value::from("page"));
        assert_eq!(span["page"], Value::from(page_no));
    }

    let metadata = &json["metadata"];
    assert_eq!(metadata["source_type"], Value::from("paper"));
    assert_eq!(metadata["trust_level"], Value::from("primary"));
}