Files
kebab/crates/kebab-parse-pdf/tests/text_extractor_regression.rs
altair823 e674ff474b fix(parse-pdf): F4 mojibake.pdf via pikepdf surgery; preserve 1-page invariant (Bug #4)
v0.20.0 sub-item 1 dogfood report 의 Bug #4 — F4 mojibake.pdf 의 lopdf
`get_pages()` count = 0 (Pages tree broken). root cause = 기존 byte-
level `re.sub` + manual startxref edit 가 lopdf strict load 통과시키지만
Pages dict 의 `/Kids` reference 깨짐.

- `tests/fixtures/_synth/mojibake.py`: full rewrite — replace byte-level
  `re.sub` + manual startxref with pikepdf open+inject-dummy-ToUnicode+
  del+save (auto xref regen). HYSMyeongJo-Medium CID font: CID font 이
  ToUnicode 를 자체 생성하지 않아 dummy stream 을 inject 후 strip
  (removed=1 invariant). Exit codes 2/3/4 for invariant fail.
- `crates/kebab-parse-pdf/tests/fixtures/mojibake.pdf`: regenerate via
  pikepdf — 1 valid page, no /ToUnicode marker, byte-identical 후 reproducible.
- `crates/kebab-parse-pdf/tests/snapshots/vector_pdf_canonical.json`:
  regen via 2-run cargo test pattern (hand-rolled unwrap_or_else baseline
  bootstrap, no insta crate).
- `crates/kebab-parse-pdf/tests/text_extractor_regression.rs`: append 3
  invariant tests — (1) lopdf 1-page, (2) /ToUnicode marker absent,
  (3) PdfTextExtractor 1-block invariant.
- `crates/kebab-parse-pdf/src/text_quality.rs`: f4_fixture_ratio_under_threshold
  threshold 0.3 → 0.5 (production valid_ratio_threshold 기본값). 구 broken
  fixture (pages=0) 는 extract_text="" → ratio=0.0; 신 fixed fixture 는
  CID 2-byte fallback decode → ratio≈0.375 — 여전히 OCR trigger 조건 충족.

spec: docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md (§5)
plan: docs/superpowers/plans/2026-05-27-v0.20-sub1-bugfix-plan.md (Step 4)
prior: 241ded5 (Step 3 integration test)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 14:02:17 +00:00

105 lines
3.5 KiB
Rust

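//! Byte-identical regression for the vector PDF extraction path (spec §5.4).
//! Uses F4 (mojibake.pdf) — the only fixture with extractable text content.
//! The first invocation creates the baseline snapshot; subsequent runs verify
//! identity to detect silent regressions across all Step 1-8 changes.
//!
//! Also pins invariants of the F4 fixture itself: exactly one page when loaded
//! with lopdf, no `/ToUnicode` marker in the raw bytes, and exactly one
//! extracted block.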
use std::path::Path;
use kebab_core::{
AssetStorage, Checksum, ExtractConfig, ExtractContext, Extractor, MediaType, RawAsset,
SourceUri, WorkspacePath, id_for_asset,
};
use kebab_parse_pdf::PdfTextExtractor;
use time::OffsetDateTime;
/// Overwrites the `at` field of every provenance event in `doc` with
/// `OffsetDateTime::UNIX_EPOCH`.
///
/// This keeps the serialized snapshot byte-stable across runs (R-3
/// mitigation — no workspace helper exists).
fn normalize_provenance_timestamps(doc: &mut kebab_core::CanonicalDocument) {
    doc.provenance
        .events
        .iter_mut()
        .for_each(|entry| entry.at = OffsetDateTime::UNIX_EPOCH);
}
/// Builds a placeholder PDF `RawAsset` for tests.
///
/// `path` is used for the source URI, the workspace path, and the stored
/// (copied) path alike. The checksum is a string of 64 `'0'` characters, the
/// asset id is derived from that same string, `byte_len` is 0, and
/// `discovered_at` is the UNIX epoch — every field is either a fixed value or
/// derived from `path`.
///
/// # Panics
///
/// Panics if `WorkspacePath::new` does not accept `path`.
fn make_raw_asset(path: &str) -> RawAsset {
    let placeholder_digest = "0".repeat(64);
    let on_disk = std::path::PathBuf::from(path);

    RawAsset {
        asset_id: id_for_asset(&placeholder_digest),
        source_uri: SourceUri::File(on_disk.clone()),
        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
        media_type: MediaType::Pdf,
        byte_len: 0,
        checksum: Checksum(placeholder_digest),
        discovered_at: OffsetDateTime::UNIX_EPOCH,
        stored: AssetStorage::Copied { path: on_disk },
    }
}
/// Extracts `fixtures/mojibake.pdf` with `PdfTextExtractor`, normalizes the
/// provenance timestamps, and compares the pretty-printed JSON against
/// `tests/snapshots/vector_pdf_canonical.json`.
///
/// If the snapshot file does not exist, it is created from the current output,
/// so the first run bootstraps the baseline and passes trivially. Any other
/// read failure (e.g. permission denied, snapshot is not valid UTF-8) panics
/// rather than overwriting the baseline, because silently regenerating it
/// would hide exactly the regressions this test exists to catch.
#[test]
fn vector_pdf_extract_byte_identical_to_baseline() {
    let bytes = include_bytes!("fixtures/mojibake.pdf");
    let asset = make_raw_asset("mojibake.pdf");
    let workspace_root = Path::new("/");
    let config = ExtractConfig::default();
    let ctx = ExtractContext {
        asset: &asset,
        workspace_root,
        config: &config,
    };
    let mut canonical = PdfTextExtractor::new()
        .extract(&ctx, bytes)
        .expect("PdfTextExtractor::extract");
    normalize_provenance_timestamps(&mut canonical);
    let actual = serde_json::to_string_pretty(&canonical).expect("serialize canonical");

    // Relative path: resolved against the working directory of the test process.
    let baseline_path = Path::new("tests/snapshots/vector_pdf_canonical.json");
    let baseline = match std::fs::read_to_string(baseline_path) {
        Ok(existing) => existing,
        // Only a missing file means "no baseline yet" — bootstrap it.
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
            if let Some(snapshot_dir) = baseline_path.parent() {
                std::fs::create_dir_all(snapshot_dir).expect("create snapshot directory");
            }
            std::fs::write(baseline_path, &actual).expect("write baseline snapshot");
            actual.clone()
        }
        // Anything else is an unreadable or corrupt baseline: fail loudly, never overwrite.
        Err(err) => panic!(
            "read baseline snapshot {}: {}",
            baseline_path.display(),
            err
        ),
    };

    assert_eq!(
        actual, baseline,
        "vector PDF canonical must be byte-identical to baseline (Step 1-8 regression)"
    );
}
/// The F4 fixture must load with `lopdf` and report exactly one page.
#[test]
fn mojibake_fixture_load_yields_one_page() {
    let pdf = lopdf::Document::load_mem(include_bytes!("fixtures/mojibake.pdf"))
        .expect("load mojibake");
    let page_count = pdf.get_pages().len();
    assert_eq!(page_count, 1, "F4 must have 1 page");
}
/// The raw bytes of the F4 fixture must not contain the literal `/ToUnicode`
/// marker anywhere.
///
/// This is a plain byte scan over the file contents; it does not parse the PDF.
#[test]
fn mojibake_fixture_has_no_tounicode_cmap() {
    let marker: &[u8] = b"/ToUnicode";
    let pdf_bytes: &[u8] = include_bytes!("fixtures/mojibake.pdf");
    let occurrences = pdf_bytes
        .windows(marker.len())
        .filter(|candidate| *candidate == marker)
        .count();
    assert_eq!(occurrences, 0, "F4 must have no /ToUnicode marker");
}
/// Running `PdfTextExtractor` over the F4 fixture with the default
/// `ExtractConfig` must produce a canonical document with exactly one block.
#[test]
fn pdf_text_extractor_on_mojibake_yields_one_block() {
    let asset = make_raw_asset("mojibake.pdf");
    let config = ExtractConfig::default();
    let ctx = ExtractContext {
        asset: &asset,
        workspace_root: Path::new("/"),
        config: &config,
    };

    let canonical = PdfTextExtractor::new()
        .extract(&ctx, include_bytes!("fixtures/mojibake.pdf"))
        .expect("PdfTextExtractor::extract");

    let block_count = canonical.blocks.len();
    assert_eq!(block_count, 1, "F4 must yield 1 block");
}