review(p7-1): 회차 1 지적 반영

- Cargo.toml: 사용하지 않는 deps 제거 (`kebab-config`, `thiserror`,
  `pdf-extract`, dev `tempfile` / `serde_json` / `serde`). 특히
  `pdf-extract` 가 끌어오던 transitive ~150 crate (pom, postscript,
  type1-encoding-parser, adobe-cmap-parser, euclid, chrono, md5,
  linked-hash-map …) 가 모두 사라짐. lopdf 만 남음.
- info.rs: BOM 없는 PDFDocEncoded Title 디코드 버그 수정. `from_utf8_lossy`
  는 0x80–0xFF 를 U+FFFD 로 치환해 "Café" 같은 레거시 타이틀을 망가뜨림.
  byte → `char` 직접 캐스팅 (Latin-1 디코더) 로 교체. 회귀 테스트
  `info_dict_title_pdfdocencoding_latin1_high_bytes_decoded` 추가.
- info.rs: 모듈 doc 의 "Latin-1 superset" 부정확 표현 정정 — PDFDocEncoding
  은 0x18–0x1F / 0x80–0x9F 영역에서 Latin-1 과 다름.
- lib.rs: `saturating_sub(1)` 가 page=0 케이스를 silent 흡수하던 부분에
  `debug_assert!` 추가. release 는 saturating fallback 유지 (panic 보다
  garbled order 가 운영에 유리).
- tests: UTF-16 surrogate pair 커버리지 갭 보완 — 🥙 (U+1F959) 가 포함된
  타이틀로 `String::from_utf16_lossy` 의 페어-결합 경로 검증.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-02 08:40:40 +00:00
parent 5a158d7343
commit 8de08cf38c
5 changed files with 64 additions and 97 deletions

79
Cargo.lock generated
View File

@@ -24,15 +24,6 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "adobe-cmap-parser"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
dependencies = [
"pom",
]
[[package]]
name = "ahash"
version = "0.8.12"
@@ -2123,15 +2114,6 @@ version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"
[[package]]
name = "euclid"
version = "0.20.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
dependencies = [
"num-traits",
]
[[package]]
name = "event-listener"
version = "5.4.1"
@@ -3633,14 +3615,9 @@ version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"kebab-config",
"kebab-core",
"lopdf 0.32.0",
"pdf-extract",
"serde",
"lopdf",
"serde_json",
"tempfile",
"thiserror 2.0.18",
"time",
"tracing",
]
@@ -4582,24 +4559,6 @@ dependencies = [
"weezl",
]
[[package]]
name = "lopdf"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
dependencies = [
"encoding_rs",
"flate2",
"indexmap 2.14.0",
"itoa",
"log",
"md-5",
"nom 7.1.3",
"rangemap",
"time",
"weezl",
]
[[package]]
name = "lru"
version = "0.12.5"
@@ -5350,21 +5309,6 @@ dependencies = [
"stfu8",
]
[[package]]
name = "pdf-extract"
version = "0.7.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
dependencies = [
"adobe-cmap-parser",
"encoding_rs",
"euclid",
"lopdf 0.34.0",
"postscript",
"type1-encoding-parser",
"unicode-normalization",
]
[[package]]
name = "percent-encoding"
version = "2.3.2"
@@ -5468,12 +5412,6 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "pom"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
[[package]]
name = "portable-atomic"
version = "1.13.1"
@@ -5489,12 +5427,6 @@ dependencies = [
"portable-atomic",
]
[[package]]
name = "postscript"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
[[package]]
name = "potential_utf"
version = "0.1.5"
@@ -7615,15 +7547,6 @@ dependencies = [
"rand 0.9.4",
]
[[package]]
name = "type1-encoding-parser"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
dependencies = [
"pom",
]
[[package]]
name = "typenum"
version = "1.20.0"

View File

@@ -9,22 +9,18 @@ description = "Text PDF extractor (per-page text + page citation) for the keba
[dependencies]
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }
# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
# is the only stable per-page API across the pdf-extract / lopdf
# pair (pdf-extract 0.7 still exposes only whole-document calls).
# pdf-extract is intentionally NOT pulled in here — its ~150 transitive
# crates (pom, postscript, type1-encoding-parser, …) buy us nothing
# at v1 (we don't call its whole-doc API), and the future scanned-PDF
# OCR fallback can re-add it when it actually needs it.
lopdf = "0.32"
# Whole-document sanity-check call; covers a few format errors that
# lopdf swallows silently. Per-page text is sourced from lopdf only.
pdf-extract = "0.7"
[dev-dependencies]
tempfile = { workspace = true }
blake3 = { workspace = true }
serde_json = { workspace = true }

View File

@@ -2,10 +2,15 @@
//!
//! PDFs may carry a `/Info` trailer dictionary with `Title`,
//! `Producer`, `Creator`, etc. Strings are encoded as either
//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
//! not an error.
//! UTF-16BE prefixed with the BOM `0xFE 0xFF` OR PDFDocEncoding
//! (which agrees with Latin-1 over `0x20–0x7E` + `0xA0–0xFF` and
//! diverges in the `0x18–0x1F` / `0x80–0x9F` ranges). We decode
//! BOM'd strings as proper UTF-16BE; non-BOM strings are decoded
//! as Latin-1 (byte → `char`), which is correct for the common
//! ASCII case and a best-effort approximation for the divergent
//! PDFDocEncoding ranges (full PDFDocEncoding tables aren't worth
//! the maintenance for what is effectively legacy metadata). All
//! fields are optional — a missing `/Info` dict is not an error.
#[derive(Default)]
pub(crate) struct InfoDict {
@@ -61,10 +66,11 @@ fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
}
}
// PDFDocEncoding overlaps Latin-1 for the printable range we care
// about, and Latin-1 is byte-identical to UTF-8 only for ASCII;
// `from_utf8_lossy` is the conservative call here. ASCII-only
// PDFs (the common case) round-trip cleanly.
let s = String::from_utf8_lossy(bytes).into_owned();
// PDFDocEncoding fallback (no BOM). Direct byte → char cast is
// a Latin-1 decoder: ASCII (0x00–0x7F) round-trips, and
// 0xA0–0xFF maps to the matching Unicode code point. `from_utf8_lossy`
// would have replaced 0x80–0xFF with U+FFFD, mangling legacy
// PDFDocEncoded titles like "Café".
let s: String = bytes.iter().map(|&b| b as char).collect();
if s.is_empty() { None } else { Some(s) }
}

View File

@@ -131,8 +131,13 @@ impl Extractor for PdfTextExtractor {
char_start: Some(0),
char_end: Some(char_count),
};
// ordinal = page - 1; saturating_sub guards the (shouldn't-happen)
// case where lopdf hands back a 0-indexed page key.
// lopdf's `get_pages()` is 1-based by contract. A 0-key would
// collapse two pages onto the same ordinal (silently breaking
// ordinal-based sorting downstream), so we assert the
// invariant in dev builds. The release fallback still uses
// saturating_sub so a future lopdf regression degrades to
// garbled order rather than panic.
debug_assert!(page_num >= 1, "lopdf get_pages() returned 0-based page key");
let ordinal = page_num.saturating_sub(1);
let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
let common = CommonBlock {

View File

@@ -177,6 +177,43 @@ fn info_dict_title_utf16be_bom_decoded() {
);
}
#[test]
fn info_dict_title_utf16be_surrogate_pair_decoded() {
    // The title embeds 🥙 (U+1F959 STUFFED FLATBREAD), a supplementary-plane
    // code point whose UTF-16BE form is the surrogate pair D83E DD59.
    // Titles made only of BMP characters never reach the pair-joining
    // path of `String::from_utf16_lossy`, so this case pins that path.
    let expected_title = "케밥 🥙 문서";
    let info = InfoDict {
        title: Some(utf16be_bom(expected_title)),
        producer: None,
        creator: None,
    };

    let pdf = build_text_pdf_with_info(&[Some("body")], &info);
    let fixture = fixture_for("docs/emoji-title.pdf", &pdf);

    let extracted = PdfTextExtractor::new()
        .extract(&fixture.ctx(), &pdf)
        .expect("PDF with surrogate-pair Title must extract");

    assert_eq!(extracted.title, expected_title);
}
#[test]
fn info_dict_title_pdfdocencoding_latin1_high_bytes_decoded() {
    // No BOM here: the raw title bytes end in 0xE9, which reads as 'é'
    // under Latin-1. A `from_utf8_lossy` decode would turn that byte into
    // U+FFFD, so the expected title only survives via the byte-as-char path.
    let raw_title = b"Caf\xE9".to_vec();
    let info = InfoDict {
        title: Some(raw_title),
        producer: None,
        creator: None,
    };

    let pdf = build_text_pdf_with_info(&[Some("body")], &info);
    let fixture = fixture_for("docs/cafe-title.pdf", &pdf);

    let extracted = PdfTextExtractor::new()
        .extract(&fixture.ctx(), &pdf)
        .expect("PDF with Latin-1 Title must extract");

    assert_eq!(extracted.title, "Café");
}
#[test]
fn info_dict_title_falls_back_to_filename_when_missing() {
let bytes = build_text_pdf(&[Some("body")]);