review(p7-1): 회차 1 지적 반영
- Cargo.toml: 사용하지 않는 deps 제거 (`kebab-config`, `thiserror`, `pdf-extract`, dev `tempfile` / `serde_json` / `serde`). 특히 `pdf-extract` 가 끌어오던 transitive ~150 crate (pom, postscript, type1-encoding-parser, adobe-cmap-parser, euclid, chrono, md5, linked-hash-map …) 가 모두 사라짐. lopdf 만 남음. - info.rs: BOM 없는 PDFDocEncoded Title 디코드 버그 수정. `from_utf8_lossy` 는 0x80–0xFF 를 U+FFFD 로 치환해 "Café" 같은 레거시 타이틀을 망가뜨림. byte → `char` 직접 캐스팅 (Latin-1 디코더) 로 교체. 회귀 테스트 `info_dict_title_pdfdocencoding_latin1_high_bytes_decoded` 추가. - info.rs: 모듈 doc 의 "Latin-1 superset" 부정확 표현 정정 — PDFDocEncoding 은 0x18–0x1F / 0x80–0x9F 영역에서 Latin-1 과 다름. - lib.rs: `saturating_sub(1)` 가 page=0 케이스를 silent 흡수하던 부분에 `debug_assert!` 추가. release 는 saturating fallback 유지 (panic 보다 garbled order 가 운영에 유리). - tests: UTF-16 surrogate pair 커버리지 갭 보완 — 🥙 (U+1F959) 가 포함된 타이틀로 `String::from_utf16_lossy` 의 페어-결합 경로 검증. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
79
Cargo.lock
generated
79
Cargo.lock
generated
@@ -24,15 +24,6 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "adobe-cmap-parser"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.12"
|
||||
@@ -2123,15 +2114,6 @@ version = "1.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"
|
||||
|
||||
[[package]]
|
||||
name = "euclid"
|
||||
version = "0.20.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "5.4.1"
|
||||
@@ -3633,14 +3615,9 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"lopdf 0.32.0",
|
||||
"pdf-extract",
|
||||
"serde",
|
||||
"lopdf",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing",
|
||||
]
|
||||
@@ -4582,24 +4559,6 @@ dependencies = [
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.14.0",
|
||||
"itoa",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom 7.1.3",
|
||||
"rangemap",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.5"
|
||||
@@ -5350,21 +5309,6 @@ dependencies = [
|
||||
"stfu8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdf-extract"
|
||||
version = "0.7.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
|
||||
dependencies = [
|
||||
"adobe-cmap-parser",
|
||||
"encoding_rs",
|
||||
"euclid",
|
||||
"lopdf 0.34.0",
|
||||
"postscript",
|
||||
"type1-encoding-parser",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.2"
|
||||
@@ -5468,12 +5412,6 @@ dependencies = [
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pom"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.13.1"
|
||||
@@ -5489,12 +5427,6 @@ dependencies = [
|
||||
"portable-atomic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postscript"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.5"
|
||||
@@ -7615,15 +7547,6 @@ dependencies = [
|
||||
"rand 0.9.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "type1-encoding-parser"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
|
||||
dependencies = [
|
||||
"pom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.20.0"
|
||||
|
||||
@@ -9,22 +9,18 @@ description = "Text PDF extractor (per-page text + page citation) for the keba
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
# Per-page text extraction. `lopdf::Document::extract_text(&[page])`
|
||||
# is the only stable per-page API across the pdf-extract / lopdf
|
||||
# pair (pdf-extract 0.7 still exposes only whole-document calls).
|
||||
# pdf-extract is intentionally NOT pulled in here — its ~150 transitive
|
||||
# crates (pom, postscript, type1-encoding-parser, …) buy us nothing
|
||||
# at v1 (we don't call its whole-doc API), and the future scanned-PDF
|
||||
# OCR fallback can re-add it when it actually needs it.
|
||||
lopdf = "0.32"
|
||||
# Whole-document sanity-check call; covers a few format errors that
|
||||
# lopdf swallows silently. Per-page text is sourced from lopdf only.
|
||||
pdf-extract = "0.7"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
||||
@@ -2,10 +2,15 @@
|
||||
//!
|
||||
//! PDFs may carry a `/Info` trailer dictionary with `Title`,
|
||||
//! `Producer`, `Creator`, etc. Strings are encoded as either
|
||||
//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
|
||||
//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
|
||||
//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
|
||||
//! not an error.
|
||||
//! UTF-16BE prefixed with the BOM `0xFE 0xFF` OR PDFDocEncoding
|
||||
//! (which agrees with Latin-1 over `0x20–0x7E` + `0xA1–0xFF` and
|
||||
//! diverges in the `0x18–0x1F` / `0x80–0xA0` ranges). We decode
|
||||
//! BOM'd strings as proper UTF-16BE; non-BOM strings are decoded
|
||||
//! as Latin-1 (byte → `char`), which is correct for the common
|
||||
//! ASCII case and a best-effort approximation for the divergent
|
||||
//! PDFDocEncoding ranges (full PDFDocEncoding tables aren't worth
|
||||
//! the maintenance for what is effectively legacy metadata). All
|
||||
//! fields are optional — a missing `/Info` dict is not an error.
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct InfoDict {
|
||||
@@ -61,10 +66,11 @@ fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
|
||||
}
|
||||
}
|
||||
|
||||
// PDFDocEncoding overlaps Latin-1 for the printable range we care
|
||||
// about, and Latin-1 is byte-identical to UTF-8 only for ASCII;
|
||||
// `from_utf8_lossy` is the conservative call here. ASCII-only
|
||||
// PDFs (the common case) round-trip cleanly.
|
||||
let s = String::from_utf8_lossy(bytes).into_owned();
|
||||
// PDFDocEncoding fallback (no BOM). Direct byte → char cast is
|
||||
// a Latin-1 decoder: ASCII (0x00–0x7F) round-trips, and
|
||||
// 0xA0–0xFF maps to the matching Unicode code point. `from_utf8_lossy`
|
||||
// would have replaced 0x80–0xFF with U+FFFD, mangling legacy
|
||||
// PDFDocEncoded titles like "Café".
|
||||
let s: String = bytes.iter().map(|&b| b as char).collect();
|
||||
if s.is_empty() { None } else { Some(s) }
|
||||
}
|
||||
|
||||
@@ -131,8 +131,13 @@ impl Extractor for PdfTextExtractor {
|
||||
char_start: Some(0),
|
||||
char_end: Some(char_count),
|
||||
};
|
||||
// ordinal = page - 1; saturating_sub guards the (shouldn't-happen)
|
||||
// case where lopdf hands back a 0-indexed page key.
|
||||
// lopdf's `get_pages()` is 1-based by contract. A 0-key would
|
||||
// collapse two pages onto the same ordinal (silently breaking
|
||||
// ordinal-based sorting downstream), so we assert the
|
||||
// invariant in dev builds. The release fallback still uses
|
||||
// saturating_sub so a future lopdf regression degrades to
|
||||
// garbled order rather than panic.
|
||||
debug_assert!(page_num >= 1, "lopdf get_pages() returned 0-based page key");
|
||||
let ordinal = page_num.saturating_sub(1);
|
||||
let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
|
||||
let common = CommonBlock {
|
||||
|
||||
@@ -177,6 +177,43 @@ fn info_dict_title_utf16be_bom_decoded() {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn info_dict_title_utf16be_surrogate_pair_decoded() {
|
||||
// 🥙 (U+1F959 STUFFED FLATBREAD) sits in the supplementary plane,
|
||||
// so encoding it as UTF-16BE produces a surrogate pair (D83E DD59).
|
||||
// BMP-only inputs would never exercise the pair-joining path of
|
||||
// `String::from_utf16_lossy` — this asserts that path round-trips.
|
||||
let info = InfoDict {
|
||||
title: Some(utf16be_bom("케밥 🥙 문서")),
|
||||
producer: None,
|
||||
creator: None,
|
||||
};
|
||||
let bytes = build_text_pdf_with_info(&[Some("body")], &info);
|
||||
let fx = fixture_for("docs/emoji-title.pdf", &bytes);
|
||||
let doc = PdfTextExtractor::new()
|
||||
.extract(&fx.ctx(), &bytes)
|
||||
.expect("PDF with surrogate-pair Title must extract");
|
||||
assert_eq!(doc.title, "케밥 🥙 문서");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn info_dict_title_pdfdocencoding_latin1_high_bytes_decoded() {
|
||||
// BOM-less PDFDocEncoded title with a high-byte char (0xE9 = 'é').
|
||||
// `from_utf8_lossy` would have replaced this with U+FFFD; the
|
||||
// byte-as-char path keeps it intact.
|
||||
let info = InfoDict {
|
||||
title: Some(b"Caf\xE9".to_vec()),
|
||||
producer: None,
|
||||
creator: None,
|
||||
};
|
||||
let bytes = build_text_pdf_with_info(&[Some("body")], &info);
|
||||
let fx = fixture_for("docs/cafe-title.pdf", &bytes);
|
||||
let doc = PdfTextExtractor::new()
|
||||
.extract(&fx.ctx(), &bytes)
|
||||
.expect("PDF with Latin-1 Title must extract");
|
||||
assert_eq!(doc.title, "Café");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn info_dict_title_falls_back_to_filename_when_missing() {
|
||||
let bytes = build_text_pdf(&[Some("body")]);
|
||||
|
||||
Reference in New Issue
Block a user