review(p7-1): 회차 1 지적 반영

- Cargo.toml: 사용하지 않는 deps 제거 (`kebab-config`, `thiserror`, `pdf-extract`, dev `tempfile` / `serde_json` / `serde`). 특히 `pdf-extract` 가 끌어오던 transitive ~150 crate (pom, postscript, type1-encoding-parser, adobe-cmap-parser, euclid, chrono, md5, linked-hash-map …) 가 모두 사라짐. lopdf 만 남음.
- info.rs: BOM 없는 PDFDocEncoded Title 디코드 버그 수정. `from_utf8_lossy` 는 0x80–0xFF 를 U+FFFD 로 치환해 "Café" 같은 레거시 타이틀을 망가뜨림. byte → `char` 직접 캐스팅 (Latin-1 디코더) 로 교체. 회귀 테스트 `info_dict_title_pdfdocencoding_latin1_high_bytes_decoded` 추가.
- info.rs: 모듈 doc 의 "Latin-1 superset" 부정확 표현 정정 — PDFDocEncoding 은 0x18–0x1F / 0x80–0x9F 영역에서 Latin-1 과 다름.
- lib.rs: `saturating_sub(1)` 가 page=0 케이스를 silent 흡수하던 부분에 `debug_assert!` 추가. release 는 saturating fallback 유지 (panic 보다 garbled order 가 운영에 유리).
- tests: UTF-16 surrogate pair 커버리지 갭 보완 — 🥙 (U+1F959) 가 포함된 타이틀로 `String::from_utf16_lossy` 의 페어-결합 경로 검증.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 08:40:40 +00:00
parent 5a158d7343
commit 8de08cf38c
5 changed files with 64 additions and 97 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -24,15 +24,6 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"

-[[package]]
-name = "adobe-cmap-parser"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
-dependencies = [
- "pom",
-]
-
 [[package]]
 name = "ahash"
 version = "0.8.12"
@@ -2123,15 +2114,6 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f"

-[[package]]
-name = "euclid"
-version = "0.20.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "event-listener"
 version = "5.4.1"
@@ -3633,14 +3615,9 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "blake3",
- "kebab-config",
 "kebab-core",
- "lopdf 0.32.0",
- "pdf-extract",
- "serde",
+ "lopdf",
 "serde_json",
- "tempfile",
- "thiserror 2.0.18",
 "time",
 "tracing",
 ]
@@ -4582,24 +4559,6 @@ dependencies = [
 "weezl",
 ]

-[[package]]
-name = "lopdf"
-version = "0.34.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
-dependencies = [
- "encoding_rs",
- "flate2",
- "indexmap 2.14.0",
- "itoa",
- "log",
- "md-5",
- "nom 7.1.3",
- "rangemap",
- "time",
- "weezl",
-]
-
 [[package]]
 name = "lru"
 version = "0.12.5"
@@ -5350,21 +5309,6 @@ dependencies = [
 "stfu8",
 ]

-[[package]]
-name = "pdf-extract"
-version = "0.7.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
-dependencies = [
- "adobe-cmap-parser",
- "encoding_rs",
- "euclid",
- "lopdf 0.34.0",
- "postscript",
- "type1-encoding-parser",
- "unicode-normalization",
-]
-
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -5468,12 +5412,6 @@ dependencies = [
 "miniz_oxide",
 ]

-[[package]]
-name = "pom"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
-
 [[package]]
 name = "portable-atomic"
 version = "1.13.1"
@@ -5489,12 +5427,6 @@ dependencies = [
 "portable-atomic",
 ]

-[[package]]
-name = "postscript"
-version = "0.14.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
-
 [[package]]
 name = "potential_utf"
 version = "0.1.5"
@@ -7615,15 +7547,6 @@ dependencies = [
 "rand 0.9.4",
 ]

-[[package]]
-name = "type1-encoding-parser"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
-dependencies = [
- "pom",
-]
-
 [[package]]
 name = "typenum"
 version = "1.20.0"
--- a/crates/kebab-parse-pdf/Cargo.toml
+++ b/crates/kebab-parse-pdf/Cargo.toml
@@ -9,22 +9,18 @@ description   = "Text PDF extractor (per-page text + page citation) for the keba

 [dependencies]
 kebab-core   = { path = "../kebab-core" }
-kebab-config = { path = "../kebab-config" }
 anyhow       = { workspace = true }
-serde        = { workspace = true }
 serde_json   = { workspace = true }
 time         = { workspace = true }
 tracing      = { workspace = true }
-thiserror    = { workspace = true }
 # Per-page text extraction. `lopdf::Document::extract_text(&[page])`
 # is the only stable per-page API across the pdf-extract / lopdf
 # pair (pdf-extract 0.7 still exposes only whole-document calls).
+# pdf-extract is intentionally NOT pulled in here — its ~150 transitive
+# crates (pom, postscript, type1-encoding-parser, …) buy us nothing
+# at v1 (we don't call its whole-doc API), and the future scanned-PDF
+# OCR fallback can re-add it when it actually needs it.
 lopdf        = "0.32"
-# Whole-document sanity-check call; covers a few format errors that
-# lopdf swallows silently. Per-page text is sourced from lopdf only.
-pdf-extract  = "0.7"

 [dev-dependencies]
-tempfile     = { workspace = true }
 blake3       = { workspace = true }
-serde_json   = { workspace = true }
--- a/crates/kebab-parse-pdf/src/info.rs
+++ b/crates/kebab-parse-pdf/src/info.rs
@@ -2,10 +2,15 @@
 //!
 //! PDFs may carry a `/Info` trailer dictionary with `Title`,
 //! `Producer`, `Creator`, etc. Strings are encoded as either
-//! PDFDocEncoding (Latin-1 superset) OR UTF-16BE prefixed with the
-//! BOM `0xFE 0xFF`. We handle both. Anything else falls back to
-//! UTF-8 lossy. All fields are optional — a missing `/Info` dict is
-//! not an error.
+//! UTF-16BE prefixed with the BOM `0xFE 0xFF` OR PDFDocEncoding
+//! (which agrees with Latin-1 over `0x20–0x7E` + `0xA1–0xFF` and
+//! diverges in `0x18–0x1F` / `0x80–0xA0`; `0xA0` is the euro sign,
+//! not NBSP). BOM'd strings are decoded as UTF-16BE; non-BOM
+//! strings are decoded as Latin-1 (byte → `char`), which is exact
+//! where the two encodings agree and a best-effort approximation
+//! in the divergent ranges (full PDFDocEncoding tables aren't worth
+//! the maintenance for what is effectively legacy metadata). All
+//! fields are optional — a missing `/Info` dict is not an error.

 #[derive(Default)]
 pub(crate) struct InfoDict {
@@ -61,10 +66,11 @@ fn pdf_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
        }
    }

-    // PDFDocEncoding overlaps Latin-1 for the printable range we care
-    // about, and Latin-1 is byte-identical to UTF-8 only for ASCII;
-    // `from_utf8_lossy` is the conservative call here. ASCII-only
-    // PDFs (the common case) round-trip cleanly.
-    let s = String::from_utf8_lossy(bytes).into_owned();
+    // PDFDocEncoding fallback (no BOM). Direct byte → char cast is
+    // a Latin-1 decoder: ASCII (0x00–0x7F) round-trips, and
+    // 0xA0–0xFF maps to the matching Unicode code point. `from_utf8_lossy`
+    // would have replaced 0x80–0xFF with U+FFFD, mangling legacy
+    // PDFDocEncoded titles like "Café".
+    let s: String = bytes.iter().map(|&b| b as char).collect();
    if s.is_empty() { None } else { Some(s) }
 }
--- a/crates/kebab-parse-pdf/src/lib.rs
+++ b/crates/kebab-parse-pdf/src/lib.rs
@@ -131,8 +131,13 @@ impl Extractor for PdfTextExtractor {
                char_start: Some(0),
                char_end: Some(char_count),
            };
-            // ordinal = page - 1; saturating_sub guards the (shouldn't-happen)
-            // case where lopdf hands back a 0-indexed page key.
+            // lopdf's `get_pages()` is 1-based by contract. A 0-key would
+            // collapse two pages onto the same ordinal (silently breaking
+            // ordinal-based sorting downstream), so we assert the
+            // invariant in dev builds. The release fallback still uses
+            // saturating_sub so a future lopdf regression degrades to
+            // garbled order rather than panic.
+            debug_assert!(page_num >= 1, "lopdf get_pages() returned 0-based page key");
            let ordinal = page_num.saturating_sub(1);
            let block_id = id_for_block(&doc_id, "paragraph", &[], ordinal, &span);
            let common = CommonBlock {
--- a/crates/kebab-parse-pdf/tests/extractor.rs
+++ b/crates/kebab-parse-pdf/tests/extractor.rs
@@ -177,6 +177,43 @@ fn info_dict_title_utf16be_bom_decoded() {
    );
 }

+#[test]
+fn info_dict_title_utf16be_surrogate_pair_decoded() {
+    // 🥙 (U+1F959 STUFFED FLATBREAD) lies outside the BMP, so its UTF-16BE
+    // form is the surrogate pair D83E DD59. The Hangul around it is BMP-only
+    // and would pass even if pairs were never joined, so the emoji is what
+    // pins that the BOM'd-title decoder reassembles the pair intact.
+    let info = InfoDict {
+        title: Some(utf16be_bom("케밥 🥙 문서")),
+        producer: None,
+        creator: None,
+    };
+    let bytes = build_text_pdf_with_info(&[Some("body")], &info);
+    let fx = fixture_for("docs/emoji-title.pdf", &bytes);
+    let doc = PdfTextExtractor::new()
+        .extract(&fx.ctx(), &bytes)
+        .expect("PDF with surrogate-pair Title must extract");
+    assert_eq!(doc.title, "케밥 🥙 문서");
+}
+
+#[test]
+fn info_dict_title_pdfdocencoding_latin1_high_bytes_decoded() {
+    // BOM-less title with one high byte: 0xE9 is 'é' in both PDFDocEncoding
+    // and Latin-1, so the expected string is exact. `from_utf8_lossy` would
+    // have turned that byte into U+FFFD; the byte → `char` path keeps it.
+    let info = InfoDict {
+        title: Some(b"Caf\xE9".to_vec()),
+        producer: None,
+        creator: None,
+    };
+    let bytes = build_text_pdf_with_info(&[Some("body")], &info);
+    let fx = fixture_for("docs/cafe-title.pdf", &bytes);
+    let doc = PdfTextExtractor::new()
+        .extract(&fx.ctx(), &bytes)
+        .expect("PDF with Latin-1 Title must extract");
+    assert_eq!(doc.title, "Café");
+}
+
 #[test]
 fn info_dict_title_falls_back_to_filename_when_missing() {
    let bytes = build_text_pdf(&[Some("body")]);