feat(app): capture image_width/height in PDF OCR raster decode (Enhancement 1)

Add an extract_image_dimensions(bytes) helper built on image::ImageReader and use it to populate image_width/image_height at the two PdfOcrProgress::Finished emit points in pdf_ocr_apply.rs where page_image_bytes is in scope (the OCR error path and the success path). The no-DCTDecode skip path still reports None because page_image_bytes does not exist there. Result: LogEvent::Ocr carries non-null image_width/image_height whenever the page image's dimensions can be read, enabling future size-conditioned timeout tuning. Closure r1 F3: kebab-app/Cargo.toml now declares image as a direct [dependencies] entry with the "jpeg" feature enabled explicitly (not relying on feature unification via kebab-parse-image). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 05:54:55 +00:00
parent 89d334a92b
commit 5977c8cdf1
2 changed files with 51 additions and 5 deletions
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -36,6 +36,10 @@ kebab-parse-image = { path = "../kebab-parse-image" }
 # resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
 kebab-parse-pdf = { path = "../kebab-parse-pdf" }
 lopdf            = { workspace = true }
+# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
+# jpeg feature added explicitly (F3 closure-r1) rather than relying on
+# feature unification via kebab-parse-image.
+image            = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
 # p10-1A-2: Rust AST extractor lives here. App threads it into the
 # per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
 # resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
@@ -72,7 +76,7 @@ tempfile             = { workspace = true }
 # the kb-app code under test stays sync.
 wiremock             = { workspace = true }
 tokio                = { workspace = true, features = ["rt-multi-thread"] }
-image                = { version = "0.25", default-features = false, features = ["png"] }
+image                = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
 # P7-3 PDF integration tests build in-memory PDF fixtures via the same
 # lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
 # to the same major (0.32) so byte output is identical between the two
--- a/crates/kebab-app/src/pdf_ocr_apply.rs
+++ b/crates/kebab-app/src/pdf_ocr_apply.rs
@@ -22,6 +22,18 @@ use lopdf::Document as LopdfDocument;
 use time::OffsetDateTime;
 use tracing::warn;

+/// Read `(width, height)` from encoded image bytes; the format is guessed from content.
+/// Returns `None` when guessing or the dimension read fails (corrupt data, or a format
+/// `image` cannot handle); callers then emit `(None, None)` so OCR stays valid (R-4 mitigation).
+fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
+    use image::ImageReader;
+    ImageReader::new(std::io::Cursor::new(bytes))
+        .with_guessed_format() // pick the format from the bytes themselves; no file name here
+        .ok()? // an error while guessing becomes `None`
+        .into_dimensions() // `(width, height)` order, as pinned by the unit test
+        .ok() // any error reading the dimensions becomes `None`
+}
+
 /// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
 /// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
 /// (`kebab_app::ingest_one_pdf_asset`) fills these from
@@ -178,14 +190,18 @@ where
                    kind: ProvenanceKind::Warning,
                    note: Some(note),
                });
+                let (image_width, image_height) =
+                    extract_image_dimensions(&page_image_bytes)
+                        .map(|(w, h)| (Some(w), Some(h)))
+                        .unwrap_or((None, None));
                emit_progress(PdfOcrProgress::Finished {
                    page: page_num,
                    ms: start.elapsed().as_millis() as u64,
                    chars: 0,
                    skipped: true,
                    image_byte_size: Some(page_image_bytes.len() as u64),
-                    image_width: None,
-                    image_height: None,
+                    image_width,
+                    image_height,
                    failure_reason: Some("ocr_error".to_string()),
                });
                continue;
@@ -256,14 +272,17 @@ where
            )),
        });

+        let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
+            .map(|(w, h)| (Some(w), Some(h)))
+            .unwrap_or((None, None));
        emit_progress(PdfOcrProgress::Finished {
            page: page_num,
            ms: elapsed_ms,
            chars: chars_ocr,
            skipped: false,
            image_byte_size: Some(page_image_bytes.len() as u64),
-            image_width: None,
-            image_height: None,
+            image_width,
+            image_height,
            failure_reason: None,
        });
    }
@@ -321,3 +340,26 @@ pub enum PdfOcrProgress {
        failure_reason: Option<String>,
    },
 }
+
+#[cfg(test)]
+mod tests { // unit tests for `extract_image_dimensions`
+    use super::*;
+
+    #[test]
+    fn extract_image_dimensions_valid_jpeg() {
+        let img = image::RgbImage::new(16, 12); // width != height, so a swapped pair would fail
+        let mut bytes = Vec::new();
+        image::DynamicImage::from(img)
+            .write_to( // encode into the in-memory `bytes` buffer
+                &mut std::io::Cursor::new(&mut bytes),
+                image::ImageFormat::Jpeg,
+            )
+            .expect("encode jpeg");
+        assert_eq!(extract_image_dimensions(&bytes), Some((16, 12)));
+    }
+
+    #[test]
+    fn extract_image_dimensions_corrupt_returns_none() {
+        assert_eq!(extract_image_dimensions(b"not a jpeg"), None); // plain text, not image data
+    }
+}