feat(app): capture image_width/height in PDF OCR raster decode (Enhancement 1)

Add extract_image_dimensions(bytes) helper using image::ImageReader
and fill the 2 PdfOcrProgress::Finished emit points in pdf_ocr_apply.rs
where page_image_bytes is in scope (OCR error path + success path).
The no-DCTDecode skip path keeps image_width/image_height as None because page_image_bytes is not available there.
Result: LogEvent::Ocr carries non-null image_width/image_height on
successful raster decode, enabling future size-conditioned timeout tuning.

Closure r1 F3: kebab-app/Cargo.toml now declares `image` with the "jpeg"
feature as a direct [dependencies] entry (instead of relying on feature
unification via kebab-parse-image).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 05:54:55 +00:00
parent 89d334a92b
commit 5977c8cdf1
2 changed files with 51 additions and 5 deletions

View File

@@ -36,6 +36,10 @@ kebab-parse-image = { path = "../kebab-parse-image" }
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
lopdf = { workspace = true }
# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
# jpeg feature added explicitly (F3 closure-r1) rather than relying on
# feature unification via kebab-parse-image.
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
# p10-1A-2: Rust AST extractor lives here. App threads it into the
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
@@ -72,7 +76,7 @@ tempfile = { workspace = true }
# the kb-app code under test stays sync.
wiremock = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread"] }
image = { version = "0.25", default-features = false, features = ["png"] }
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
# to the same major (0.32) so byte output is identical between the two

View File

@@ -22,6 +22,18 @@ use lopdf::Document as LopdfDocument;
use time::OffsetDateTime;
use tracing::warn;
/// Extract width/height from a JPEG (or any image format) byte slice.
/// Returns `None` on corrupt / unsupported data — callers fall back to
/// `(None, None)` so OCR results remain valid (R-4 mitigation).
fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
use image::ImageReader;
ImageReader::new(std::io::Cursor::new(bytes))
.with_guessed_format()
.ok()?
.into_dimensions()
.ok()
}
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
@@ -178,14 +190,18 @@ where
kind: ProvenanceKind::Warning,
note: Some(note),
});
let (image_width, image_height) =
extract_image_dimensions(&page_image_bytes)
.map(|(w, h)| (Some(w), Some(h)))
.unwrap_or((None, None));
emit_progress(PdfOcrProgress::Finished {
page: page_num,
ms: start.elapsed().as_millis() as u64,
chars: 0,
skipped: true,
image_byte_size: Some(page_image_bytes.len() as u64),
image_width: None,
image_height: None,
image_width,
image_height,
failure_reason: Some("ocr_error".to_string()),
});
continue;
@@ -256,14 +272,17 @@ where
)),
});
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
.map(|(w, h)| (Some(w), Some(h)))
.unwrap_or((None, None));
emit_progress(PdfOcrProgress::Finished {
page: page_num,
ms: elapsed_ms,
chars: chars_ocr,
skipped: false,
image_byte_size: Some(page_image_bytes.len() as u64),
image_width: None,
image_height: None,
image_width,
image_height,
failure_reason: None,
});
}
@@ -321,3 +340,26 @@ pub enum PdfOcrProgress {
failure_reason: Option<String>,
},
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly encoded 16x12 JPEG must report its width and height, in
    /// that order.
    #[test]
    fn extract_image_dimensions_valid_jpeg() {
        let (width, height) = (16, 12);
        let blank = image::DynamicImage::ImageRgb8(image::RgbImage::new(width, height));

        // Encode into an owned in-memory sink, then take the buffer back out.
        let mut sink = std::io::Cursor::new(Vec::new());
        blank
            .write_to(&mut sink, image::ImageFormat::Jpeg)
            .expect("encode jpeg");
        let encoded = sink.into_inner();

        assert_eq!(extract_image_dimensions(&encoded), Some((width, height)));
    }

    /// Non-image bytes must produce `None` rather than an error or a panic.
    #[test]
    fn extract_image_dimensions_corrupt_returns_none() {
        let garbage: &[u8] = b"not a jpeg";
        assert_eq!(extract_image_dimensions(garbage), None);
    }
}