feat(app): capture image_width/height in PDF OCR raster decode (Enhancement 1)
Add an extract_image_dimensions(bytes) helper built on image::ImageReader and use it to fill the 2 PdfOcrProgress::Finished emit points in pdf_ocr_apply.rs where page_image_bytes is in scope (the OCR error path and the success path). The no-DCTDecode skip path still leaves None, because page_image_bytes is absent there. Result: LogEvent::Ocr carries non-null image_width/image_height on successful raster decode, enabling future size-conditioned timeout tuning. Closure r1 F3: the image dependency in kebab-app/Cargo.toml now lists the "jpeg" feature explicitly as a direct [dependencies] entry (not relying on feature unification via kebab-parse-image). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -36,6 +36,10 @@ kebab-parse-image = { path = "../kebab-parse-image" }
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
|
||||
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
|
||||
lopdf = { workspace = true }
|
||||
# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
|
||||
# jpeg feature added explicitly (F3 closure-r1) rather than relying on
|
||||
# feature unification via kebab-parse-image.
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# p10-1A-2: Rust AST extractor lives here. App threads it into the
|
||||
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
|
||||
@@ -72,7 +76,7 @@ tempfile = { workspace = true }
|
||||
# the kb-app code under test stays sync.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
|
||||
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
|
||||
# to the same major (0.32) so byte output is identical between the two
|
||||
|
||||
@@ -22,6 +22,18 @@ use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Extract width/height from a JPEG (or any image format) byte slice.
|
||||
/// Returns `None` on corrupt / unsupported data — callers fall back to
|
||||
/// `(None, None)` so OCR results remain valid (R-4 mitigation).
|
||||
fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
|
||||
use image::ImageReader;
|
||||
ImageReader::new(std::io::Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.ok()?
|
||||
.into_dimensions()
|
||||
.ok()
|
||||
}
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
@@ -178,14 +190,18 @@ where
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
let (image_width, image_height) =
|
||||
extract_image_dimensions(&page_image_bytes)
|
||||
.map(|(w, h)| (Some(w), Some(h)))
|
||||
.unwrap_or((None, None));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: start.elapsed().as_millis() as u64,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: Some("ocr_error".to_string()),
|
||||
});
|
||||
continue;
|
||||
@@ -256,14 +272,17 @@ where
|
||||
)),
|
||||
});
|
||||
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map(|(w, h)| (Some(w), Some(h)))
|
||||
.unwrap_or((None, None));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: elapsed_ms,
|
||||
chars: chars_ocr,
|
||||
skipped: false,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: None,
|
||||
});
|
||||
}
|
||||
@@ -321,3 +340,26 @@ pub enum PdfOcrProgress {
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly encoded JPEG must report the width and height it was
    /// created with.
    #[test]
    fn extract_image_dimensions_valid_jpeg() {
        let (width, height) = (16, 12);
        let pixels = image::DynamicImage::from(image::RgbImage::new(width, height));

        // Encode into an in-memory buffer; no file system involved.
        let mut encoded = Vec::new();
        let mut sink = std::io::Cursor::new(&mut encoded);
        pixels
            .write_to(&mut sink, image::ImageFormat::Jpeg)
            .expect("encode jpeg");

        assert_eq!(extract_image_dimensions(&encoded), Some((width, height)));
    }

    /// Bytes that are not an image at all must yield `None`.
    #[test]
    fn extract_image_dimensions_corrupt_returns_none() {
        let garbage: &[u8] = b"not a jpeg";
        assert_eq!(extract_image_dimensions(garbage), None);
    }
}
|
||||
|
||||
Reference in New Issue
Block a user