diff --git a/crates/kebab-parse-image/src/image_prep.rs b/crates/kebab-parse-image/src/image_prep.rs index 208b378..6fa7cfd 100644 --- a/crates/kebab-parse-image/src/image_prep.rs +++ b/crates/kebab-parse-image/src/image_prep.rs @@ -1,12 +1,15 @@ -//! Shared image preparation for OCR / caption / future vision pipelines. +//! Shared image preparation for any image-to-LM pipeline. //! -//! Both P6-2 OCR and P6-3 caption need the same pre-LM step: clamp the -//! long edge to a configured max, re-encode as PNG (Ollama's vision -//! channel format), pass through the source bytes when they already -//! satisfy both constraints. Centralising this here keeps the -//! 1px-rounding fix, the PNG passthrough hot path, and the error -//! messages in one place — future modules (PDF page thumbnails, -//! video keyframes, …) plug in without re-deriving the algorithm. +//! P6-2 OCR and P6-3 caption both need the same pre-LM step: clamp +//! the long edge to a configured max, re-encode as PNG (the wire +//! format vision channels expect — Ollama's `images: [base64, ...]` +//! takes PNG/JPEG, but PNG keeps the alpha + lossless invariant we +//! prefer for hand-drawn / screenshot inputs), pass through the +//! source bytes when they already satisfy both constraints. +//! Centralising this here keeps the 1px-rounding fix, the PNG +//! passthrough hot path, and the error messages in one place — +//! future image-to-LM channels (PDF page thumbnails, video +//! keyframes, …) plug in without re-deriving the algorithm. use std::io::Cursor; @@ -81,3 +84,106 @@ pub(crate) fn downscale_to_png( .context("encoding image as PNG")?; Ok((out.into_inner(), final_w, final_h)) } + +#[cfg(test)] +mod tests { + use super::*; + + use std::io::Cursor; + + use image::{ImageBuffer, Rgb}; + + /// Solid-colour PNG of the given dimensions. Solid colour + /// compresses aggressively so even 4001×3001 stays under a few + /// kilobytes. 
+    fn solid_png(w: u32, h: u32) -> Vec<u8> {
+        let img: ImageBuffer<Rgb<u8>, _> =
+            ImageBuffer::from_pixel(w, h, Rgb([0, 0, 255]));
+        let mut buf = Cursor::new(Vec::new());
+        img.write_to(&mut buf, ImageFormat::Png)
+            .expect("encoding solid PNG must not fail");
+        buf.into_inner()
+    }
+
+    /// Solid-white JPEG counterpart of `solid_png`: a non-PNG input fixture.
+    fn solid_jpeg(w: u32, h: u32) -> Vec<u8> {
+        let img: ImageBuffer<Rgb<u8>, _> =
+            ImageBuffer::from_pixel(w, h, Rgb([255, 255, 255]));
+        let mut buf = Cursor::new(Vec::new());
+        img.write_to(&mut buf, ImageFormat::Jpeg)
+            .expect("encoding solid JPEG must not fail");
+        buf.into_inner()
+    }
+
+    /// PNG within budget skips the decode + re-encode round-trip
+    /// entirely. Source bytes survive byte-for-byte.
+    #[test]
+    fn png_within_cap_passes_through_zero_decode() {
+        let bytes = solid_png(100, 50);
+        let (out, w, h) =
+            downscale_to_png(&bytes, 1024).expect("PNG passthrough must succeed");
+        assert_eq!((w, h), (100, 50));
+        assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim");
+    }
+
+    /// JPEG within budget gets re-encoded as PNG (the wire format)
+    /// while preserving dimensions.
+    #[test]
+    fn jpeg_within_cap_reencodes_as_png() {
+        let bytes = solid_jpeg(100, 50);
+        let (out, w, h) =
+            downscale_to_png(&bytes, 1024).expect("JPEG re-encode must succeed");
+        assert_eq!((w, h), (100, 50));
+        // PNG magic check; `starts_with` cannot panic on a short output.
+        assert!(
+            out.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]),
+            "output must be PNG-encoded after JPEG input"
+        );
+    }
+
+    /// Pathological irrational scale — `max=1601, long=4001` would let
+    /// independent f32 round-to-nearest push the long axis to 1602.
+    /// The post-resize clamp pins it back to `max_long_edge`.
+    #[test]
+    fn long_edge_clamped_strictly_to_max_for_irrational_scale() {
+        let source = solid_png(4001, 3001);
+        let (_, w, h) =
+            downscale_to_png(&source, 1601).expect("downscale must succeed");
+        let long = if w > h { w } else { h };
+        assert!(long <= 1601, "long edge must be ≤ max, got {long}");
+    }
+
+    /// A 4000×3000 source keeps its width/height ratio within 0.02 of 4:3
+    /// after being downscaled with a long-edge cap of 1024.
+    #[test]
+    fn aspect_ratio_preserved_within_rounding() {
+        let source = solid_png(4000, 3000);
+        let (_, w, h) =
+            downscale_to_png(&source, 1024).expect("downscale must succeed");
+        let ratio = w as f32 / h as f32;
+        // Absolute tolerance on the ratio itself, not a percentage.
+        let drift = (ratio - 4.0 / 3.0).abs();
+        assert!(drift < 0.02, "aspect drift: in=4/3 out={}/{}={ratio}", w, h);
+    }
+
+    /// Input is the bare 8-byte PNG signature with nothing after it, so it
+    /// looks like a PNG but carries no header chunk to read a size from.
+    /// The call must report `Err` (rather than some degenerate image) so
+    /// callers can skip the file with a warning.
+    #[test]
+    fn corrupt_bytes_return_err() {
+        let signature_only = b"\x89PNG\r\n\x1a\n";
+        assert!(
+            downscale_to_png(signature_only, 1024).is_err(),
+            "corrupt PNG must surface as Err"
+        );
+    }
+
+    /// Bytes that match no image signature at all (plain ASCII text) must
+    /// likewise be rejected with `Err`.
+    #[test]
+    fn unrecognised_bytes_return_err() {
+        let outcome = downscale_to_png(b"definitely not an image", 1024);
+        assert!(outcome.is_err(), "non-image bytes must surface as Err");
+    }
+}