diff --git a/crates/kebab-parse-image/src/caption.rs b/crates/kebab-parse-image/src/caption.rs index 3e3597b..23b4a1b 100644 --- a/crates/kebab-parse-image/src/caption.rs +++ b/crates/kebab-parse-image/src/caption.rs @@ -25,50 +25,45 @@ //! without saving meaningful binary weight. See `tasks/HOTFIXES.md` //! (2026-05-02) for the deviation log. -use std::io::Cursor; - use anyhow::{Context, Result}; use base64::Engine as _; use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; -use image::{ImageFormat, ImageReader}; use kebab_core::{ FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption, ProvenanceEvent, ProvenanceKind, TokenChunk, }; use time::OffsetDateTime; +use crate::image_prep; + /// Long-edge clamp range for caption inputs. Smaller than OCR's /// `[256, 4096]` because vision LMs charge proportionally to input /// dimension — captions tolerate aggressive downscale better than /// OCR. -const MIN_CAPTION_LONG_EDGE: u32 = 128; -const MAX_CAPTION_LONG_EDGE: u32 = 1536; +pub const MIN_CAPTION_LONG_EDGE: u32 = 128; +pub const MAX_CAPTION_LONG_EDGE: u32 = 1536; /// Token budget for captions. Captions are one-sentence by spec — 96 /// tokens covers a 50-word English sentence or a 30-token Korean one /// with headroom for the LM's preamble before the stop sequence. const CAPTION_MAX_TOKENS: usize = 96; -/// Run a caption pass and return the resulting `ModelCaption`. Honours -/// `config.image.caption.enabled` — when disabled the function is a -/// no-op and returns an `Err` so the caller can route the asset -/// through `apply_caption` instead, which knows to short-circuit. +/// Run a caption pass and return the resulting `ModelCaption`. /// -/// Direct callers should prefer [`apply_caption`] for end-to-end -/// pipeline integration; this lower-level entry exists so tests can -/// pin the produced `ModelCaption` independent of block mutation. +/// Pure raw operation — does **not** consult `config.image.caption.enabled`. +/// The runtime feature gate lives in [`apply_caption`]; this entry +/// always invokes the LM. Tests pinning the produced `ModelCaption` +/// shape can call this directly without flipping the config flag. +/// +/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp +/// on `config.image.caption.max_pixels` so a hostile config cannot +/// blow up prompt cost. pub fn caption_image( llm: &dyn LanguageModel, image_bytes: &[u8], lang_hint: Option<&Lang>, cfg: &kebab_config::Config, ) -> Result { - if !cfg.image.caption.enabled { - anyhow::bail!( - "captioning is disabled (set image.caption.enabled = true in config to enable)" - ); - } - let max_pixels = cfg .image .caption @@ -85,7 +80,7 @@ pub fn caption_image( ); } - let prepared = downscale_to_png(image_bytes, max_pixels) + let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels) .context("preparing image for caption")?; let b64 = BASE64_STANDARD.encode(&prepared); @@ -156,13 +151,13 @@ pub fn caption_image( }) } -/// Mutate `block.caption` in place by running `caption_image` over -/// `image_bytes`. When `config.image.caption.enabled = false` the -/// function is a clean no-op (returns `Ok(())` without invoking the -/// LM and without writing a Provenance event). +/// Pipeline entry point — gate-checks `config.image.caption.enabled` +/// then mutates `block.caption` in place via [`caption_image`]. /// -/// On LM failure, `block.caption` stays `None` — partial state is -/// never written. The caller decides whether to skip the asset or +/// When `enabled = false` the function is a clean no-op (returns +/// `Ok(())` without invoking the LM and without writing a Provenance +/// event). On LM failure `block.caption` stays `None` — partial state +/// is never written. The caller decides whether to skip the asset or /// surface the error. pub fn apply_caption( llm: &dyn LanguageModel, @@ -180,16 +175,20 @@ pub fn apply_caption( return Ok(()); } let caption = caption_image(llm, image_bytes, lang_hint, cfg)?; - let model_label = caption.model.clone(); - let model_version_label = caption.model_version.clone(); + // Build the Provenance note BEFORE moving `caption` into + // `block.caption` so we sidestep the per-call `String::clone` of + // `caption.model` + `caption.model_version`. Tight ingest loops + // (thousands of images) save two allocations per asset. + let note = format!( + "model={} model_version={}", + caption.model, caption.model_version + ); block.caption = Some(caption); events.push(ProvenanceEvent { at: OffsetDateTime::now_utc(), agent: "kb-parse-image".to_string(), kind: ProvenanceKind::CaptionApplied, - note: Some(format!( - "model={model_label} model_version={model_version_label}" - )), + note: Some(note), }); Ok(()) } @@ -216,50 +215,6 @@ fn build_prompt(lang_hint: Option<&str>) -> (String, String) { } } -/// Decode `bytes`, downscale long-edge to `max_long_edge`, re-encode as -/// PNG. Mirrors the OCR pipeline's pattern but with the caption-side -/// long-edge bounds. PNG sources within the cap pass through without -/// re-encode. -fn downscale_to_png(bytes: &[u8], max_long_edge: u32) -> Result> { - let reader = ImageReader::new(Cursor::new(bytes)) - .with_guessed_format() - .context("reading image header for caption")?; - let format = reader.format(); - let (w, h) = reader - .into_dimensions() - .context("reading image dimensions for caption")?; - - let long = w.max(h); - if long <= max_long_edge && format == Some(ImageFormat::Png) { - return Ok(bytes.to_vec()); - } - - let img = ImageReader::new(Cursor::new(bytes)) - .with_guessed_format() - .context("re-reading image for caption decode")? - .decode() - .context("decoding image for caption")?; - let final_img = if long <= max_long_edge { - img - } else { - let scale = max_long_edge as f32 / long as f32; - let mut new_w = ((w as f32) * scale).round().max(1.0) as u32; - let mut new_h = ((h as f32) * scale).round().max(1.0) as u32; - if w >= h { - new_w = new_w.min(max_long_edge); - } else { - new_h = new_h.min(max_long_edge); - } - img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle) - }; - - let mut out = Cursor::new(Vec::new()); - final_img - .write_to(&mut out, ImageFormat::Png) - .context("encoding image as PNG for caption")?; - Ok(out.into_inner()) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/kebab-parse-image/src/image_prep.rs b/crates/kebab-parse-image/src/image_prep.rs new file mode 100644 index 0000000..208b378 --- /dev/null +++ b/crates/kebab-parse-image/src/image_prep.rs @@ -0,0 +1,83 @@ +//! Shared image preparation for OCR / caption / future vision pipelines. +//! +//! Both P6-2 OCR and P6-3 caption need the same pre-LM step: clamp the +//! long edge to a configured max, re-encode as PNG (Ollama's vision +//! channel format), pass through the source bytes when they already +//! satisfy both constraints. Centralising this here keeps the +//! 1px-rounding fix, the PNG passthrough hot path, and the error +//! messages in one place — future modules (PDF page thumbnails, +//! video keyframes, …) plug in without re-deriving the algorithm. + +use std::io::Cursor; + +use anyhow::{Context, Result}; +use image::{ImageFormat, ImageReader}; + +/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`, +/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)` so +/// callers that care about the final dimensions (e.g. OCR's +/// `SourceSpan::Region`) get them without re-decoding. +/// +/// PNG sources that already fit the cap pass through (zero decodes, +/// just a `Vec` clone). Every other path decodes the image exactly +/// once: a cheap header sniff peeks at the format / dimensions before +/// committing to a decode, so non-PNG passthrough and downscale share +/// the same `decode → optionally resize → re-encode` tail. +pub(crate) fn downscale_to_png( + bytes: &[u8], + max_long_edge: u32, +) -> Result<(Vec, u32, u32)> { + let reader = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("reading image header")?; + let format = reader.format(); + let (w, h) = reader + .into_dimensions() + .context("reading image dimensions")?; + + let long = w.max(h); + + // Hot path — PNG within budget already matches the wire format we + // send to vision models, so we ship the bytes verbatim without + // paying for a decode + re-encode round-trip. + if long <= max_long_edge && format == Some(ImageFormat::Png) { + return Ok((bytes.to_vec(), w, h)); + } + + // Every remaining branch needs the pixels — either to re-encode as + // PNG (non-PNG within budget) or to resize first (over budget). + // One decode covers both. + let img = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("re-reading image for decode")? + .decode() + .context("decoding image")?; + + let (final_w, final_h, final_img) = if long <= max_long_edge { + (w, h, img) + } else { + let scale = max_long_edge as f32 / long as f32; + let mut new_w = ((w as f32) * scale).round().max(1.0) as u32; + let mut new_h = ((h as f32) * scale).round().max(1.0) as u32; + // Independent rounding of the two axes can let `f32`'s + // round-to-nearest push the long axis one pixel past + // `max_long_edge` for irrational scales (e.g. `max=1601, + // long=4001`). Pin the long axis to exactly `max_long_edge` + // so the doc-comment's "long edge is at most max_long_edge" + // stays a strict bound. + if w >= h { + new_w = new_w.min(max_long_edge); + } else { + new_h = new_h.min(max_long_edge); + } + let resized = + img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); + (new_w, new_h, resized) + }; + + let mut out = Cursor::new(Vec::new()); + final_img + .write_to(&mut out, ImageFormat::Png) + .context("encoding image as PNG")?; + Ok((out.into_inner(), final_w, final_h)) +} diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index c5e1b87..7d23cc3 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -27,6 +27,7 @@ mod dims; mod exif_extract; +mod image_prep; pub mod caption; pub mod ocr; diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index 8d7f88a..77b6bef 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -25,17 +25,17 @@ //! field on [`OcrText`] makes the source explicit, so a caller can //! decide whether to trust based on which engine produced the text. -use std::io::Cursor; use std::time::Duration; use anyhow::{Context, Result}; use base64::Engine as _; use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; -use image::{ImageFormat, ImageReader}; use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; +use crate::image_prep; + /// Engine name written into `OcrText.engine` for the Ollama-vision adapter. pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision"; @@ -239,7 +239,7 @@ impl OcrEngine for OllamaVisionOcr { image_bytes: &[u8], lang_hint: Option<&Lang>, ) -> Result { - let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels) + let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels) .context("preparing image for OCR")?; let b64 = BASE64_STANDARD.encode(&prepared); @@ -311,71 +311,6 @@ impl OcrEngine for OllamaVisionOcr { } } -// ── Image preparation ───────────────────────────────────────────────────── - -/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`, -/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`. -/// -/// PNG sources that already fit the cap are passthrough (zero decodes, -/// just a `Vec` clone). Every other path decodes the image exactly -/// once: the cheap header sniff peeks at the format / dimensions before -/// committing to a decode, so non-PNG passthrough and downscale share -/// the same `decode → optionally resize → re-encode` tail. -fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, u32, u32)> { - let reader = ImageReader::new(Cursor::new(bytes)) - .with_guessed_format() - .context("reading image header for OCR")?; - let format = reader.format(); - let (w, h) = reader - .into_dimensions() - .context("reading image dimensions for OCR")?; - - let long = w.max(h); - - // Hot path — PNG within budget already matches the wire format we - // send Ollama, so we ship the bytes verbatim without paying for a - // decode + re-encode round-trip. - if long <= max_long_edge && format == Some(ImageFormat::Png) { - return Ok((bytes.to_vec(), w, h)); - } - - // Every remaining branch needs the pixels — either to re-encode as - // PNG (non-PNG within budget) or to resize first (over budget). - // One decode covers both. - let img = ImageReader::new(Cursor::new(bytes)) - .with_guessed_format() - .context("re-reading image for OCR decode")? - .decode() - .context("decoding image for OCR")?; - - let (final_w, final_h, final_img) = if long <= max_long_edge { - (w, h, img) - } else { - let scale = max_long_edge as f32 / long as f32; - let mut new_w = ((w as f32) * scale).round().max(1.0) as u32; - let mut new_h = ((h as f32) * scale).round().max(1.0) as u32; - // Independent rounding of the two axes can let `f32`'s nearest - // round push the long axis one pixel past `max_long_edge` for - // irrational scales (e.g. `max=1601, long=4001`). Pin the long - // axis to exactly `max_long_edge` so the doc-comment's - // "long edge is at most max_long_edge" stays a strict bound. - if w >= h { - new_w = new_w.min(max_long_edge); - } else { - new_h = new_h.min(max_long_edge); - } - let resized = - img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); - (new_w, new_h, resized) - }; - - let mut out = Cursor::new(Vec::new()); - final_img - .write_to(&mut out, ImageFormat::Png) - .context("encoding image as PNG for OCR")?; - Ok((out.into_inner(), final_w, final_h)) -} - fn truncate(s: &str, n: usize) -> String { if s.chars().count() <= n { return s.to_string(); diff --git a/crates/kebab-parse-image/tests/caption.rs b/crates/kebab-parse-image/tests/caption.rs index dc8c78e..583f95d 100644 --- a/crates/kebab-parse-image/tests/caption.rs +++ b/crates/kebab-parse-image/tests/caption.rs @@ -86,20 +86,17 @@ fn apply_caption_no_op_when_feature_disabled() { } #[test] -fn caption_image_errors_when_feature_disabled() { - let cfg = Config::defaults(); // enabled = false - let mock = mk_mock("ignored"); +fn caption_image_runs_regardless_of_enabled_flag() { + // Feature gate lives in `apply_caption`; `caption_image` is the + // raw operation. Calling it directly with enabled = false must + // still produce a `ModelCaption` so tests can pin the produced + // shape independent of pipeline gating. + let cfg = Config::defaults(); // enabled = false (default) + let mock = mk_mock("hi"); let bytes = red_100x50_png(); - let r = caption_image(&mock, &bytes, None, &cfg); - assert!( - r.is_err(), - "caption_image must Err when image.caption.enabled = false" - ); - let msg = format!("{:#}", r.unwrap_err()); - assert!( - msg.contains("disabled"), - "error must mention disabled state: {msg}" - ); + let cap = caption_image(&mock, &bytes, None, &cfg) + .expect("caption_image runs even when enabled = false"); + assert_eq!(cap.text, "hi"); } // ── Happy path ──────────────────────────────────────────────────────────── @@ -317,8 +314,9 @@ fn caption_image_clamps_oversized_max_pixels() { let (w, h) = reader.into_dimensions().unwrap(); let long = w.max(h); assert!( - long <= 1536, - "max_pixels must clamp to MAX_CAPTION_LONG_EDGE=1536, got {long}" + long <= kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE, + "max_pixels must clamp to MAX_CAPTION_LONG_EDGE={}, got {long}", + kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE ); } diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index d1863b0..a2355fa 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -33,6 +33,8 @@ git history. **Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"/"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire. +**`model_version` shape deviation**: spec literal says `model_version: llm.model_ref().provider` (provider as a coarse version proxy). We extend to `/` because prompt template churn is a real regression vector independent of the model — pinning both axes in one string lets `kebab-eval` (P5) detect either drift without a schema bump. Spec already left the door open ("if a vision model exposes a stable revision, prefer that"); the prompt template version is the closest stable revision we have today. Future PaddleOCR / Apple Vision adapters that expose a real model revision string can substitute it for `prompt_template_version` without breaking the wire shape. + **Amends**: - tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec`). - tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`).