review(p6-3): 회차 1 지적 반영
- 새 모듈 `crates/kebab-parse-image/src/image_prep.rs` — OCR + caption
+ 향후 PDF/video 가 공유할 단일 다운스케일 헬퍼 (`downscale_to_png`)
추출. 기존 ocr.rs / caption.rs 의 거의 동일 알고리즘 두 벌을 한
곳으로 통합. 1px 후행 클램프 / PNG passthrough hot path / 에러
메시지 패턴이 한 곳에서 관리됨.
- src/ocr.rs: `downscale_to_long_edge` 제거 → `image_prep::downscale_to_png`
호출. `image::ImageReader / ImageFormat / Cursor` import 도 정리.
- src/caption.rs:
• `caption_image` / `apply_caption` 의 disabled 처리 비대칭 해소.
`caption_image` 는 raw 연산 (gate 없음), `apply_caption` 만
`cfg.image.caption.enabled` 게이트 검사. 호출자가 같은 함수에서
같은 의미를 얻음.
• `apply_caption` 의 caption.model / model_version `String::clone`
2회 → 0회. caption move 전에 ProvenanceEvent.note 를 먼저 빌드.
• 다운스케일 로직 통째로 image_prep 위임.
• `MIN_CAPTION_LONG_EDGE` / `MAX_CAPTION_LONG_EDGE` 를 `pub const`
로 노출 (P6-2 의 `MAX_DECODE_DIM` 가시성 컨벤션과 일관).
- tests/caption.rs:
• `caption_image_errors_when_feature_disabled` 를
`caption_image_runs_regardless_of_enabled_flag` 로 교체 — 새
책임 분리 의미 검증.
• `caption_image_clamps_oversized_max_pixels` 가 literal 1536 대신
`kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE` 상수 참조.
- tasks/HOTFIXES.md: `model_version` 형태 deviation 한 단락 추가
(spec literal `provider` → `<provider>/<prompt_template_version>`
확장 + 사유).
cargo test -p kebab-parse-image — 42 pass + 2 ignored
(13 unit + 12 P6-1 + 8 P6-2 + 9 P6-3).
cargo clippy --workspace --all-targets -- -D warnings — pass.
This commit is contained in:
@@ -25,50 +25,45 @@
|
||||
//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
|
||||
//! (2026-05-02) for the deviation log.
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::Engine as _;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use image::{ImageFormat, ImageReader};
|
||||
use kebab_core::{
|
||||
FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
|
||||
ProvenanceEvent, ProvenanceKind, TokenChunk,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::image_prep;
|
||||
|
||||
/// Long-edge clamp range for caption inputs. Smaller than OCR's
|
||||
/// `[256, 4096]` because vision LMs charge proportionally to input
|
||||
/// dimension — captions tolerate aggressive downscale better than
|
||||
/// OCR.
|
||||
const MIN_CAPTION_LONG_EDGE: u32 = 128;
|
||||
const MAX_CAPTION_LONG_EDGE: u32 = 1536;
|
||||
pub const MIN_CAPTION_LONG_EDGE: u32 = 128;
|
||||
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;
|
||||
|
||||
/// Token budget for captions. Captions are one-sentence by spec — 96
|
||||
/// tokens covers a 50-word English sentence or a 30-token Korean one
|
||||
/// with headroom for the LM's preamble before the stop sequence.
|
||||
const CAPTION_MAX_TOKENS: usize = 96;
|
||||
|
||||
/// Run a caption pass and return the resulting `ModelCaption`. Honours
|
||||
/// `config.image.caption.enabled` — when disabled the function is a
|
||||
/// no-op and returns an `Err` so the caller can route the asset
|
||||
/// through `apply_caption` instead, which knows to short-circuit.
|
||||
/// Run a caption pass and return the resulting `ModelCaption`.
|
||||
///
|
||||
/// Direct callers should prefer [`apply_caption`] for end-to-end
|
||||
/// pipeline integration; this lower-level entry exists so tests can
|
||||
/// pin the produced `ModelCaption` independent of block mutation.
|
||||
/// Pure raw operation — does **not** consult `config.image.caption.enabled`.
|
||||
/// The runtime feature gate lives in [`apply_caption`]; this entry
|
||||
/// always invokes the LM. Tests pinning the produced `ModelCaption`
|
||||
/// shape can call this directly without flipping the config flag.
|
||||
///
|
||||
/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp
|
||||
/// on `config.image.caption.max_pixels` so a hostile config cannot
|
||||
/// blow up prompt cost.
|
||||
pub fn caption_image(
|
||||
llm: &dyn LanguageModel,
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
cfg: &kebab_config::Config,
|
||||
) -> Result<ModelCaption> {
|
||||
if !cfg.image.caption.enabled {
|
||||
anyhow::bail!(
|
||||
"captioning is disabled (set image.caption.enabled = true in config to enable)"
|
||||
);
|
||||
}
|
||||
|
||||
let max_pixels = cfg
|
||||
.image
|
||||
.caption
|
||||
@@ -85,7 +80,7 @@ pub fn caption_image(
|
||||
);
|
||||
}
|
||||
|
||||
let prepared = downscale_to_png(image_bytes, max_pixels)
|
||||
let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels)
|
||||
.context("preparing image for caption")?;
|
||||
let b64 = BASE64_STANDARD.encode(&prepared);
|
||||
|
||||
@@ -156,13 +151,13 @@ pub fn caption_image(
|
||||
})
|
||||
}
|
||||
|
||||
/// Mutate `block.caption` in place by running `caption_image` over
|
||||
/// `image_bytes`. When `config.image.caption.enabled = false` the
|
||||
/// function is a clean no-op (returns `Ok(())` without invoking the
|
||||
/// LM and without writing a Provenance event).
|
||||
/// Pipeline entry point — gate-checks `config.image.caption.enabled`
|
||||
/// then mutates `block.caption` in place via [`caption_image`].
|
||||
///
|
||||
/// On LM failure, `block.caption` stays `None` — partial state is
|
||||
/// never written. The caller decides whether to skip the asset or
|
||||
/// When `enabled = false` the function is a clean no-op (returns
|
||||
/// `Ok(())` without invoking the LM and without writing a Provenance
|
||||
/// event). On LM failure `block.caption` stays `None` — partial state
|
||||
/// is never written. The caller decides whether to skip the asset or
|
||||
/// surface the error.
|
||||
pub fn apply_caption(
|
||||
llm: &dyn LanguageModel,
|
||||
@@ -180,16 +175,20 @@ pub fn apply_caption(
|
||||
return Ok(());
|
||||
}
|
||||
let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
|
||||
let model_label = caption.model.clone();
|
||||
let model_version_label = caption.model_version.clone();
|
||||
// Build the Provenance note BEFORE moving `caption` into
|
||||
// `block.caption` so we sidestep the per-call `String::clone` of
|
||||
// `caption.model` + `caption.model_version`. Tight ingest loops
|
||||
// (thousands of images) save two allocations per asset.
|
||||
let note = format!(
|
||||
"model={} model_version={}",
|
||||
caption.model, caption.model_version
|
||||
);
|
||||
block.caption = Some(caption);
|
||||
events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-image".to_string(),
|
||||
kind: ProvenanceKind::CaptionApplied,
|
||||
note: Some(format!(
|
||||
"model={model_label} model_version={model_version_label}"
|
||||
)),
|
||||
note: Some(note),
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
@@ -216,50 +215,6 @@ fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode `bytes`, downscale long-edge to `max_long_edge`, re-encode as
|
||||
/// PNG. Mirrors the OCR pipeline's pattern but with the caption-side
|
||||
/// long-edge bounds. PNG sources within the cap pass through without
|
||||
/// re-encode.
|
||||
fn downscale_to_png(bytes: &[u8], max_long_edge: u32) -> Result<Vec<u8>> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header for caption")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions for caption")?;
|
||||
|
||||
let long = w.max(h);
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok(bytes.to_vec());
|
||||
}
|
||||
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for caption decode")?
|
||||
.decode()
|
||||
.context("decoding image for caption")?;
|
||||
let final_img = if long <= max_long_edge {
|
||||
img
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG for caption")?;
|
||||
Ok(out.into_inner())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
83
crates/kebab-parse-image/src/image_prep.rs
Normal file
83
crates/kebab-parse-image/src/image_prep.rs
Normal file
@@ -0,0 +1,83 @@
|
||||
//! Shared image preparation for OCR / caption / future vision pipelines.
|
||||
//!
|
||||
//! Both P6-2 OCR and P6-3 caption need the same pre-LM step: clamp the
|
||||
//! long edge to a configured max, re-encode as PNG (Ollama's vision
|
||||
//! channel format), pass through the source bytes when they already
|
||||
//! satisfy both constraints. Centralising this here keeps the
|
||||
//! 1px-rounding fix, the PNG passthrough hot path, and the error
|
||||
//! messages in one place — future modules (PDF page thumbnails,
|
||||
//! video keyframes, …) plug in without re-deriving the algorithm.
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use image::{ImageFormat, ImageReader};
|
||||
|
||||
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
|
||||
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)` so
|
||||
/// callers that care about the final dimensions (e.g. OCR's
|
||||
/// `SourceSpan::Region`) get them without re-decoding.
|
||||
///
|
||||
/// PNG sources that already fit the cap pass through (zero decodes,
|
||||
/// just a `Vec` clone). Every other path decodes the image exactly
|
||||
/// once: a cheap header sniff peeks at the format / dimensions before
|
||||
/// committing to a decode, so non-PNG passthrough and downscale share
|
||||
/// the same `decode → optionally resize → re-encode` tail.
|
||||
pub(crate) fn downscale_to_png(
|
||||
bytes: &[u8],
|
||||
max_long_edge: u32,
|
||||
) -> Result<(Vec<u8>, u32, u32)> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions")?;
|
||||
|
||||
let long = w.max(h);
|
||||
|
||||
// Hot path — PNG within budget already matches the wire format we
|
||||
// send to vision models, so we ship the bytes verbatim without
|
||||
// paying for a decode + re-encode round-trip.
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok((bytes.to_vec(), w, h));
|
||||
}
|
||||
|
||||
// Every remaining branch needs the pixels — either to re-encode as
|
||||
// PNG (non-PNG within budget) or to resize first (over budget).
|
||||
// One decode covers both.
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for decode")?
|
||||
.decode()
|
||||
.context("decoding image")?;
|
||||
|
||||
let (final_w, final_h, final_img) = if long <= max_long_edge {
|
||||
(w, h, img)
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
// Independent rounding of the two axes can let `f32`'s
|
||||
// round-to-nearest push the long axis one pixel past
|
||||
// `max_long_edge` for irrational scales (e.g. `max=1601,
|
||||
// long=4001`). Pin the long axis to exactly `max_long_edge`
|
||||
// so the doc-comment's "long edge is at most max_long_edge"
|
||||
// stays a strict bound.
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
let resized =
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
(new_w, new_h, resized)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG")?;
|
||||
Ok((out.into_inner(), final_w, final_h))
|
||||
}
|
||||
@@ -27,6 +27,7 @@
|
||||
|
||||
mod dims;
|
||||
mod exif_extract;
|
||||
mod image_prep;
|
||||
pub mod caption;
|
||||
pub mod ocr;
|
||||
|
||||
|
||||
@@ -25,17 +25,17 @@
|
||||
//! field on [`OcrText`] makes the source explicit, so a caller can
|
||||
//! decide whether to trust based on which engine produced the text.
|
||||
|
||||
use std::io::Cursor;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::Engine as _;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use image::{ImageFormat, ImageReader};
|
||||
use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::image_prep;
|
||||
|
||||
/// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
|
||||
pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";
|
||||
|
||||
@@ -239,7 +239,7 @@ impl OcrEngine for OllamaVisionOcr {
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
) -> Result<OcrText> {
|
||||
let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
|
||||
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
|
||||
.context("preparing image for OCR")?;
|
||||
let b64 = BASE64_STANDARD.encode(&prepared);
|
||||
|
||||
@@ -311,71 +311,6 @@ impl OcrEngine for OllamaVisionOcr {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Image preparation ─────────────────────────────────────────────────────
|
||||
|
||||
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
|
||||
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`.
|
||||
///
|
||||
/// PNG sources that already fit the cap are passthrough (zero decodes,
|
||||
/// just a `Vec` clone). Every other path decodes the image exactly
|
||||
/// once: the cheap header sniff peeks at the format / dimensions before
|
||||
/// committing to a decode, so non-PNG passthrough and downscale share
|
||||
/// the same `decode → optionally resize → re-encode` tail.
|
||||
fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header for OCR")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions for OCR")?;
|
||||
|
||||
let long = w.max(h);
|
||||
|
||||
// Hot path — PNG within budget already matches the wire format we
|
||||
// send Ollama, so we ship the bytes verbatim without paying for a
|
||||
// decode + re-encode round-trip.
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok((bytes.to_vec(), w, h));
|
||||
}
|
||||
|
||||
// Every remaining branch needs the pixels — either to re-encode as
|
||||
// PNG (non-PNG within budget) or to resize first (over budget).
|
||||
// One decode covers both.
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for OCR decode")?
|
||||
.decode()
|
||||
.context("decoding image for OCR")?;
|
||||
|
||||
let (final_w, final_h, final_img) = if long <= max_long_edge {
|
||||
(w, h, img)
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
// Independent rounding of the two axes can let `f32`'s nearest
|
||||
// round push the long axis one pixel past `max_long_edge` for
|
||||
// irrational scales (e.g. `max=1601, long=4001`). Pin the long
|
||||
// axis to exactly `max_long_edge` so the doc-comment's
|
||||
// "long edge is at most max_long_edge" stays a strict bound.
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
let resized =
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
(new_w, new_h, resized)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG for OCR")?;
|
||||
Ok((out.into_inner(), final_w, final_h))
|
||||
}
|
||||
|
||||
fn truncate(s: &str, n: usize) -> String {
|
||||
if s.chars().count() <= n {
|
||||
return s.to_string();
|
||||
|
||||
Reference in New Issue
Block a user