feat(kebab-parse-image): P6-3 caption adapter — vision LM via trait #34
2
Cargo.lock
generated
@@ -3576,6 +3576,8 @@ dependencies = [
|
||||
"kamadak-exif",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"kebab-llm",
|
||||
"kebab-llm-local",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
||||
@@ -105,18 +105,20 @@ pub struct RagCfg {
|
||||
}
|
||||
|
||||
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
|
||||
/// behaviour; future fields (e.g. `caption`) will join here as P6-3
|
||||
/// lands.
|
||||
/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ImageCfg {
|
||||
#[serde(default = "OcrCfg::defaults")]
|
||||
pub ocr: OcrCfg,
|
||||
#[serde(default = "CaptionCfg::defaults")]
|
||||
pub caption: CaptionCfg,
|
||||
}
|
||||
|
||||
impl ImageCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
ocr: OcrCfg::defaults(),
|
||||
caption: CaptionCfg::defaults(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -162,6 +164,36 @@ impl OcrCfg {
|
||||
}
|
||||
}
|
||||
|
||||
/// Caption settings (P6-3). Caption uses the same Ollama-vision /
|
||||
/// `LanguageModel` pipeline as the rest of the workspace; the trait
|
||||
/// abstraction is the part the spec demands. `enabled` defaults to
|
||||
/// `false` because captioning costs one model call per asset and the
|
||||
/// output is model-generated (low trust).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CaptionCfg {
|
||||
/// Run captioning on every image during ingest. Default `false`.
|
||||
pub enabled: bool,
|
||||
/// Cap the long edge of the image (in pixels) before sending. The
|
||||
/// spec recommends an aggressive 768×768 cap because larger
|
||||
/// vision-LM inputs translate directly into prompt cost. Default
|
||||
/// `768`.
|
||||
pub max_pixels: u32,
|
||||
/// Caption prompt template version pinned into wire output via
|
||||
/// `ModelCaption.model_version`. Bump when the prompt changes so
|
||||
/// downstream eval can detect regressions.
|
||||
pub prompt_template_version: String,
|
||||
}
|
||||
|
||||
impl CaptionCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
enabled: false,
|
||||
max_pixels: 768,
|
||||
prompt_template_version: "caption-v1".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Defaults per design §6.4.
|
||||
pub fn defaults() -> Self {
|
||||
@@ -417,6 +449,19 @@ impl Config {
|
||||
}
|
||||
}
|
||||
|
||||
// image.caption (P6-3)
|
||||
"KEBAB_IMAGE_CAPTION_ENABLED" => {
|
||||
self.image.caption.enabled = parse_bool(v);
|
||||
}
|
||||
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.image.caption.max_pixels = n;
|
||||
}
|
||||
}
|
||||
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
|
||||
self.image.caption.prompt_template_version = v.clone();
|
||||
}
|
||||
|
||||
// Unknown KEBAB_* keys are silently ignored — see
|
||||
// `env_unknown_key_is_ignored` test.
|
||||
_ => {}
|
||||
@@ -608,6 +653,35 @@ mod tests {
|
||||
/// Pre-P6 config files don't have an `[image]` section. The
|
||||
/// `#[serde(default)]` attribute on `Config::image` must let those
|
||||
/// files load with `ImageCfg::defaults()` instead of erroring.
|
||||
#[test]
fn image_caption_defaults_disabled() {
    // Pins the out-of-the-box caption settings: captioning off, 768px
    // cap, and the v1 prompt template.
    let cfg = Config::defaults();
    let caption = &cfg.image.caption;
    assert!(!caption.enabled);
    assert_eq!(caption.max_pixels, 768);
    assert_eq!(caption.prompt_template_version, "caption-v1");
}
|
||||
|
||||
#[test]
fn image_caption_env_overrides() {
    // One entry per caption env key; each must land on its matching
    // `image.caption` field.
    let overrides = [
        ("KEBAB_IMAGE_CAPTION_ENABLED", "true"),
        ("KEBAB_IMAGE_CAPTION_MAX_PIXELS", "1024"),
        ("KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION", "caption-v2"),
    ];
    let env: HashMap<String, String> = overrides
        .iter()
        .map(|&(key, value)| (key.to_owned(), value.to_owned()))
        .collect();

    let c = Config::defaults().apply_env(&env);
    let caption = &c.image.caption;
    assert!(caption.enabled);
    assert_eq!(caption.max_pixels, 1024);
    assert_eq!(caption.prompt_template_version, "caption-v2");
}
|
||||
|
||||
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
|
||||
/// rather than to `Some("")` so the fallback to `models.llm.endpoint`
|
||||
/// kicks in. Covers the env-equivalent of a missing TOML key.
|
||||
|
||||
@@ -69,6 +69,17 @@ pub struct GenerateRequest {
|
||||
pub max_tokens: usize,
|
||||
pub temperature: f32,
|
||||
pub seed: Option<u64>,
|
||||
/// Vision inputs (base64-encoded, one per image). Empty for the
|
||||
/// text-only path that P4-2 / P4-3 / RAG uses; non-empty when a
|
||||
/// vision-capable adapter (P6-3 caption, future multimodal RAG)
|
||||
/// drives the call. The LM adapter is responsible for routing
|
||||
/// these onto the wire — Ollama uses `images: [base64, ...]`,
|
||||
/// other backends may differ.
|
||||
///
|
||||
/// Defaulted on deserialization so older `*.json` payloads /
|
||||
/// snapshots that predate the field still parse.
|
||||
#[serde(default)]
|
||||
pub images: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
|
||||
@@ -140,9 +140,15 @@ impl LanguageModel for OllamaLanguageModel {
|
||||
format!("{}\n\n{}", req.system, req.user)
|
||||
};
|
||||
|
||||
// Vision inputs (P6-3) flow through the request via Ollama's
|
||||
// `images: [base64, ...]` field. Empty for the text-only RAG
|
||||
// path so older snapshots and JSON dumps stay byte-identical
|
||||
// (`OllamaRequest::images` is `skip_serializing_if`-empty, so it's omitted from
|
||||
// the wire when empty).
|
||||
let body = OllamaRequest {
|
||||
model: &self.model_id,
|
||||
prompt,
|
||||
images: &req.images,
|
||||
stream: true,
|
||||
options: OllamaOptions {
|
||||
temperature: effective_temperature,
|
||||
@@ -188,6 +194,13 @@ impl LanguageModel for OllamaLanguageModel {
|
||||
struct OllamaRequest<'a> {
|
||||
model: &'a str,
|
||||
prompt: String,
|
||||
/// Skipped from the JSON when empty so the text-only path keeps
|
||||
/// the same on-the-wire shape it had pre-P6-3 (`{"model": ...,
|
||||
/// "prompt": ..., "stream": ..., "options": ...}` — no `images`
|
||||
/// key). Vision-capable callers populate this with one or more
|
||||
/// base64-encoded images.
|
||||
#[serde(skip_serializing_if = "<[String]>::is_empty")]
|
||||
images: &'a [String],
|
||||
stream: bool,
|
||||
options: OllamaOptions<'a>,
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ fn real_ollama_streams_non_empty_response() {
|
||||
max_tokens: 8,
|
||||
temperature: 0.0,
|
||||
seed: Some(0),
|
||||
images: Vec::new(),
|
||||
};
|
||||
|
||||
let stream = llm.generate_stream(req).expect("stream should start");
|
||||
|
||||
@@ -35,6 +35,7 @@ fn sample_request() -> GenerateRequest {
|
||||
max_tokens: 64,
|
||||
temperature: 0.0,
|
||||
seed: Some(0),
|
||||
images: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ fn req_with_stop(stop: Vec<&str>) -> GenerateRequest {
|
||||
max_tokens: 64,
|
||||
temperature: 0.0,
|
||||
seed: None,
|
||||
images: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ fn dyn_dispatch_via_box_works() {
|
||||
max_tokens: 16,
|
||||
temperature: 0.0,
|
||||
seed: None,
|
||||
images: Vec::new(),
|
||||
};
|
||||
let stream = m.generate_stream(req).expect("stream");
|
||||
let chunks: Vec<TokenChunk> = stream.map(|r| r.expect("ok chunk")).collect();
|
||||
|
||||
@@ -10,6 +10,12 @@ description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipe
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`)
|
||||
# under a stable surface; the caption adapter consumes any
|
||||
# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` (forbidden
|
||||
# by p6-3 design §8) — the trait abstraction is exactly what spec
|
||||
# requires.
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
@@ -42,3 +48,10 @@ tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
# font rendering.
|
||||
ab_glyph = "0.2"
|
||||
base64 = "0.22"
|
||||
# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption
|
||||
# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is
|
||||
# only allowed at the dev-dep level here — the runtime crate stays
|
||||
# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local`
|
||||
# at runtime) is preserved.
|
||||
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
|
||||
236
crates/kebab-parse-image/src/caption.rs
Normal file
@@ -0,0 +1,236 @@
|
||||
//! Caption adapter (P6-3).
|
||||
//!
|
||||
//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
|
||||
//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
|
||||
//! helper that mutates an [`ImageRefBlock`] in place and emits a
|
||||
//! [`ProvenanceKind::CaptionApplied`] event.
|
||||
//!
|
||||
//! ## Trust note
|
||||
//!
|
||||
//! Captions are **model-generated** (`TrustLevel::Generated`), not
|
||||
//! observed text. Vision LMs hallucinate; the system prompt explicitly
|
||||
//! forbids guessing but expect false captions. Downstream UI / RAG
|
||||
//! must label captions as model-generated and surface the model id +
|
||||
//! prompt template version (carried in `ModelCaption.model_version`)
|
||||
//! so a regression in either is auditable.
|
||||
//!
|
||||
//! ## Spec deviation (cargo `caption` feature dropped)
|
||||
//!
|
||||
//! The original P6-3 spec asked for a cargo feature `caption` (default
|
||||
//! OFF at compile time). We collapse this into a single runtime gate
|
||||
//! (`config.image.caption.enabled = false`, default OFF). Reasoning:
|
||||
//! the captioning module's only extra deps are `base64` + `image` +
|
||||
//! `kebab-llm` trait — all already pulled in by the rest of the
|
||||
//! crate. A cargo feature would only complicate the build matrix
|
||||
//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
|
||||
//! (2026-05-02) for the deviation log.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::Engine as _;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use kebab_core::{
|
||||
FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
|
||||
ProvenanceEvent, ProvenanceKind, TokenChunk,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::image_prep;
|
||||
|
||||
/// Lower bound of the long-edge clamp applied to caption inputs.
///
/// The caption range `[128, 1536]` is smaller than OCR's
/// `[256, 4096]` because vision LMs charge proportionally to input
/// dimension — captions tolerate aggressive downscale better than
/// OCR.
pub const MIN_CAPTION_LONG_EDGE: u32 = 128;

/// Upper bound of the long-edge clamp applied to caption inputs. See
/// [`MIN_CAPTION_LONG_EDGE`] for why the range is tighter than OCR's.
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;

// `u32::clamp` panics when `min > max`; reject a bad edit to the two
// bounds at compile time instead of on the first captioned image.
const _: () = assert!(MIN_CAPTION_LONG_EDGE <= MAX_CAPTION_LONG_EDGE);

/// Token budget for captions. Captions are one-sentence by spec — 96
/// tokens covers a 50-word English sentence or a 30-token Korean one
/// with headroom for the LM's preamble before the stop sequence.
const CAPTION_MAX_TOKENS: usize = 96;
|
||||
|
||||
/// Run a caption pass and return the resulting `ModelCaption`.
|
||||
///
|
||||
/// Pure raw operation — does **not** consult `config.image.caption.enabled`.
|
||||
/// The runtime feature gate lives in [`apply_caption`]; this entry
|
||||
/// always invokes the LM. Tests pinning the produced `ModelCaption`
|
||||
/// shape can call this directly without flipping the config flag.
|
||||
///
|
||||
/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp
|
||||
/// on `config.image.caption.max_pixels` so a hostile config cannot
|
||||
/// blow up prompt cost.
|
||||
|
claude-reviewer-01
commented
`caption_image` 와 `apply_caption` 의 "disabled 처리" 가 비대칭입니다:
- `caption_image(...)` → `enabled = false` 면 **Err** ("disabled" 메시지).
- `apply_caption(...)` → `enabled = false` 면 **Ok(())** (no-op).
같은 config 상태에서 두 entry 가 다른 결과를 내는 건 호출자 입장에서 헷갈립니다. 함수의 책임 분리 관점에서 더 깔끔한 형태:
- `caption_image` = "이미지를 캡션해 줘" 의 raw 연산. **항상 LM 호출**, gate 검사 없음.
- `apply_caption` = pipeline 진입점. config 게이트 + 결과 반영 + provenance.
이렇게 하면 disabled 인데 caption_image 를 직접 부르는 caller (현재로선 테스트뿐) 가 "왜 Err 반환?" 에 헷갈릴 일이 없고, doc-comment 의 "this lower-level entry exists so tests can pin the produced ModelCaption" 도 더 정직해집니다 — 테스트는 enabled flag 없이 caption 동작 자체를 검증할 수 있어야 자연스러움.
claude-reviewer-01
commented
(칭찬) `caption_image` (raw 연산) / `apply_caption` (gate + provenance) 의 책임 분리가 코드 양 자체를 줄였고, 더 중요하게는 "같은 config 에서 두 함수가 다른 결과" 라는 인지 부담을 제거했습니다. P+ 에서 caption 외 다른 vision adapter (예: alt-text generator) 가 비슷한 책임 분리를 따라가는 패턴이 됩니다.
|
||||
pub fn caption_image(
|
||||
llm: &dyn LanguageModel,
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
cfg: &kebab_config::Config,
|
||||
) -> Result<ModelCaption> {
|
||||
let max_pixels = cfg
|
||||
.image
|
||||
.caption
|
||||
.max_pixels
|
||||
.clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
|
||||
if max_pixels != cfg.image.caption.max_pixels {
|
||||
tracing::warn!(
|
||||
target: "kebab-parse-image",
|
||||
"image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
|
||||
cfg.image.caption.max_pixels,
|
||||
max_pixels,
|
||||
MIN_CAPTION_LONG_EDGE,
|
||||
MAX_CAPTION_LONG_EDGE
|
||||
);
|
||||
}
|
||||
|
||||
let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels)
|
||||
.context("preparing image for caption")?;
|
||||
let b64 = BASE64_STANDARD.encode(&prepared);
|
||||
|
||||
let lang = lang_hint
|
||||
.map(|l| l.0.as_str())
|
||||
.filter(|s| !s.is_empty() && *s != "und");
|
||||
let (system, user) = build_prompt(lang);
|
||||
|
||||
// Determinism — temperature 0.0 + seed 0, same convention as RAG
|
||||
// and OCR. The LM adapter routes the base64 image via its
|
||||
// provider-specific channel (Ollama: `images: [base64]`).
|
||||
let req = GenerateRequest {
|
||||
system,
|
||||
user,
|
||||
stop: vec!["\n\n".to_string()],
|
||||
max_tokens: CAPTION_MAX_TOKENS,
|
||||
temperature: 0.0,
|
||||
seed: Some(0),
|
||||
images: vec![b64],
|
||||
};
|
||||
|
||||
let stream = llm
|
||||
.generate_stream(req)
|
||||
.context("captioning LM call failed")?;
|
||||
|
||||
let mut text = String::new();
|
||||
let mut saw_done = false;
|
||||
for chunk in stream {
|
||||
match chunk? {
|
||||
TokenChunk::Token(t) => {
|
||||
text.push_str(&t);
|
||||
}
|
||||
TokenChunk::Done { finish_reason, .. } => {
|
||||
saw_done = true;
|
||||
if let FinishReason::Error(e) = finish_reason {
|
||||
anyhow::bail!("captioning LM ended with error: {e}");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !saw_done {
|
||||
anyhow::bail!("captioning LM stream ended without a Done frame");
|
||||
}
|
||||
|
||||
let caption_text = text.trim().to_string();
|
||||
|
||||
let model_ref = llm.model_ref();
|
||||
let prompt_v = &cfg.image.caption.prompt_template_version;
|
||||
let model_version = format!(
|
||||
"{provider}/{prompt}",
|
||||
provider = model_ref.provider,
|
||||
prompt = prompt_v
|
||||
);
|
||||
|
||||
|
claude-reviewer-01
commented
`model_version = "<provider>/<prompt_template_version>"` (예: `"ollama/caption-v1"`) 는 spec 의 literal `model_version: llm.model_ref().provider` 와 다릅니다. spec 자체가 "if a vision model exposes a stable revision, prefer that" 로 유연하게 열어 둔 부분이라 합리적 deviation 이지만, HOTFIXES.md 의 P6-3 항목에서 이 쪽 결정은 명시되지 않았습니다.
둘 중 하나로 정리:
1. (선호) HOTFIXES.md 의 P6-3 항목에 한 줄 추가 — "`model_version` 을 `provider` 단독에서 `<provider>/<prompt_template_version>` 으로 확장. prompt 회귀와 모델 회귀를 별도 축으로 추적 가능."
2. spec literal 로 후퇴 (`provider` 만), prompt_template_version 은 별도 필드 (예: provenance note) 에 박기.
현재 구현이 분명히 더 유용해서 (1번) 으로 가는 게 맞아 보이지만, 결정을 audit log 에 박아 두면 P+ 에서 다른 어댑터 (PaddleOCR / Apple Vision 등) 가 model_version 을 어떻게 채울지 컨벤션이 한 줄로 잡힙니다.
|
||||
tracing::debug!(
|
||||
target: "kebab-parse-image",
|
||||
"caption ok (model={}, prompt={}, chars={})",
|
||||
model_ref.id,
|
||||
prompt_v,
|
||||
caption_text.chars().count()
|
||||
);
|
||||
|
||||
Ok(ModelCaption {
|
||||
text: caption_text,
|
||||
model: model_ref.id,
|
||||
model_version,
|
||||
})
|
||||
}
|
||||
|
||||
/// Pipeline entry point — gate-checks `config.image.caption.enabled`
|
||||
/// then mutates `block.caption` in place via [`caption_image`].
|
||||
///
|
||||
/// When `enabled = false` the function is a clean no-op (returns
|
||||
/// `Ok(())` without invoking the LM and without writing a Provenance
|
||||
/// event). On LM failure `block.caption` stays `None` — partial state
|
||||
/// is never written. The caller decides whether to skip the asset or
|
||||
/// surface the error.
|
||||
pub fn apply_caption(
|
||||
llm: &dyn LanguageModel,
|
||||
image_bytes: &[u8],
|
||||
block: &mut ImageRefBlock,
|
||||
lang_hint: Option<&Lang>,
|
||||
cfg: &kebab_config::Config,
|
||||
events: &mut Vec<ProvenanceEvent>,
|
||||
) -> Result<()> {
|
||||
if !cfg.image.caption.enabled {
|
||||
tracing::debug!(
|
||||
target: "kebab-parse-image",
|
||||
"captioning skipped — image.caption.enabled = false"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
|
||||
// Build the Provenance note BEFORE moving `caption` into
|
||||
// `block.caption` so we sidestep the per-call `String::clone` of
|
||||
// `caption.model` + `caption.model_version`. Tight ingest loops
|
||||
// (thousands of images) save two allocations per asset.
|
||||
let note = format!(
|
||||
"model={} model_version={}",
|
||||
caption.model, caption.model_version
|
||||
|
claude-reviewer-01
commented
`caption.model.clone()` + `caption.model_version.clone()` 두 번의 String 알로케이션이 발생합니다 (ProvenanceEvent.note 포맷팅 용). caption 을 block.caption 으로 move 하기 전에 note 를 먼저 빌드하면 clone 0회 가능:
```rust
let note = format!(
"model={} model_version={}",
caption.model, caption.model_version
);
block.caption = Some(caption);
events.push(ProvenanceEvent {
at: OffsetDateTime::now_utc(),
agent: "kb-parse-image".to_string(),
kind: ProvenanceKind::CaptionApplied,
note: Some(note),
});
```
사소하지만 hot ingest path (수천 장 이미지 캡션) 에선 미세 차이가 남습니다.
|
||||
);
|
||||
block.caption = Some(caption);
|
||||
events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-image".to_string(),
|
||||
kind: ProvenanceKind::CaptionApplied,
|
||||
note: Some(note),
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compose the `(system, user)` prompt pair for the caption call.
///
/// `lang_hint` is an optional language tag. Only its primary subtag
/// (the text before the first `-` or `_`) is inspected, and the
/// comparison ignores ASCII case, so `"ko"`, `"kor"`, `"KO"` and
/// `"ko-KR"` all select the Korean prompt pair. `None` and every other
/// tag fall through to English.
///
/// Returns `(system_prompt, user_prompt)`.
fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
    // Language tags are case-insensitive and frequently carry a region
    // suffix; matching the bare code only would silently hand Korean
    // documents tagged `ko-KR` the English prompt.
    let wants_korean = lang_hint
        .and_then(|tag| tag.trim().split(|c: char| c == '-' || c == '_').next())
        .map_or(false, |primary| {
            primary.eq_ignore_ascii_case("ko") || primary.eq_ignore_ascii_case("kor")
        });

    if wants_korean {
        (
            "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
             보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
             한 문장만 출력."
                .to_string(),
            "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(),
        )
    } else {
        (
            "Describe the image in one objective sentence. Do not \
             speculate; describe only what is visible. No markdown, \
             no quotes, no commentary — output a single sentence."
                .to_string(),
            "Describe the image above in one English sentence.".to_string(),
        )
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn build_prompt_korean_for_ko_hint() {
|
||||
|
claude-reviewer-01
commented
`downscale_to_png` (caption.rs) 와 P6-2 의 `downscale_to_long_edge` (ocr.rs) 가 거의 동일 알고리즘입니다 — 헤더 sniff → PNG passthrough hot path → 단일 디코드 → resize → PNG re-encode. 차이는 단 두 가지: (a) caption 은 `(w, h)` 를 버리고 `Vec<u8>` 만 반환, (b) 클램프 범위 상수 (caption: [128, 1536], OCR: [256, 4096]).
중복은 두 모듈 사이에 cross-module helper 를 만들 만합니다. 예: `crates/kebab-parse-image/src/image_prep.rs` 같은 private 모듈에:
```rust
pub(crate) fn downscale_to_png(
bytes: &[u8],
max_long_edge: u32,
) -> Result<(Vec<u8>, u32, u32)> { /* OCR 의 본체 그대로 */ }
```
그러면 ocr.rs 도 caption.rs 도 같은 함수를 호출하고, 1px 후행 클램프 / PNG passthrough / 에러 메시지 패턴이 한 곳에서 관리됩니다. 향후 PDF / video thumbnail 등 같은 다운스케일이 필요한 모듈이 합류해도 같은 helper 를 재사용 가능.
본 PR scope 가 P6-3 라 강제는 아니지만, 머지 전에 정리하면 P6-3 와 P6-2 에서 발견될 다운스케일 회귀 (예: 1px 클램프 미적용) 가 한 번에 해결됩니다.
|
||||
let (sys, user) = build_prompt(Some("ko"));
|
||||
assert!(sys.contains("이미지를 한 문장으로"));
|
||||
assert!(user.contains("한국어로"));
|
||||
}
|
||||
|
||||
#[test]
fn build_prompt_english_for_no_hint_or_und() {
    // A missing hint, the undetermined tag `und`, and a non-Korean tag
    // must all select the English prompt pair.
    for hint in [None, Some("und"), Some("en")] {
        let (sys, _) = build_prompt(hint);
        assert!(
            sys.contains("Describe the image"),
            "hint {hint:?} must select the English prompt"
        );
    }
}
|
||||
}
|
||||
189
crates/kebab-parse-image/src/image_prep.rs
Normal file
@@ -0,0 +1,189 @@
|
||||
//! Shared image preparation for any image-to-LM pipeline.
|
||||
//!
|
||||
//! P6-2 OCR and P6-3 caption both need the same pre-LM step: clamp
|
||||
//! the long edge to a configured max, re-encode as PNG (the wire
|
||||
//! format vision channels expect — Ollama's `images: [base64, ...]`
|
||||
//! takes PNG/JPEG, but PNG keeps the alpha + lossless invariant we
|
||||
//! prefer for hand-drawn / screenshot inputs), pass through the
|
||||
//! source bytes when they already satisfy both constraints.
|
||||
//! Centralising this here keeps the 1px-rounding fix, the PNG
|
||||
//! passthrough hot path, and the error messages in one place —
|
||||
//! future image-to-LM channels (PDF page thumbnails, video
|
||||
|
claude-reviewer-01
commented
(작은 doc 권장) 모듈 doc 의 "send to vision models" 표현이 caption / OCR 만 시야에 둔 톤입니다. doc-comment 자체가 "future modules (PDF page thumbnails, video keyframes, …) plug in" 까지 약속하고 있으니 지금부터 "vision pipelines" / "image-to-LM channel" 정도로 일반화해 두면 미래 호출자가 doc 만 보고 호출 의도를 파악합니다. 사소합니다.
claude-reviewer-01
commented
(칭찬) 모듈 doc 의 "image-to-LM pipeline / channel" 일반화가 좋습니다. 향후 PDF page thumbnail / video keyframe 등 같은 다운스케일을 필요로 하는 모듈이 합류할 때, 새 호출자가 doc 만 보고 "이 helper 가 OCR/caption 전용이 아니구나" 를 즉시 파악할 수 있게 됐습니다.
|
||||
//! keyframes, …) plug in without re-deriving the algorithm.
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use image::{ImageFormat, ImageReader};
|
||||
|
||||
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
|
||||
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)` so
|
||||
/// callers that care about the final dimensions (e.g. OCR's
|
||||
/// `SourceSpan::Region`) get them without re-decoding.
|
||||
///
|
||||
/// PNG sources that already fit the cap pass through (zero decodes,
|
||||
/// just a `Vec` clone). Every other path decodes the image exactly
|
||||
/// once: a cheap header sniff peeks at the format / dimensions before
|
||||
/// committing to a decode, so non-PNG passthrough and downscale share
|
||||
/// the same `decode → optionally resize → re-encode` tail.
|
||||
|
claude-reviewer-01
commented
회차 1 에서 추출된 공용 helper 인데, 자체 회귀 테스트가 비어 있습니다. caption / ocr integration test 가 간접 검증을 하긴 하지만, helper 시그니처가 변경되거나 1px 후행 클램프가 무심코 사라져도 두 호출처 모두 그린 머지가 가능 (예: 다운스케일이 1px 초과해도 caption 측 wire 는 그대로 동작).
간단한 unit 테스트 4건 추가 권장:
```rust
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
use image::{ImageBuffer, Rgb};
fn png(w: u32, h: u32) -> Vec<u8> {
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_pixel(w, h, Rgb([0,0,255]));
let mut buf = Cursor::new(Vec::new());
img.write_to(&mut buf, image::ImageFormat::Png).unwrap();
buf.into_inner()
}
#[test]
fn png_within_cap_passes_through_zero_decode() {
let bytes = png(100, 50);
let (out, w, h) = downscale_to_png(&bytes, 1024).unwrap();
assert_eq!((w, h), (100, 50));
assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim");
}
#[test]
fn long_edge_clamped_to_max() {
let bytes = png(4001, 3000);
let (_out, w, h) = downscale_to_png(&bytes, 1601).unwrap();
assert!(w.max(h) <= 1601, "long edge {} > max", w.max(h));
}
#[test]
fn aspect_ratio_preserved_within_rounding() {
let bytes = png(4000, 3000);
let (_out, w, h) = downscale_to_png(&bytes, 1024).unwrap();
let ratio = w as f32 / h as f32;
assert!((ratio - 4.0/3.0).abs() < 0.02, "aspect drift: {ratio}");
}
#[test]
fn corrupt_bytes_return_err() {
let r = downscale_to_png(&[0x89, 0x50, 0x4E, 0x47], 1024);
assert!(r.is_err());
}
}
```
공용 helper 가 워크스페이스의 다음 다운스케일 사용처 (PDF / video) 에도 같은 invariant 를 보장한다는 신호가 됩니다.
|
||||
pub(crate) fn downscale_to_png(
|
||||
bytes: &[u8],
|
||||
max_long_edge: u32,
|
||||
) -> Result<(Vec<u8>, u32, u32)> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions")?;
|
||||
|
||||
let long = w.max(h);
|
||||
|
||||
// Hot path — PNG within budget already matches the wire format we
|
||||
// send to vision models, so we ship the bytes verbatim without
|
||||
// paying for a decode + re-encode round-trip.
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok((bytes.to_vec(), w, h));
|
||||
}
|
||||
|
||||
// Every remaining branch needs the pixels — either to re-encode as
|
||||
// PNG (non-PNG within budget) or to resize first (over budget).
|
||||
// One decode covers both.
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for decode")?
|
||||
.decode()
|
||||
.context("decoding image")?;
|
||||
|
||||
let (final_w, final_h, final_img) = if long <= max_long_edge {
|
||||
(w, h, img)
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
// Independent rounding of the two axes can let `f32`'s
|
||||
// round-to-nearest push the long axis one pixel past
|
||||
// `max_long_edge` for irrational scales (e.g. `max=1601,
|
||||
// long=4001`). Pin the long axis to exactly `max_long_edge`
|
||||
// so the doc-comment's "long edge is at most max_long_edge"
|
||||
// stays a strict bound.
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
let resized =
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
(new_w, new_h, resized)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG")?;
|
||||
Ok((out.into_inner(), final_w, final_h))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::io::Cursor;
|
||||
|
||||
use image::{ImageBuffer, Rgb};
|
||||
|
||||
/// Solid-colour PNG of the given dimensions. Solid colour
|
||||
/// compresses aggressively so even 4001×3001 stays under a few
|
||||
/// kilobytes.
|
||||
fn solid_png(w: u32, h: u32) -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_pixel(w, h, Rgb([0, 0, 255]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, ImageFormat::Png)
|
||||
.expect("encoding solid PNG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
fn solid_jpeg(w: u32, h: u32) -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_pixel(w, h, Rgb([255, 255, 255]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, ImageFormat::Jpeg)
|
||||
.expect("encoding solid JPEG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
/// PNG within budget skips the decode + re-encode round-trip
|
||||
/// entirely. Source bytes survive byte-for-byte.
|
||||
#[test]
|
||||
fn png_within_cap_passes_through_zero_decode() {
|
||||
let bytes = solid_png(100, 50);
|
||||
let (out, w, h) =
|
||||
downscale_to_png(&bytes, 1024).expect("PNG passthrough must succeed");
|
||||
assert_eq!((w, h), (100, 50));
|
||||
assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim");
|
||||
}
|
||||
|
||||
/// JPEG within budget gets re-encoded as PNG (the wire format)
|
||||
/// while preserving dimensions.
|
||||
#[test]
|
||||
fn jpeg_within_cap_reencodes_as_png() {
|
||||
let bytes = solid_jpeg(100, 50);
|
||||
let (out, w, h) =
|
||||
downscale_to_png(&bytes, 1024).expect("JPEG re-encode must succeed");
|
||||
assert_eq!((w, h), (100, 50));
|
||||
// Byte stream must now start with the PNG magic.
|
||||
assert_eq!(
|
||||
&out[..8],
|
||||
&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
|
||||
"output must be PNG-encoded after JPEG input"
|
||||
);
|
||||
}
|
||||
|
||||
/// Pathological irrational scale — `max=1601, long=4001` would let
|
||||
/// independent f32 round-to-nearest push the long axis to 1602.
|
||||
/// The post-resize clamp pins it back to `max_long_edge`.
|
||||
#[test]
|
||||
fn long_edge_clamped_strictly_to_max_for_irrational_scale() {
|
||||
let bytes = solid_png(4001, 3001);
|
||||
let (_out, w, h) =
|
||||
downscale_to_png(&bytes, 1601).expect("downscale must succeed");
|
||||
let long = w.max(h);
|
||||
assert!(long <= 1601, "long edge must be ≤ max, got {long}");
|
||||
}
|
||||
|
||||
/// Aspect ratio survives the downscale within 2%.
|
||||
#[test]
|
||||
fn aspect_ratio_preserved_within_rounding() {
|
||||
let bytes = solid_png(4000, 3000);
|
||||
let (_out, w, h) =
|
||||
downscale_to_png(&bytes, 1024).expect("downscale must succeed");
|
||||
let ratio = w as f32 / h as f32;
|
||||
assert!(
|
||||
(ratio - 4.0 / 3.0).abs() < 0.02,
|
||||
"aspect drift: in=4/3 out={}/{}={ratio}",
|
||||
w,
|
||||
h
|
||||
);
|
||||
}
|
||||
|
||||
/// Input consisting of nothing but the 8-byte PNG signature: the
/// format can be guessed, but there is no header to read dimensions
/// from. This must come back as `Err` so callers can skip the asset
/// with a warning instead of handing a zero-size image downstream.
#[test]
fn corrupt_bytes_return_err() {
    let signature_only: Vec<u8> = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A].to_vec();

    let outcome = downscale_to_png(&signature_only, 1024);

    assert!(outcome.is_err(), "corrupt PNG must surface as Err");
}
|
||||
|
||||
/// Bytes that match no known image format must be rejected with
/// `Err` rather than producing an image.
#[test]
fn unrecognised_bytes_return_err() {
    let garbage = b"definitely not an image";

    let outcome = downscale_to_png(garbage, 1024);

    assert!(outcome.is_err(), "non-image bytes must surface as Err");
}
|
||||
}
|
||||
@@ -13,14 +13,25 @@
|
||||
//! consumers can branch trust by engine (Tesseract / Apple Vision
|
||||
//! adapters, when added, will write a different `engine` string).
|
||||
//!
|
||||
//! P6-3 adds the [`caption`] module: [`caption_image`] /
|
||||
//! [`apply_caption`] route an image through any vision-capable
|
||||
//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
|
||||
//! and will surface a model-side error). Captions are explicitly
|
||||
//! marked **model-generated** — the trust gap between OCR (observed,
|
||||
//! engine-tagged) and caption (generated, prompt-tagged) is the
|
||||
//! workspace's central trust contract.
|
||||
//!
|
||||
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
|
||||
//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
|
||||
//! provenance), §9 (versioning).
|
||||
|
||||
mod dims;
|
||||
mod exif_extract;
|
||||
mod image_prep;
|
||||
pub mod caption;
|
||||
pub mod ocr;
|
||||
|
||||
pub use caption::{apply_caption, caption_image};
|
||||
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
@@ -25,17 +25,17 @@
|
||||
//! field on [`OcrText`] makes the source explicit, so a caller can
|
||||
//! decide whether to trust based on which engine produced the text.
|
||||
|
||||
use std::io::Cursor;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::Engine as _;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use image::{ImageFormat, ImageReader};
|
||||
use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::image_prep;
|
||||
|
||||
/// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
|
||||
pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";
|
||||
|
||||
@@ -239,7 +239,7 @@ impl OcrEngine for OllamaVisionOcr {
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
) -> Result<OcrText> {
|
||||
let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
|
||||
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
|
||||
.context("preparing image for OCR")?;
|
||||
let b64 = BASE64_STANDARD.encode(&prepared);
|
||||
|
||||
@@ -311,71 +311,6 @@ impl OcrEngine for OllamaVisionOcr {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Image preparation ─────────────────────────────────────────────────────
|
||||
|
||||
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
|
||||
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`.
|
||||
///
|
||||
/// PNG sources that already fit the cap are passthrough (zero decodes,
|
||||
/// just a `Vec` clone). Every other path decodes the image exactly
|
||||
/// once: the cheap header sniff peeks at the format / dimensions before
|
||||
/// committing to a decode, so non-PNG passthrough and downscale share
|
||||
/// the same `decode → optionally resize → re-encode` tail.
|
||||
fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header for OCR")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions for OCR")?;
|
||||
|
||||
let long = w.max(h);
|
||||
|
||||
// Hot path — PNG within budget already matches the wire format we
|
||||
// send Ollama, so we ship the bytes verbatim without paying for a
|
||||
// decode + re-encode round-trip.
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok((bytes.to_vec(), w, h));
|
||||
}
|
||||
|
||||
// Every remaining branch needs the pixels — either to re-encode as
|
||||
// PNG (non-PNG within budget) or to resize first (over budget).
|
||||
// One decode covers both.
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for OCR decode")?
|
||||
.decode()
|
||||
.context("decoding image for OCR")?;
|
||||
|
||||
let (final_w, final_h, final_img) = if long <= max_long_edge {
|
||||
(w, h, img)
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
// Independent rounding of the two axes can let `f32`'s nearest
|
||||
// round push the long axis one pixel past `max_long_edge` for
|
||||
// irrational scales (e.g. `max=1601, long=4001`). Pin the long
|
||||
// axis to exactly `max_long_edge` so the doc-comment's
|
||||
// "long edge is at most max_long_edge" stays a strict bound.
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
let resized =
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
(new_w, new_h, resized)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG for OCR")?;
|
||||
Ok((out.into_inner(), final_w, final_h))
|
||||
}
|
||||
|
||||
fn truncate(s: &str, n: usize) -> String {
|
||||
if s.chars().count() <= n {
|
||||
return s.to_string();
|
||||
|
||||
366
crates/kebab-parse-image/tests/caption.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
//! Integration tests for the caption adapter (P6-3).
|
||||
//!
|
||||
//! All hermetic tests use `MockLanguageModel` from `kebab-llm/mock`
|
||||
//! which captures `req.images` indirectly via the canned response. A
|
||||
//! single opt-in test (`#[ignore]`) wires the real
|
||||
//! `kebab-llm-local::OllamaLanguageModel` against the workspace's
|
||||
//! Ollama daemon to verify the `images: [base64]` round-trip.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{
|
||||
AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang,
|
||||
LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk,
|
||||
TokenUsage,
|
||||
};
|
||||
use kebab_llm::MockLanguageModel;
|
||||
use kebab_parse_image::{apply_caption, caption_image};
|
||||
|
||||
use crate::common::red_100x50_png;
|
||||
|
||||
fn cfg_with_caption_enabled() -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.image.caption.enabled = true;
|
||||
cfg.image.caption.max_pixels = 512;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn empty_image_block() -> ImageRefBlock {
|
||||
ImageRefBlock {
|
||||
common: CommonBlock {
|
||||
block_id: BlockId("0".repeat(32)),
|
||||
heading_path: Vec::new(),
|
||||
source_span: SourceSpan::Region {
|
||||
x: 0,
|
||||
y: 0,
|
||||
w: 100,
|
||||
h: 50,
|
||||
},
|
||||
},
|
||||
asset_id: Some(AssetId("a".repeat(32))),
|
||||
src: "img/x.png".to_string(),
|
||||
alt: "x.png".to_string(),
|
||||
ocr: None,
|
||||
caption: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn mk_mock(canned: &str) -> MockLanguageModel {
|
||||
MockLanguageModel {
|
||||
model_id: "vision-mock:1b".to_string(),
|
||||
provider: "mock".to_string(),
|
||||
context_tokens: 4096,
|
||||
canned_response: canned.to_string(),
|
||||
canned_finish: FinishReason::Stop,
|
||||
canned_usage: TokenUsage {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
latency_ms: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ── Disabled feature gate ─────────────────────────────────────────────────
|
||||
|
||||
/// With `image.caption.enabled = false`, `apply_caption` must return
/// `Ok(())` and leave both the block and the provenance list untouched.
#[test]
fn apply_caption_no_op_when_feature_disabled() {
    let cfg = {
        let mut c = Config::defaults();
        c.image.caption.enabled = false;
        c
    };
    let llm = mk_mock("ignored");
    let png = red_100x50_png();
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    let outcome = apply_caption(&llm, &png, &mut block, None, &cfg, &mut events);
    outcome.expect("disabled apply_caption must return Ok(())");

    assert!(
        block.caption.is_none(),
        "disabled apply_caption must not write caption"
    );
    assert!(
        events.is_empty(),
        "disabled apply_caption must not append a Provenance event"
    );
}
|
||||
|
||||
/// The enable flag is enforced by `apply_caption`, not by
/// `caption_image`. Calling the raw operation with the default config
/// (captioning disabled) must still return a caption, so tests can pin
/// its shape independently of pipeline gating.
#[test]
fn caption_image_runs_regardless_of_enabled_flag() {
    // Defaults leave `image.caption.enabled` at `false`.
    let cfg = Config::defaults();
    let png = red_100x50_png();
    let llm = mk_mock("hi");

    let caption = caption_image(&llm, &png, None, &cfg)
        .expect("caption_image runs even when enabled = false");

    assert_eq!(caption.text, "hi");
}
|
||||
|
||||
// ── Happy path ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Happy path: with captioning enabled, `apply_caption` stores the
/// model's text on the block, tags it with the model id and the
/// `<provider>/<prompt version>` string, and appends exactly one
/// `CaptionApplied` provenance event.
#[test]
fn apply_caption_sets_block_caption_and_appends_provenance() {
    let cfg = cfg_with_caption_enabled();
    let llm = mk_mock("사진 한 장");
    let png = red_100x50_png();
    let hint = Lang("ko".to_string());
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    apply_caption(&llm, &png, &mut block, Some(&hint), &cfg, &mut events)
        .expect("apply_caption must succeed");

    // Caption payload written onto the block.
    let cap = block.caption.as_ref().expect("caption Some");
    assert_eq!(cap.text, "사진 한 장");
    assert_eq!(cap.model, "vision-mock:1b");
    assert_eq!(cap.model_version, "mock/caption-v1");

    // Exactly one provenance event describing the captioning step.
    assert_eq!(events.len(), 1);
    let event = &events[0];
    assert_eq!(event.kind, ProvenanceKind::CaptionApplied);
    assert_eq!(event.agent, "kb-parse-image");
    let note = event.note.as_deref().unwrap_or("");
    assert!(
        note.contains("vision-mock:1b") && note.contains("caption-v1"),
        "{note}"
    );
}
|
||||
|
||||
// ── Empty token stream → empty caption text ──────────────────────────────
|
||||
|
||||
/// A model that produces no text still yields a caption value: the
/// text is empty, but the model id is recorded. This lets callers
/// tell "captioning attempted, nothing produced" apart from
/// "captioning never attempted" via `caption.is_some()`.
#[test]
fn caption_image_empty_stream_yields_empty_text() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let silent_llm = mk_mock("");

    let caption =
        caption_image(&silent_llm, &png, None, &cfg).expect("empty stream must succeed");

    assert_eq!(caption.text, "");
    // An empty text must not erase the record of which model was asked.
    assert!(!caption.model.is_empty());
}
|
||||
|
||||
// ── Korean vs English prompt selection ───────────────────────────────────
|
||||
|
||||
/// Test double that records what the caption adapter sends to the
/// model. The prompt builder is private, so inspecting the captured
/// request is how tests observe which language branch was chosen and
/// which images were attached.
struct CapturingMock {
    /// `system` prompt of the most recent request; `None` until a request arrives.
    captured_system: Arc<Mutex<Option<String>>>,
    /// `images` payload of the most recent request.
    captured_images: Arc<Mutex<Vec<String>>>,
}
|
||||
|
||||
impl LanguageModel for CapturingMock {
|
||||
fn model_ref(&self) -> ModelRef {
|
||||
ModelRef {
|
||||
id: "capture:1".to_string(),
|
||||
provider: "mock".to_string(),
|
||||
dimensions: None,
|
||||
}
|
||||
}
|
||||
fn context_tokens(&self) -> usize {
|
||||
4096
|
||||
}
|
||||
fn generate_stream(
|
||||
&self,
|
||||
req: GenerateRequest,
|
||||
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
|
||||
*self.captured_system.lock().unwrap() = Some(req.system);
|
||||
*self.captured_images.lock().unwrap() = req.images;
|
||||
let chunks: Vec<TokenChunk> = vec![
|
||||
TokenChunk::Token("ok".to_string()),
|
||||
TokenChunk::Done {
|
||||
finish_reason: FinishReason::Stop,
|
||||
usage: TokenUsage {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
latency_ms: 0,
|
||||
},
|
||||
},
|
||||
];
|
||||
Ok(Box::new(chunks.into_iter().map(Ok)))
|
||||
}
|
||||
}
|
||||
|
||||
/// `caption_image` must attach exactly one base64-encoded image to
/// the request's `images` field and, for a `ko` language hint, send
/// the Korean system prompt.
#[test]
fn caption_image_routes_image_into_request_images_field() {
    use base64::Engine as _;

    let cfg = cfg_with_caption_enabled();
    let system_slot: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
    let images_slot: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
    let llm = CapturingMock {
        captured_system: Arc::clone(&system_slot),
        captured_images: Arc::clone(&images_slot),
    };
    let png = red_100x50_png();
    let hint = Lang("ko".to_string());

    let _ = caption_image(&llm, &png, Some(&hint), &cfg).expect("caption succeeds");

    // Image channel: one entry, valid base64, non-empty payload.
    let imgs = images_slot.lock().unwrap();
    assert_eq!(imgs.len(), 1, "exactly one base64 image routed");
    let decoded = base64::engine::general_purpose::STANDARD
        .decode(&imgs[0])
        .expect("base64 decodes");
    assert!(
        !decoded.is_empty(),
        "decoded image bytes must be non-empty"
    );

    // Prompt channel: the Korean branch was selected.
    let sys = system_slot.lock().unwrap().clone().unwrap();
    assert!(
        sys.contains("이미지를 한 문장으로"),
        "Korean hint must produce Korean system prompt: {sys}"
    );
}
|
||||
|
||||
/// An undetermined language hint (`und`) must select the English
/// system prompt.
#[test]
fn caption_image_uses_english_prompt_for_undetermined_lang() {
    let cfg = cfg_with_caption_enabled();
    let system_slot: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
    let llm = CapturingMock {
        captured_system: Arc::clone(&system_slot),
        captured_images: Arc::new(Mutex::new(Vec::new())),
    };
    let png = red_100x50_png();
    let hint = Lang("und".to_string());

    let _ = caption_image(&llm, &png, Some(&hint), &cfg).expect("caption succeeds");

    let sys = system_slot.lock().unwrap().clone().unwrap();
    assert!(sys.contains("Describe the image"), "{sys}");
}
|
||||
|
||||
// ── LM error propagates ──────────────────────────────────────────────────
|
||||
|
||||
/// LM that returns Err immediately from `generate_stream` (before any
|
||||
/// token).
|
||||
struct FailingLm;
|
||||
impl LanguageModel for FailingLm {
|
||||
fn model_ref(&self) -> ModelRef {
|
||||
ModelRef {
|
||||
id: "fail".into(),
|
||||
provider: "mock".into(),
|
||||
dimensions: None,
|
||||
}
|
||||
}
|
||||
fn context_tokens(&self) -> usize {
|
||||
0
|
||||
}
|
||||
fn generate_stream(
|
||||
&self,
|
||||
_req: GenerateRequest,
|
||||
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
|
||||
Err(anyhow::anyhow!("simulated LM connection refused"))
|
||||
}
|
||||
}
|
||||
|
||||
/// When the language model fails, `apply_caption` must propagate the
/// error and leave no partial state: no caption on the block and no
/// provenance event.
#[test]
fn apply_caption_lm_error_leaves_block_untouched() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    let outcome = apply_caption(&FailingLm, &png, &mut block, None, &cfg, &mut events);

    assert!(outcome.is_err());
    assert!(
        block.caption.is_none(),
        "caption stays None when LM fails — partial state must not leak"
    );
    assert!(events.is_empty(), "no provenance event when LM fails");
}
|
||||
|
||||
// ── Determinism — identical mock input → identical caption ───────────────
|
||||
|
||||
/// Two independent mocks with the same canned response, given the same
/// image and config, must produce equal captions.
#[test]
fn caption_image_deterministic_with_identical_inputs() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();

    let captions: Vec<_> = (0..2)
        .map(|_| {
            let llm = mk_mock("a deterministic caption");
            caption_image(&llm, &png, None, &cfg).unwrap()
        })
        .collect();

    assert_eq!(captions[0], captions[1]);
}
|
||||
|
||||
// ── max_pixels clamp ─────────────────────────────────────────────────────
|
||||
|
||||
/// An out-of-range `max_pixels` must be clamped rather than failing
/// ingest. The test sets a cap far above `MAX_CAPTION_LONG_EDGE`,
/// captions a 4000×3000 PNG, then decodes the image that was sent to
/// the model and checks its long edge against the constant.
#[test]
fn caption_image_clamps_oversized_max_pixels() {
    use base64::Engine as _;

    let mut cfg = Config::defaults();
    cfg.image.caption.enabled = true;
    cfg.image.caption.max_pixels = 99_999; // way over MAX_CAPTION_LONG_EDGE

    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
    let mock = CapturingMock {
        captured_system: Arc::new(Mutex::new(None)),
        captured_images: captured_images.clone(),
    };

    // Source is larger than MAX_CAPTION_LONG_EDGE, so without the clamp
    // the 99_999 cap would let it through at full size.
    let bytes = common::large_blue_4000x3000_png();
    let _ = caption_image(&mock, &bytes, None, &cfg).expect("caption succeeds");

    let imgs = captured_images.lock().unwrap();
    // Check the length before indexing so a missing image fails with a
    // readable message instead of an index-out-of-bounds panic.
    assert_eq!(imgs.len(), 1, "exactly one base64 image routed");

    let decoded = base64::engine::general_purpose::STANDARD
        .decode(&imgs[0])
        .expect("captured image must be valid base64");
    let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
        .with_guessed_format()
        .expect("captured image header must be readable");
    let (w, h) = reader
        .into_dimensions()
        .expect("captured image must expose dimensions");

    let long = w.max(h);
    assert!(
        long <= kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE,
        "max_pixels must clamp to MAX_CAPTION_LONG_EDGE={}, got {long}",
        kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE
    );
}
|
||||
|
||||
// ── Real Ollama integration (opt-in) ─────────────────────────────────────
|
||||
|
||||
/// End-to-end captioning against the workspace's real Ollama daemon
/// via `kebab-llm-local::OllamaLanguageModel` (dev-dep). Skipped by
/// default via `#[ignore]`; opt in with `--ignored`.
///
/// The endpoint and model come from `KEBAB_MODELS_LLM_ENDPOINT` and
/// `KEBAB_MODELS_LLM_MODEL` when set, otherwise from the fallbacks
/// below. The model assertion checks against whichever model was
/// configured, so overriding the model does not fail the test.
///
/// Run with:
///
/// ```sh
/// KEBAB_MODELS_LLM_ENDPOINT=http://192.168.0.47:11434 \
/// KEBAB_MODELS_LLM_MODEL=gemma4:e4b \
/// cargo test -p kebab-parse-image --test caption \
/// caption_integration -- --ignored --nocapture
/// ```
#[test]
#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"]
fn caption_integration_real_ollama_describes_image() {
    use kebab_llm_local::OllamaLanguageModel;

    let mut cfg = Config::defaults();
    cfg.image.caption.enabled = true;
    cfg.image.caption.max_pixels = 768;
    cfg.models.llm.endpoint = std::env::var("KEBAB_MODELS_LLM_ENDPOINT")
        .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string());
    cfg.models.llm.model =
        std::env::var("KEBAB_MODELS_LLM_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string());
    cfg.models.llm.provider = "ollama".to_string();

    let llm = OllamaLanguageModel::new(&cfg).expect("OllamaLanguageModel::new");
    let bytes = red_100x50_png();
    let cap = caption_image(&llm, &bytes, Some(&Lang("en".to_string())), &cfg)
        .expect("real-Ollama caption_image must succeed");
    eprintln!("integration caption: {}", cap.text);

    assert!(!cap.text.is_empty(), "caption must be non-empty");
    // Compare against the configured model, not a literal, so an
    // environment override does not cause a spurious failure.
    assert_eq!(
        cap.model, cfg.models.llm.model,
        "caption must record the configured model id"
    );
    assert!(cap.model_version.contains("ollama"));
    assert!(cap.model_version.contains("caption-v1"));
}
|
||||
@@ -195,6 +195,9 @@ impl RagPipeline {
|
||||
max_tokens: max_completion,
|
||||
temperature,
|
||||
seed,
|
||||
// RAG is text-only — vision inputs only flow when a
|
||||
// future multimodal pipeline injects images here.
|
||||
images: Vec::new(),
|
||||
};
|
||||
|
||||
let mut acc = String::new();
|
||||
|
||||
@@ -14,6 +14,32 @@ historical contract that was implemented; this file accumulates the
|
||||
deltas so phase 5+ readers can find the live behavior without diffing
|
||||
git history.
|
||||
|
||||
## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped
|
||||
|
||||
**Discovered**: P6-3 implementation start.
|
||||
|
||||
**Symptom 1**: `tasks/p6/p6-3-caption-adapter.md` § Public surface declares `caption_image(llm: &dyn kebab_core::LanguageModel, ...)`, but the frozen `LanguageModel` trait + `GenerateRequest` from p4-1 carry no vision input. The spec's behavior contract ("the adapter is responsible for rendering the prompt to wire") implicitly relied on a trait extension that p4-1 never specced.
|
||||
|
||||
**Symptom 2**: Spec § Definition of Done asks for `cargo check -p kebab-parse-image --features caption` — i.e. a cargo feature gate. The captioning module's only extra deps are `base64` + `image` + the `kebab-llm` trait, all already pulled in by P6-2. A cargo feature would only complicate the build matrix without saving meaningful binary weight.
|
||||
|
||||
**Root cause**: Two small spec gaps that resolve cleanly together — extend the `LanguageModel` trait once for vision routing, and collapse compile-time + runtime gating into a single runtime gate.
|
||||
|
||||
**Fix** (PR #34, feat/p6-3-caption-adapter):
|
||||
- `kebab-core::GenerateRequest` gains an `images: Vec<String>` field (`#[serde(default)]` for backward compat with pre-P6 wire payloads / snapshots). Empty for the text-only RAG path; populated with one or more base64 strings by vision-aware callers.
|
||||
- `kebab-llm-local::OllamaLanguageModel` routes `req.images` onto the wire as `images: [base64, ...]` (Ollama's vision channel). The wire shape stays byte-identical for empty `images` because the field uses `#[serde(skip_serializing_if = "<[String]>::is_empty")]`.
|
||||
- `kebab-parse-image::caption` module: `caption_image` / `apply_caption` build `GenerateRequest { images: vec![b64], temperature: 0.0, seed: 0, ... }` and accept any `&dyn LanguageModel`. Korean / English prompt branch picked from `lang_hint`.
|
||||
- Cargo feature `caption` is **not** introduced — the runtime gate `config.image.caption.enabled = false` (default OFF) suffices.
|
||||
- All existing `GenerateRequest { ... }` literals (kebab-rag, kebab-llm tests, kebab-llm-local tests) gained `images: Vec::new()` to satisfy the new field.
|
||||
|
||||
**Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"<provider>/<prompt_template_version>"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire.
|
||||
|
||||
**`model_version` shape deviation**: spec literal says `model_version: llm.model_ref().provider` (provider as a coarse version proxy). We extend to `<provider>/<prompt_template_version>` because prompt template churn is a real regression vector independent of the model — pinning both axes in one string lets `kebab-eval` (P5) detect either drift without a schema bump. Spec already left the door open ("if a vision model exposes a stable revision, prefer that"); the prompt template version is the closest stable revision we have today. Future PaddleOCR / Apple Vision adapters that expose a real model revision string can substitute it for `prompt_template_version` without breaking the wire shape.
|
||||
|
||||
**Amends**:
|
||||
- tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec<String>`).
|
||||
- tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`).
|
||||
- tasks/p6/p6-3-caption-adapter.md ("Definition of Done" cargo feature `caption` dropped; runtime gate is the only feature gate).
|
||||
|
||||
## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision
|
||||
|
||||
**Discovered**: P6-2 implementation start.
|
||||
|
||||
@@ -3,7 +3,7 @@ phase: P6
|
||||
component: kebab-parse-image (caption adapter)
|
||||
task_id: p6-3
|
||||
title: "ModelCaption adapter (LanguageModel-driven, feature-gated)"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p6-1, p4-2]
|
||||
unblocks: []
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
|
||||
(작은 권장)
`MIN_CAPTION_LONG_EDGE` / `MAX_CAPTION_LONG_EDGE` 가 `const` 인데 visibility 가 module-private 입니다. P6-2 의 `MAX_DECODE_DIM` 은 `pub const` 입니다. 같은 crate 내 다른 모듈이 "caption 의 cap 이 1536 이다" 라고 가정하고 싶을 때 (또는 외부 test/eval 이 정책을 검증할 때) 두 상수도 `pub` 으로 노출하는 게 일관됩니다.
근거: P6-2 의 `MAX_DECODE_DIM` 과 시각적 일관성. tests/caption.rs 의 `99_999 → 1536` 검증도 literal 1536 대신 상수 참조로 갱신할 수 있습니다.