feat(kebab-parse-image): P6-3 caption adapter — vision LM via trait #34

Merged
altair823 merged 3 commits from feat/p6-3-caption-adapter into main 2026-05-02 06:22:19 +00:00
17 changed files with 954 additions and 71 deletions

2
Cargo.lock generated
View File

@@ -3576,6 +3576,8 @@ dependencies = [
"kamadak-exif",
"kebab-config",
"kebab-core",
"kebab-llm",
"kebab-llm-local",
"reqwest",
"serde",
"serde_json",

View File

@@ -105,18 +105,20 @@ pub struct RagCfg {
}
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ImageCfg {
/// OCR settings; filled from `OcrCfg::defaults` when the key is
/// missing during deserialization.
#[serde(default = "OcrCfg::defaults")]
pub ocr: OcrCfg,
/// Caption settings; filled from `CaptionCfg::defaults` when the key
/// is missing during deserialization.
#[serde(default = "CaptionCfg::defaults")]
pub caption: CaptionCfg,
}
impl ImageCfg {
    /// Baseline image settings: the default OCR configuration paired
    /// with the default caption configuration.
    pub fn defaults() -> Self {
        let ocr = OcrCfg::defaults();
        let caption = CaptionCfg::defaults();
        Self { ocr, caption }
    }
}
@@ -162,6 +164,36 @@ impl OcrCfg {
}
}
/// Caption settings (P6-3). Caption uses the same Ollama-vision /
/// `LanguageModel` pipeline as the rest of the workspace; the trait
/// abstraction is the part the spec demands. `enabled` defaults to
/// `false` because captioning costs one model call per asset and the
/// output is model-generated (low trust).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CaptionCfg {
/// Run captioning on every image during ingest. Default `false`.
pub enabled: bool,
/// Cap the long edge of the image (in pixels) before sending. The
/// spec recommends an aggressive 768×768 cap because larger
/// vision-LM inputs translate directly into prompt cost. Default
/// `768`.
pub max_pixels: u32,
/// Caption prompt template version pinned into wire output via
/// `ModelCaption.model_version`. Bump when the prompt changes so
/// downstream eval can detect regressions.
pub prompt_template_version: String,
}
impl CaptionCfg {
    /// Baseline caption settings: captioning switched off, a 768-pixel
    /// long-edge cap, and the `caption-v1` prompt template.
    pub fn defaults() -> Self {
        let prompt_template_version = String::from("caption-v1");
        Self {
            enabled: false,
            max_pixels: 768,
            prompt_template_version,
        }
    }
}
impl Config {
/// Defaults per design §6.4.
pub fn defaults() -> Self {
@@ -417,6 +449,19 @@ impl Config {
}
}
// image.caption (P6-3)
"KEBAB_IMAGE_CAPTION_ENABLED" => {
self.image.caption.enabled = parse_bool(v);
}
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.image.caption.max_pixels = n;
}
}
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
self.image.caption.prompt_template_version = v.clone();
}
// Unknown KEBAB_* keys are silently ignored — see
// `env_unknown_key_is_ignored` test.
_ => {}
@@ -608,6 +653,35 @@ mod tests {
/// Pre-P6 config files don't have an `[image]` section. The
/// `#[serde(default)]` attribute on `Config::image` must let those
/// files load with `ImageCfg::defaults()` instead of erroring.
#[test]
fn image_caption_defaults_disabled() {
    // Captioning must stay opt-in, with the documented cap and prompt
    // template version.
    let cfg = Config::defaults();
    let caption = &cfg.image.caption;
    assert!(!caption.enabled);
    assert_eq!(caption.max_pixels, 768);
    assert_eq!(caption.prompt_template_version, "caption-v1");
}
#[test]
fn image_caption_env_overrides() {
    // One override per `image.caption` field.
    let overrides = [
        ("KEBAB_IMAGE_CAPTION_ENABLED", "true"),
        ("KEBAB_IMAGE_CAPTION_MAX_PIXELS", "1024"),
        ("KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION", "caption-v2"),
    ];
    let env: HashMap<String, String> = overrides
        .iter()
        .map(|&(key, value)| (key.to_string(), value.to_string()))
        .collect();
    let cfg = Config::defaults().apply_env(&env);
    let caption = &cfg.image.caption;
    assert!(caption.enabled);
    assert_eq!(caption.max_pixels, 1024);
    assert_eq!(caption.prompt_template_version, "caption-v2");
}
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
/// rather than to `Some("")` so the fallback to `models.llm.endpoint`
/// kicks in. Covers the env-equivalent of a missing TOML key.

View File

@@ -69,6 +69,17 @@ pub struct GenerateRequest {
pub max_tokens: usize,
pub temperature: f32,
pub seed: Option<u64>,
/// Vision inputs (base64-encoded, one per image). Empty for the
/// text-only path that P4-2 / P4-3 / RAG uses; non-empty when a
/// vision-capable adapter (P6-3 caption, future multimodal RAG)
/// drives the call. The LM adapter is responsible for routing
/// these onto the wire — Ollama uses `images: [base64, ...]`,
/// other backends may differ.
///
/// Defaulted on deserialization so older `*.json` payloads /
/// snapshots that predate the field still parse.
#[serde(default)]
pub images: Vec<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]

View File

@@ -140,9 +140,15 @@ impl LanguageModel for OllamaLanguageModel {
format!("{}\n\n{}", req.system, req.user)
};
// Vision inputs (P6-3) flow through the request via Ollama's
// `images: [base64, ...]` field. Empty for the text-only RAG
// path so older snapshots and JSON dumps stay byte-identical
// (`OllamaRequest::images` carries `#[serde(skip_serializing_if)]`,
// so the key is omitted from the wire when the slice is empty).
let body = OllamaRequest {
model: &self.model_id,
prompt,
images: &req.images,
stream: true,
options: OllamaOptions {
temperature: effective_temperature,
@@ -188,6 +194,13 @@ impl LanguageModel for OllamaLanguageModel {
/// Request body assembled by `generate_stream` for the Ollama backend.
/// Borrows from the adapter and the incoming request where it can so
/// building the body does not copy the model id or the image list.
struct OllamaRequest<'a> {
/// Model identifier, borrowed from the adapter's `model_id`.
model: &'a str,
/// Prompt text sent to the model.
// NOTE(review): the caller appears to join `req.system` and
// `req.user` with a blank line in at least one branch — the full
// prompt-building code is not visible here; confirm.
prompt: String,
/// Skipped from the JSON when empty so the text-only path keeps
/// the same on-the-wire shape it had pre-P6-3 (`{"model": ...,
/// "prompt": ..., "stream": ..., "options": ...}` — no `images`
/// key). Vision-capable callers populate this with one or more
/// base64-encoded images.
#[serde(skip_serializing_if = "<[String]>::is_empty")]
images: &'a [String],
/// Streaming flag; `generate_stream` sets it to `true`.
stream: bool,
/// Sampling options (temperature and the other `OllamaOptions`
/// fields).
options: OllamaOptions<'a>,
}

View File

@@ -31,6 +31,7 @@ fn real_ollama_streams_non_empty_response() {
max_tokens: 8,
temperature: 0.0,
seed: Some(0),
images: Vec::new(),
};
let stream = llm.generate_stream(req).expect("stream should start");

View File

@@ -35,6 +35,7 @@ fn sample_request() -> GenerateRequest {
max_tokens: 64,
temperature: 0.0,
seed: Some(0),
images: Vec::new(),
}
}

View File

@@ -26,6 +26,7 @@ fn req_with_stop(stop: Vec<&str>) -> GenerateRequest {
max_tokens: 64,
temperature: 0.0,
seed: None,
images: Vec::new(),
}
}

View File

@@ -55,6 +55,7 @@ fn dyn_dispatch_via_box_works() {
max_tokens: 16,
temperature: 0.0,
seed: None,
images: Vec::new(),
};
let stream = m.generate_stream(req).expect("stream");
let chunks: Vec<TokenChunk> = stream.map(|r| r.expect("ok chunk")).collect();

View File

@@ -10,6 +10,12 @@ description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipe
[dependencies]
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`)
# under a stable surface; the caption adapter consumes any
# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` (forbidden
# by p6-3 design §8) — the trait abstraction is exactly what spec
# requires.
kebab-llm = { path = "../kebab-llm" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
@@ -42,3 +48,10 @@ tokio = { workspace = true, features = ["rt-multi-thread"] }
# font rendering.
ab_glyph = "0.2"
base64 = "0.22"
# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption
# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is
# only allowed at the dev-dep level here — the runtime crate stays
# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local`
# at runtime) is preserved.
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
kebab-llm-local = { path = "../kebab-llm-local" }

View File

@@ -0,0 +1,236 @@
//! Caption adapter (P6-3).
//!
//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
//! helper that mutates an [`ImageRefBlock`] in place and emits a
//! [`ProvenanceKind::CaptionApplied`] event.
//!
//! ## Trust note
//!
//! Captions are **model-generated** (`TrustLevel::Generated`), not
//! observed text. Vision LMs hallucinate; the system prompt explicitly
//! forbids guessing but expect false captions. Downstream UI / RAG
//! must label captions as model-generated and surface the model id +
//! prompt template version (carried in `ModelCaption.model_version`)
//! so a regression in either is auditable.
//!
//! ## Spec deviation (cargo `caption` feature dropped)
//!
//! The original P6-3 spec asked for a cargo feature `caption` (default
//! OFF at compile time). We collapse this into a single runtime gate
//! (`config.image.caption.enabled = false`, default OFF). Reasoning:
//! the captioning module's only extra deps are `base64` + `image` +
//! `kebab-llm` trait — all already pulled in by the rest of the
//! crate. A cargo feature would only complicate the build matrix
//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
//! (2026-05-02) for the deviation log.
use anyhow::{Context, Result};
use base64::Engine as _;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use kebab_core::{
FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
ProvenanceEvent, ProvenanceKind, TokenChunk,
};
use time::OffsetDateTime;
use crate::image_prep;
/// Long-edge clamp range for caption inputs. Smaller than OCR's
/// `[256, 4096]` because vision LMs charge proportionally to input
/// dimension — captions tolerate aggressive downscale better than
/// OCR.
pub const MIN_CAPTION_LONG_EDGE: u32 = 128;
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;

(작은 권장) `MIN_CAPTION_LONG_EDGE` / `MAX_CAPTION_LONG_EDGE` 가 `const` 인데 visibility 가 module-private 입니다. P6-2 의 `MAX_DECODE_DIM` 은 `pub const`. 같은 crate 내 다른 모듈이 "caption 의 cap 이 1536 이라" 가정하고 싶을 때 (또는 외부 test/eval 이 정책을 검증할 때) 두 상수도 `pub` 으로 노출하는 게 일관됩니다.

pub const MIN_CAPTION_LONG_EDGE: u32 = 128;
pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;

P6-2 의 MAX_DECODE_DIM 과 시각적 일관성. tests/caption.rs 의 99_999 → 1536 검증도 literal 1536 대신 상수 참조로 갱신할 수 있습니다.

(작은 권장) `MIN_CAPTION_LONG_EDGE` / `MAX_CAPTION_LONG_EDGE` 가 `const` 인데 visibility 가 module-private 입니다. P6-2 의 `MAX_DECODE_DIM` 은 `pub const`. 같은 crate 내 다른 모듈이 "caption 의 cap 이 1536 이라" 가정하고 싶을 때 (또는 외부 test/eval 이 정책을 검증할 때) 두 상수도 `pub` 으로 노출하는 게 일관됩니다. ```rust pub const MIN_CAPTION_LONG_EDGE: u32 = 128; pub const MAX_CAPTION_LONG_EDGE: u32 = 1536; ``` P6-2 의 `MAX_DECODE_DIM` 과 시각적 일관성. tests/caption.rs 의 `99_999 → 1536` 검증도 literal 1536 대신 상수 참조로 갱신할 수 있습니다.
/// Token budget for captions. Captions are one-sentence by spec — 96
/// tokens covers a 50-word English sentence or a 30-token Korean one
/// with headroom for the LM's preamble before the stop sequence.
const CAPTION_MAX_TOKENS: usize = 96;
/// Run a caption pass and return the resulting `ModelCaption`.
///
/// Pure raw operation — does **not** consult `config.image.caption.enabled`.
/// The runtime feature gate lives in [`apply_caption`]; this entry
/// always invokes the LM. Tests pinning the produced `ModelCaption`
/// shape can call this directly without flipping the config flag.
///
/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp
/// on `config.image.caption.max_pixels` so a hostile config cannot
/// blow up prompt cost.

`caption_image` 와 `apply_caption` 의 "disabled 처리" 가 비대칭입니다:

  • `caption_image(...)` → `enabled = false` 면 **Err** ("disabled" 메시지).
  • `apply_caption(...)` → `enabled = false` 면 **Ok(())** (no-op).

같은 config 상태에서 두 entry 가 다른 결과를 내는 건 호출자 입장에서 헷갈립니다. 함수의 책임 분리 관점에서 더 깔끔한 형태:

  • caption_image = "이미지를 캡션해 줘" 의 raw 연산. 항상 LM 호출, gate 검사 없음.
  • apply_caption = pipeline 진입점. config 게이트 + 결과 반영 + provenance.

이렇게 하면 disabled 인데 caption_image 를 직접 부르는 caller (현재로선 테스트뿐) 가 "왜 Err 반환?" 에 헷갈릴 일이 없고, doc-comment 의 "this lower-level entry exists so tests can pin the produced ModelCaption" 도 더 정직해집니다 — 테스트는 enabled flag 없이 caption 동작 자체를 검증할 수 있어야 자연스러움.

`caption_image` 와 `apply_caption` 의 "disabled 처리" 가 비대칭입니다: - `caption_image(...)` → `enabled = false` 면 **Err** ("disabled" 메시지). - `apply_caption(...)` → `enabled = false` 면 **Ok(())** (no-op). 같은 config 상태에서 두 entry 가 다른 결과를 내는 건 호출자 입장에서 헷갈립니다. 함수의 책임 분리 관점에서 더 깔끔한 형태: - `caption_image` = "이미지를 캡션해 줘" 의 raw 연산. **항상 LM 호출**, gate 검사 없음. - `apply_caption` = pipeline 진입점. config 게이트 + 결과 반영 + provenance. 이렇게 하면 disabled 인데 caption_image 를 직접 부르는 caller (현재로선 테스트뿐) 가 "왜 Err 반환?" 에 헷갈릴 일이 없고, doc-comment 의 "this lower-level entry exists so tests can pin the produced ModelCaption" 도 더 정직해집니다 — 테스트는 enabled flag 없이 caption 동작 자체를 검증할 수 있어야 자연스러움.

(칭찬) caption_image (raw 연산) / apply_caption (gate + provenance) 의 책임 분리가 코드 양 자체를 줄였고, 더 중요하게는 "같은 config 에서 두 함수가 다른 결과" 라는 인지 부담을 제거했습니다. P+ 에서 caption 외 다른 vision adapter (예: alt-text generator) 가 비슷한 책임 분리를 따라가는 패턴이 됩니다.

(칭찬) `caption_image` (raw 연산) / `apply_caption` (gate + provenance) 의 책임 분리가 코드 양 자체를 줄였고, 더 중요하게는 "같은 config 에서 두 함수가 다른 결과" 라는 인지 부담을 제거했습니다. P+ 에서 caption 외 다른 vision adapter (예: alt-text generator) 가 비슷한 책임 분리를 따라가는 패턴이 됩니다.
pub fn caption_image(
llm: &dyn LanguageModel,
image_bytes: &[u8],
lang_hint: Option<&Lang>,
cfg: &kebab_config::Config,
) -> Result<ModelCaption> {
let max_pixels = cfg
.image
.caption
.max_pixels
.clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
if max_pixels != cfg.image.caption.max_pixels {
tracing::warn!(
target: "kebab-parse-image",
"image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
cfg.image.caption.max_pixels,
max_pixels,
MIN_CAPTION_LONG_EDGE,
MAX_CAPTION_LONG_EDGE
);
}
let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels)
.context("preparing image for caption")?;
let b64 = BASE64_STANDARD.encode(&prepared);
let lang = lang_hint
.map(|l| l.0.as_str())
.filter(|s| !s.is_empty() && *s != "und");
let (system, user) = build_prompt(lang);
// Determinism — temperature 0.0 + seed 0, same convention as RAG
// and OCR. The LM adapter routes the base64 image via its
// provider-specific channel (Ollama: `images: [base64]`).
let req = GenerateRequest {
system,
user,
stop: vec!["\n\n".to_string()],
max_tokens: CAPTION_MAX_TOKENS,
temperature: 0.0,
seed: Some(0),
images: vec![b64],
};
let stream = llm
.generate_stream(req)
.context("captioning LM call failed")?;
let mut text = String::new();
let mut saw_done = false;
for chunk in stream {
match chunk? {
TokenChunk::Token(t) => {
text.push_str(&t);
}
TokenChunk::Done { finish_reason, .. } => {
saw_done = true;
if let FinishReason::Error(e) = finish_reason {
anyhow::bail!("captioning LM ended with error: {e}");
}
break;
}
}
}
if !saw_done {
anyhow::bail!("captioning LM stream ended without a Done frame");
}
let caption_text = text.trim().to_string();
let model_ref = llm.model_ref();
let prompt_v = &cfg.image.caption.prompt_template_version;
let model_version = format!(
"{provider}/{prompt}",
provider = model_ref.provider,
prompt = prompt_v
);

model_version = "<provider>/<prompt_template_version>" (예: "ollama/caption-v1") 는 spec 의 literal model_version: llm.model_ref().provider 와 다릅니다. spec 자체가 "if a vision model exposes a stable revision, prefer that" 로 유연하게 열어 둔 부분이라 합리적 deviation 이지만, HOTFIXES.md 의 P6-3 항목에서 이 쪽 결정은 명시되지 않았습니다.

둘 중 하나로 정리:

  1. (선호) HOTFIXES.md 의 P6-3 항목에 한 줄 추가 — "`model_version` 을 `provider` 단독에서 `<provider>/<prompt_template_version>` 으로 확장. prompt 회귀와 모델 회귀를 별도 축으로 추적 가능."
  2. spec literal 로 후퇴 (provider 만), prompt_template_version 은 별도 필드 (예: provenance note) 에 박기.

현재 구현이 분명히 더 유용해서 (1번) 으로 가는 게 맞아 보이지만, 결정을 audit log 에 박아 두면 P+ 에서 다른 어댑터 (PaddleOCR / Apple Vision 등) 가 model_version 을 어떻게 채울지 컨벤션이 한 줄로 잡힙니다.

`model_version = "<provider>/<prompt_template_version>"` (예: `"ollama/caption-v1"`) 는 spec 의 literal `model_version: llm.model_ref().provider` 와 다릅니다. spec 자체가 "if a vision model exposes a stable revision, prefer that" 로 유연하게 열어 둔 부분이라 합리적 deviation 이지만, HOTFIXES.md 의 P6-3 항목에서 이 쪽 결정은 명시되지 않았습니다. 둘 중 하나로 정리: 1. (선호) HOTFIXES.md 의 P6-3 항목에 한 줄 추가 — "`model_version` 을 `provider` 단독에서 `<provider>/<prompt_template_version>` 으로 확장. prompt 회귀와 모델 회귀를 별도 축으로 추적 가능." 2. spec literal 로 후퇴 (`provider` 만), prompt_template_version 은 별도 필드 (예: provenance note) 에 박기. 현재 구현이 분명히 더 유용해서 (1번) 으로 가는 게 맞아 보이지만, 결정을 audit log 에 박아 두면 P+ 에서 다른 어댑터 (PaddleOCR / Apple Vision 등) 가 model_version 을 어떻게 채울지 컨벤션이 한 줄로 잡힙니다.
tracing::debug!(
target: "kebab-parse-image",
"caption ok (model={}, prompt={}, chars={})",
model_ref.id,
prompt_v,
caption_text.chars().count()
);
Ok(ModelCaption {
text: caption_text,
model: model_ref.id,
model_version,
})
}
/// Pipeline entry point — gate-checks `config.image.caption.enabled`
/// then mutates `block.caption` in place via [`caption_image`].
///
/// When `enabled = false` the function is a clean no-op (returns
/// `Ok(())` without invoking the LM and without writing a Provenance
/// event). On LM failure `block.caption` stays `None` — partial state
/// is never written. The caller decides whether to skip the asset or
/// surface the error.
pub fn apply_caption(
llm: &dyn LanguageModel,
image_bytes: &[u8],
block: &mut ImageRefBlock,
lang_hint: Option<&Lang>,
cfg: &kebab_config::Config,
events: &mut Vec<ProvenanceEvent>,
) -> Result<()> {
if !cfg.image.caption.enabled {
tracing::debug!(
target: "kebab-parse-image",
"captioning skipped — image.caption.enabled = false"
);
return Ok(());
}
let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
// Build the Provenance note BEFORE moving `caption` into
// `block.caption` so we sidestep the per-call `String::clone` of
// `caption.model` + `caption.model_version`. Tight ingest loops
// (thousands of images) save two allocations per asset.
let note = format!(
"model={} model_version={}",
caption.model, caption.model_version

caption.model.clone() + caption.model_version.clone() 두 번의 String 알로케이션이 발생합니다 (ProvenanceEvent.note 포맷팅 용). caption 을 block.caption 으로 move 하기 전에 note 를 먼저 빌드하면 clone 0회 가능:

let note = format!(
    "model={} model_version={}",
    caption.model, caption.model_version
);
block.caption = Some(caption);
events.push(ProvenanceEvent {
    at: OffsetDateTime::now_utc(),
    agent: "kb-parse-image".to_string(),
    kind: ProvenanceKind::CaptionApplied,
    note: Some(note),
});

사소하지만 hot ingest path (수천 장 이미지 캡션) 에선 미세 차이가 남습니다.

`caption.model.clone()` + `caption.model_version.clone()` 두 번의 String 알로케이션이 발생합니다 (ProvenanceEvent.note 포맷팅 용). caption 을 block.caption 으로 move 하기 전에 note 를 먼저 빌드하면 clone 0회 가능: ```rust let note = format!( "model={} model_version={}", caption.model, caption.model_version ); block.caption = Some(caption); events.push(ProvenanceEvent { at: OffsetDateTime::now_utc(), agent: "kb-parse-image".to_string(), kind: ProvenanceKind::CaptionApplied, note: Some(note), }); ``` 사소하지만 hot ingest path (수천 장 이미지 캡션) 에선 미세 차이가 남습니다.
);
block.caption = Some(caption);
events.push(ProvenanceEvent {
at: OffsetDateTime::now_utc(),
agent: "kb-parse-image".to_string(),
kind: ProvenanceKind::CaptionApplied,
note: Some(note),
});
Ok(())
}
/// Compose the `(system, user)` prompt pair for the caption call.
///
/// The hint's primary language subtag (everything before the first
/// `-` or `_`) is compared case-insensitively against `ko` / `kor`,
/// so region- or script-qualified hints such as `ko-KR`, `ko_KR` or
/// `KO` keep the model on Korean output instead of silently falling
/// back to English. Every other hint — including `None` and the empty
/// string — selects the English prompt.
///
/// Returns owned strings because the caller moves them straight into
/// the generate request.
fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
    // Only the primary subtag decides the language; region and script
    // suffixes are irrelevant for picking a prompt.
    let primary = lang_hint.and_then(|tag| tag.split(|c: char| c == '-' || c == '_').next());
    let wants_korean = primary.is_some_and(|subtag| {
        subtag.eq_ignore_ascii_case("ko") || subtag.eq_ignore_ascii_case("kor")
    });

    if wants_korean {
        (
            "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
             보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
             한 문장만 출력."
                .to_string(),
            "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(),
        )
    } else {
        (
            "Describe the image in one objective sentence. Do not \
             speculate; describe only what is visible. No markdown, \
             no quotes, no commentary — output a single sentence."
                .to_string(),
            "Describe the image above in one English sentence.".to_string(),
        )
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn build_prompt_korean_for_ko_hint() {

downscale_to_png (caption.rs) 와 P6-2 의 downscale_to_long_edge (ocr.rs) 가 거의 동일 알고리즘입니다 — 헤더 sniff → PNG passthrough hot path → 단일 디코드 → resize → PNG re-encode. 차이는 단 두 가지: (a) caption 은 (w, h) 를 버리고 Vec<u8> 만 반환, (b) 클램프 범위 상수 (caption: [128, 1536], OCR: [256, 4096]).

중복은 두 모듈 사이에 cross-module helper 를 만들 만합니다. 예: crates/kebab-parse-image/src/image_prep.rs 같은 private 모듈에:

pub(crate) fn downscale_to_png(
    bytes: &[u8],
    max_long_edge: u32,
) -> Result<(Vec<u8>, u32, u32)> { /* OCR 의 본체 그대로 */ }

그러면 ocr.rs 도 caption.rs 도 같은 함수를 호출하고, 1px 후행 클램프 / PNG passthrough / 에러 메시지 패턴이 한 곳에서 관리됩니다. 향후 PDF / video thumbnail 등 같은 다운스케일이 필요한 모듈이 합류해도 같은 helper 를 재사용 가능.

본 PR scope 가 P6-3 라 강제는 아니지만, 머지 전에 정리하면 P6-3 와 P6-2 에서 발견될 다운스케일 회귀 (예: 1px 클램프 미적용) 가 한 번에 해결됩니다.

`downscale_to_png` (caption.rs) 와 P6-2 의 `downscale_to_long_edge` (ocr.rs) 가 거의 동일 알고리즘입니다 — 헤더 sniff → PNG passthrough hot path → 단일 디코드 → resize → PNG re-encode. 차이는 단 두 가지: (a) caption 은 `(w, h)` 를 버리고 `Vec<u8>` 만 반환, (b) 클램프 범위 상수 (caption: [128, 1536], OCR: [256, 4096]). 중복은 두 모듈 사이에 cross-module helper 를 만들 만합니다. 예: `crates/kebab-parse-image/src/image_prep.rs` 같은 private 모듈에: ```rust pub(crate) fn downscale_to_png( bytes: &[u8], max_long_edge: u32, ) -> Result<(Vec<u8>, u32, u32)> { /* OCR 의 본체 그대로 */ } ``` 그러면 ocr.rs 도 caption.rs 도 같은 함수를 호출하고, 1px 후행 클램프 / PNG passthrough / 에러 메시지 패턴이 한 곳에서 관리됩니다. 향후 PDF / video thumbnail 등 같은 다운스케일이 필요한 모듈이 합류해도 같은 helper 를 재사용 가능. 본 PR scope 가 P6-3 라 강제는 아니지만, 머지 전에 정리하면 P6-3 와 P6-2 에서 발견될 다운스케일 회귀 (예: 1px 클램프 미적용) 가 한 번에 해결됩니다.
let (sys, user) = build_prompt(Some("ko"));
assert!(sys.contains("이미지를 한 문장으로"));
assert!(user.contains("한국어로"));
}
#[test]
fn build_prompt_english_for_no_hint_or_und() {
let (sys, _) = build_prompt(None);
assert!(sys.contains("Describe the image"));
let (sys2, _) = build_prompt(Some("en"));
assert!(sys2.contains("Describe the image"));
}
}

View File

@@ -0,0 +1,189 @@
//! Shared image preparation for any image-to-LM pipeline.
//!
//! P6-2 OCR and P6-3 caption both need the same pre-LM step: clamp
//! the long edge to a configured max, re-encode as PNG (the wire
//! format vision channels expect — Ollama's `images: [base64, ...]`
//! takes PNG/JPEG, but PNG keeps the alpha + lossless invariant we
//! prefer for hand-drawn / screenshot inputs), pass through the
//! source bytes when they already satisfy both constraints.
//! Centralising this here keeps the 1px-rounding fix, the PNG
//! passthrough hot path, and the error messages in one place —
//! future image-to-LM channels (PDF page thumbnails, video

(작은 doc 권장) 모듈 doc 의 "send to vision models" 표현이 caption / OCR 만 시야에 둔 톤입니다. doc-comment 자체가 "future modules (PDF page thumbnails, video keyframes, …) plug in" 까지 약속하고 있으니 지금부터 "vision pipelines" / "image-to-LM channel" 정도로 일반화해 두면 미래 호출자가 doc 만 보고 호출 의도를 파악합니다. 사소합니다.

(작은 doc 권장) 모듈 doc 의 "send to vision models" 표현이 caption / OCR 만 시야에 둔 톤입니다. doc-comment 자체가 "future modules (PDF page thumbnails, video keyframes, …) plug in" 까지 약속하고 있으니 지금부터 "vision pipelines" / "image-to-LM channel" 정도로 일반화해 두면 미래 호출자가 doc 만 보고 호출 의도를 파악합니다. 사소합니다.

(칭찬) 모듈 doc 의 "image-to-LM pipeline / channel" 일반화가 좋습니다. 향후 PDF page thumbnail / video keyframe 등 같은 다운스케일을 필요로 하는 모듈이 합류할 때, 새 호출자가 doc 만 보고 "이 helper 가 OCR/caption 전용이 아니구나" 를 즉시 파악할 수 있게 됐습니다.

(칭찬) 모듈 doc 의 "image-to-LM pipeline / channel" 일반화가 좋습니다. 향후 PDF page thumbnail / video keyframe 등 같은 다운스케일을 필요로 하는 모듈이 합류할 때, 새 호출자가 doc 만 보고 "이 helper 가 OCR/caption 전용이 아니구나" 를 즉시 파악할 수 있게 됐습니다.
//! keyframes, …) plug in without re-deriving the algorithm.
use std::io::Cursor;
use anyhow::{Context, Result};
use image::{ImageFormat, ImageReader};
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)` so
/// callers that care about the final dimensions (e.g. OCR's
/// `SourceSpan::Region`) get them without re-decoding.
///
/// PNG sources that already fit the cap pass through (zero decodes,
/// just a `Vec` clone). Every other path decodes the image exactly
/// once: a cheap header sniff peeks at the format / dimensions before
/// committing to a decode, so non-PNG passthrough and downscale share
/// the same `decode → optionally resize → re-encode` tail.

회차 1 에서 추출된 공용 helper 인데, 자체 회귀 테스트가 비어 있습니다. caption / ocr integration test 가 간접 검증을 하긴 하지만, helper 시그니처가 변경되거나 1px 후행 클램프가 무심코 사라져도 두 호출처 모두 그린 머지가 가능 (예: 다운스케일이 1px 초과해도 caption 측 wire 는 그대로 동작).

간단한 unit 테스트 4건 추가 권장:

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use image::{ImageBuffer, Rgb};

    fn png(w: u32, h: u32) -> Vec<u8> {
        let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_pixel(w, h, Rgb([0,0,255]));
        let mut buf = Cursor::new(Vec::new());
        img.write_to(&mut buf, image::ImageFormat::Png).unwrap();
        buf.into_inner()
    }

    #[test]
    fn png_within_cap_passes_through_zero_decode() {
        let bytes = png(100, 50);
        let (out, w, h) = downscale_to_png(&bytes, 1024).unwrap();
        assert_eq!((w, h), (100, 50));
        assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim");
    }

    #[test]
    fn long_edge_clamped_to_max() {
        let bytes = png(4001, 3000);
        let (_out, w, h) = downscale_to_png(&bytes, 1601).unwrap();
        assert!(w.max(h) <= 1601, "long edge {} > max", w.max(h));
    }

    #[test]
    fn aspect_ratio_preserved_within_rounding() {
        let bytes = png(4000, 3000);
        let (_out, w, h) = downscale_to_png(&bytes, 1024).unwrap();
        let ratio = w as f32 / h as f32;
        assert!((ratio - 4.0/3.0).abs() < 0.02, "aspect drift: {ratio}");
    }

    #[test]
    fn corrupt_bytes_return_err() {
        let r = downscale_to_png(&[0x89, 0x50, 0x4E, 0x47], 1024);
        assert!(r.is_err());
    }
}

공용 helper 가 워크스페이스의 다음 다운스케일 사용처 (PDF / video) 에도 같은 invariant 를 보장한다는 신호가 됩니다.

회차 1 에서 추출된 공용 helper 인데, 자체 회귀 테스트가 비어 있습니다. caption / ocr integration test 가 간접 검증을 하긴 하지만, helper 시그니처가 변경되거나 1px 후행 클램프가 무심코 사라져도 두 호출처 모두 그린 머지가 가능 (예: 다운스케일이 1px 초과해도 caption 측 wire 는 그대로 동작). 간단한 unit 테스트 4건 추가 권장: ```rust #[cfg(test)] mod tests { use super::*; use std::io::Cursor; use image::{ImageBuffer, Rgb}; fn png(w: u32, h: u32) -> Vec<u8> { let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_pixel(w, h, Rgb([0,0,255])); let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, image::ImageFormat::Png).unwrap(); buf.into_inner() } #[test] fn png_within_cap_passes_through_zero_decode() { let bytes = png(100, 50); let (out, w, h) = downscale_to_png(&bytes, 1024).unwrap(); assert_eq!((w, h), (100, 50)); assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim"); } #[test] fn long_edge_clamped_to_max() { let bytes = png(4001, 3000); let (_out, w, h) = downscale_to_png(&bytes, 1601).unwrap(); assert!(w.max(h) <= 1601, "long edge {} > max", w.max(h)); } #[test] fn aspect_ratio_preserved_within_rounding() { let bytes = png(4000, 3000); let (_out, w, h) = downscale_to_png(&bytes, 1024).unwrap(); let ratio = w as f32 / h as f32; assert!((ratio - 4.0/3.0).abs() < 0.02, "aspect drift: {ratio}"); } #[test] fn corrupt_bytes_return_err() { let r = downscale_to_png(&[0x89, 0x50, 0x4E, 0x47], 1024); assert!(r.is_err()); } } ``` 공용 helper 가 워크스페이스의 다음 다운스케일 사용처 (PDF / video) 에도 같은 invariant 를 보장한다는 신호가 됩니다.
pub(crate) fn downscale_to_png(
bytes: &[u8],
max_long_edge: u32,
) -> Result<(Vec<u8>, u32, u32)> {
let reader = ImageReader::new(Cursor::new(bytes))
.with_guessed_format()
.context("reading image header")?;
let format = reader.format();
let (w, h) = reader
.into_dimensions()
.context("reading image dimensions")?;
let long = w.max(h);
// Hot path — PNG within budget already matches the wire format we
// send to vision models, so we ship the bytes verbatim without
// paying for a decode + re-encode round-trip.
if long <= max_long_edge && format == Some(ImageFormat::Png) {
return Ok((bytes.to_vec(), w, h));
}
// Every remaining branch needs the pixels — either to re-encode as
// PNG (non-PNG within budget) or to resize first (over budget).
// One decode covers both.
let img = ImageReader::new(Cursor::new(bytes))
.with_guessed_format()
.context("re-reading image for decode")?
.decode()
.context("decoding image")?;
let (final_w, final_h, final_img) = if long <= max_long_edge {
(w, h, img)
} else {
let scale = max_long_edge as f32 / long as f32;
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
// Independent rounding of the two axes can let `f32`'s
// round-to-nearest push the long axis one pixel past
// `max_long_edge` for irrational scales (e.g. `max=1601,
// long=4001`). Pin the long axis to exactly `max_long_edge`
// so the doc-comment's "long edge is at most max_long_edge"
// stays a strict bound.
if w >= h {
new_w = new_w.min(max_long_edge);
} else {
new_h = new_h.min(max_long_edge);
}
let resized =
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
(new_w, new_h, resized)
};
let mut out = Cursor::new(Vec::new());
final_img
.write_to(&mut out, ImageFormat::Png)
.context("encoding image as PNG")?;
Ok((out.into_inner(), final_w, final_h))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use image::{ImageBuffer, Rgb};

    /// Eight-byte signature that opens every PNG stream.
    const PNG_MAGIC: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];

    /// Encode a single-colour RGB canvas in `format`, panicking with
    /// `on_failure` if the encoder rejects it.
    fn encode_solid(
        w: u32,
        h: u32,
        rgb: [u8; 3],
        format: ImageFormat,
        on_failure: &str,
    ) -> Vec<u8> {
        let canvas: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_pixel(w, h, Rgb(rgb));
        let mut sink = Cursor::new(Vec::new());
        canvas.write_to(&mut sink, format).expect(on_failure);
        sink.into_inner()
    }

    /// Solid blue PNG of the given size. A flat colour compresses
    /// hard, so even 4001×3001 stays within a few kilobytes.
    fn solid_png(w: u32, h: u32) -> Vec<u8> {
        encode_solid(
            w,
            h,
            [0, 0, 255],
            ImageFormat::Png,
            "encoding solid PNG must not fail",
        )
    }

    /// Solid white JPEG of the given size.
    fn solid_jpeg(w: u32, h: u32) -> Vec<u8> {
        encode_solid(
            w,
            h,
            [255, 255, 255],
            ImageFormat::Jpeg,
            "encoding solid JPEG must not fail",
        )
    }

    /// A PNG already inside the budget must come back byte-for-byte —
    /// no decode, no re-encode.
    #[test]
    fn png_within_cap_passes_through_zero_decode() {
        let source = solid_png(100, 50);
        let (encoded, width, height) =
            downscale_to_png(&source, 1024).expect("PNG passthrough must succeed");
        assert_eq!((width, height), (100, 50));
        assert_eq!(encoded, source, "PNG passthrough must return source bytes verbatim");
    }

    /// A JPEG inside the budget keeps its dimensions but is converted
    /// to PNG, the wire format.
    #[test]
    fn jpeg_within_cap_reencodes_as_png() {
        let source = solid_jpeg(100, 50);
        let (encoded, width, height) =
            downscale_to_png(&source, 1024).expect("JPEG re-encode must succeed");
        assert_eq!((width, height), (100, 50));
        // The output has to open with the PNG signature.
        assert_eq!(
            &encoded[..8],
            &PNG_MAGIC,
            "output must be PNG-encoded after JPEG input"
        );
    }

    /// Awkward scale factor — with `max=1601, long=4001`, rounding
    /// each axis independently in f32 could land the long axis on
    /// 1602. The clamp after scaling has to bring it back under the
    /// cap.
    #[test]
    fn long_edge_clamped_strictly_to_max_for_irrational_scale() {
        let source = solid_png(4001, 3001);
        let (_encoded, width, height) =
            downscale_to_png(&source, 1601).expect("downscale must succeed");
        let long = std::cmp::max(width, height);
        assert!(long <= 1601, "long edge must be ≤ max, got {long}");
    }

    /// Downscaling keeps the aspect ratio to within 2%.
    #[test]
    fn aspect_ratio_preserved_within_rounding() {
        let source = solid_png(4000, 3000);
        let (_encoded, w, h) =
            downscale_to_png(&source, 1024).expect("downscale must succeed");
        let ratio = w as f32 / h as f32;
        let drift = (ratio - 4.0 / 3.0).abs();
        assert!(
            drift < 0.02,
            "aspect drift: in=4/3 out={}/{}={ratio}",
            w,
            h
        );
    }

    /// Only the PNG signature, nothing after it — the format sniff
    /// recognises PNG but the dimension read cannot succeed. The
    /// helper must report Err so callers can skip the asset instead
    /// of handing a zero-size image downstream.
    #[test]
    fn corrupt_bytes_return_err() {
        let signature_only = PNG_MAGIC.to_vec();
        let outcome = downscale_to_png(&signature_only, 1024);
        assert!(outcome.is_err(), "corrupt PNG must surface as Err");
    }

    /// Bytes that match no image format at all must be rejected too.
    #[test]
    fn unrecognised_bytes_return_err() {
        let outcome = downscale_to_png(b"definitely not an image", 1024);
        assert!(outcome.is_err(), "non-image bytes must surface as Err");
    }
}

View File

@@ -13,14 +13,25 @@
//! consumers can branch trust by engine (Tesseract / Apple Vision
//! adapters, when added, will write a different `engine` string).
//!
//! P6-3 adds the [`caption`] module: [`caption_image`] /
//! [`apply_caption`] route an image through any vision-capable
//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
//! and will surface a model-side error). Captions are explicitly
//! marked **model-generated** — the trust gap between OCR (observed,
//! engine-tagged) and caption (generated, prompt-tagged) is the
//! workspace's central trust contract.
//!
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
//! provenance), §9 (versioning).
mod dims;
mod exif_extract;
mod image_prep;
pub mod caption;
pub mod ocr;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
use anyhow::{Context, Result};

View File

@@ -25,17 +25,17 @@
//! field on [`OcrText`] makes the source explicit, so a caller can
//! decide whether to trust based on which engine produced the text.
use std::io::Cursor;
use std::time::Duration;
use anyhow::{Context, Result};
use base64::Engine as _;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use image::{ImageFormat, ImageReader};
use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::image_prep;
/// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";
@@ -239,7 +239,7 @@ impl OcrEngine for OllamaVisionOcr {
image_bytes: &[u8],
lang_hint: Option<&Lang>,
) -> Result<OcrText> {
let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
.context("preparing image for OCR")?;
let b64 = BASE64_STANDARD.encode(&prepared);
@@ -311,71 +311,6 @@ impl OcrEngine for OllamaVisionOcr {
}
}
// ── Image preparation ─────────────────────────────────────────────────────
/// Prepares `bytes` for the OCR request: guarantees the long edge is
/// at most `max_long_edge` and the encoding is PNG.
///
/// Returns `(png_bytes, final_w, final_h)`.
///
/// Only the header is inspected up front. A PNG that is already within
/// budget is returned as a copy of the input without being decoded.
/// Every other input is decoded exactly once, resized when it is over
/// budget, and then re-encoded as PNG.
///
/// # Errors
///
/// Fails when the header, the dimensions or the pixel data cannot be
/// read, or when PNG encoding fails.
fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
    let header = ImageReader::new(Cursor::new(bytes))
        .with_guessed_format()
        .context("reading image header for OCR")?;
    let is_png = header.format() == Some(ImageFormat::Png);
    let (src_w, src_h) = header
        .into_dimensions()
        .context("reading image dimensions for OCR")?;
    let longest = src_w.max(src_h);
    let within_budget = longest <= max_long_edge;

    // Fast path: the input already is what we would send over the
    // wire, so skip the decode + re-encode round-trip entirely.
    if within_budget && is_png {
        return Ok((bytes.to_vec(), src_w, src_h));
    }

    // From here on the pixels are needed, either for a plain PNG
    // re-encode or for a resize followed by one. Decode once for both.
    let decoded = ImageReader::new(Cursor::new(bytes))
        .with_guessed_format()
        .context("re-reading image for OCR decode")?
        .decode()
        .context("decoding image for OCR")?;

    let (out_w, out_h, out_img) = if within_budget {
        (src_w, src_h, decoded)
    } else {
        let ratio = max_long_edge as f32 / longest as f32;
        // Scale one side, never letting it collapse below one pixel.
        let scaled = |side: u32| ((side as f32) * ratio).round().max(1.0) as u32;
        // Each axis is rounded on its own, so the long one could land a
        // pixel above the cap; clamp it so the bound stays strict.
        let (target_w, target_h) = if src_w >= src_h {
            (scaled(src_w).min(max_long_edge), scaled(src_h))
        } else {
            (scaled(src_w), scaled(src_h).min(max_long_edge))
        };
        let shrunk =
            decoded.resize_exact(target_w, target_h, image::imageops::FilterType::Triangle);
        (target_w, target_h, shrunk)
    };

    let mut encoded = Cursor::new(Vec::new());
    out_img
        .write_to(&mut encoded, ImageFormat::Png)
        .context("encoding image as PNG for OCR")?;
    Ok((encoded.into_inner(), out_w, out_h))
}
fn truncate(s: &str, n: usize) -> String {
if s.chars().count() <= n {
return s.to_string();

View File

@@ -0,0 +1,366 @@
//! Integration tests for the caption adapter (P6-3).
//!
//! Hermetic tests run against in-process fakes: `MockLanguageModel`
//! from `kebab-llm/mock` (replays a canned response), plus the local
//! `CapturingMock` (records the request's `system` prompt and
//! `images`) and `FailingLm` (always returns an error). A single
//! opt-in test (`#[ignore]`) wires the real
//! `kebab-llm-local::OllamaLanguageModel` against the workspace's
//! Ollama daemon to verify the `images: [base64]` round-trip.
mod common;
use std::sync::{Arc, Mutex};
use kebab_config::Config;
use kebab_core::{
AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang,
LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk,
TokenUsage,
};
use kebab_llm::MockLanguageModel;
use kebab_parse_image::{apply_caption, caption_image};
use crate::common::red_100x50_png;
/// Default workspace config with captioning switched on and the
/// caption `max_pixels` cap lowered to 512.
fn cfg_with_caption_enabled() -> Config {
    let mut config = Config::defaults();
    let caption = &mut config.image.caption;
    caption.enabled = true;
    caption.max_pixels = 512;
    config
}
/// Builds an `ImageRefBlock` fixture with placeholder ids, a 100×50
/// region span and neither OCR nor caption attached yet.
fn empty_image_block() -> ImageRefBlock {
    let region = SourceSpan::Region {
        x: 0,
        y: 0,
        w: 100,
        h: 50,
    };
    let common = CommonBlock {
        block_id: BlockId("0".repeat(32)),
        heading_path: vec![],
        source_span: region,
    };
    ImageRefBlock {
        common,
        asset_id: Some(AssetId("a".repeat(32))),
        src: String::from("img/x.png"),
        alt: String::from("x.png"),
        ocr: None,
        caption: None,
    }
}
/// Creates a `MockLanguageModel` that answers with `canned`, finishes
/// with `FinishReason::Stop` and reports all-zero token usage.
fn mk_mock(canned: &str) -> MockLanguageModel {
    let zero_usage = TokenUsage {
        prompt_tokens: 0,
        completion_tokens: 0,
        latency_ms: 0,
    };
    MockLanguageModel {
        model_id: String::from("vision-mock:1b"),
        provider: String::from("mock"),
        context_tokens: 4096,
        canned_response: canned.to_owned(),
        canned_finish: FinishReason::Stop,
        canned_usage: zero_usage,
    }
}
// ── Disabled feature gate ─────────────────────────────────────────────────
/// With `caption.enabled = false`, `apply_caption` must succeed while
/// leaving both the block and the provenance log untouched.
#[test]
fn apply_caption_no_op_when_feature_disabled() {
    let cfg = {
        let mut disabled = Config::defaults();
        disabled.image.caption.enabled = false;
        disabled
    };
    let png = red_100x50_png();
    let lm = mk_mock("ignored");
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    let outcome = apply_caption(&lm, &png, &mut block, None, &cfg, &mut events);
    outcome.expect("disabled apply_caption must return Ok(())");

    assert!(
        block.caption.is_none(),
        "disabled apply_caption must not write caption"
    );
    assert!(
        events.is_empty(),
        "disabled apply_caption must not append a Provenance event"
    );
}
/// `caption_image` is the raw operation and ignores the feature gate,
/// which lives in `apply_caption`. Calling it with the default config
/// (`enabled = false`) still has to yield a caption, so tests can pin
/// the produced shape independently of pipeline gating.
#[test]
fn caption_image_runs_regardless_of_enabled_flag() {
    let default_cfg = Config::defaults(); // enabled = false (default)
    let png = red_100x50_png();
    let lm = mk_mock("hi");

    let produced = caption_image(&lm, &png, None, &default_cfg)
        .expect("caption_image runs even when enabled = false");

    assert_eq!(produced.text, "hi");
}
// ── Happy path ────────────────────────────────────────────────────────────
/// Happy path: an enabled `apply_caption` stores the model's answer on
/// the block and appends exactly one `CaptionApplied` provenance event
/// whose note names both the model and the prompt template.
#[test]
fn apply_caption_sets_block_caption_and_appends_provenance() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let lm = mk_mock("사진 한 장");
    let korean = Lang("ko".to_string());
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    apply_caption(&lm, &png, &mut block, Some(&korean), &cfg, &mut events)
        .expect("apply_caption must succeed");

    // Caption written onto the block.
    let written = block.caption.as_ref().expect("caption Some");
    assert_eq!(written.text, "사진 한 장");
    assert_eq!(written.model, "vision-mock:1b");
    assert_eq!(written.model_version, "mock/caption-v1");

    // Exactly one provenance record describing the captioning step.
    assert_eq!(events.len(), 1);
    let event = &events[0];
    assert_eq!(event.kind, ProvenanceKind::CaptionApplied);
    assert_eq!(event.agent, "kb-parse-image");
    let note = event.note.as_deref().unwrap_or("");
    let names_model = note.contains("vision-mock:1b");
    let names_template = note.contains("caption-v1");
    assert!(names_model && names_template, "{note}");
}
// ── Empty token stream → empty caption text ──────────────────────────────
/// A model that produces no text still yields a caption value: the
/// text is empty, but the model field records that an attempt ran.
#[test]
fn caption_image_empty_stream_yields_empty_text() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let silent_lm = mk_mock("");

    let outcome = caption_image(&silent_lm, &png, None, &cfg);
    let cap = outcome.expect("empty stream must succeed");

    assert_eq!(cap.text, "");
    // Spec contract: "captioning attempted, no result" stays
    // distinguishable from "captioning never attempted" through
    // `caption.is_some()`. An empty text must not erase the attempt.
    assert!(!cap.model.is_empty());
}
// ── Korean vs English prompt selection ───────────────────────────────────
/// `LanguageModel` test double that records what the caption adapter
/// sends to it: the request's `system` prompt and its `images` list.
/// Tests read the shared slots afterwards to verify the language
/// branch picked by `build_prompt` (the function is private; this is
/// the cleanest observable signal) and the image routing.
struct CapturingMock {
/// `system` prompt of the most recent `generate_stream` request.
captured_system: Arc<Mutex<Option<String>>>,
/// `images` of the most recent `generate_stream` request.
captured_images: Arc<Mutex<Vec<String>>>,
}
impl LanguageModel for CapturingMock {
    /// Fixed identity reported by this test double.
    fn model_ref(&self) -> ModelRef {
        let id = String::from("capture:1");
        let provider = String::from("mock");
        ModelRef {
            id,
            provider,
            dimensions: None,
        }
    }

    /// Constant context window size.
    fn context_tokens(&self) -> usize {
        4096
    }

    /// Stores the request's `system` prompt and `images` in the shared
    /// slots, then replies with the single token `"ok"` followed by a
    /// `Done` chunk carrying zero usage.
    fn generate_stream(
        &self,
        req: GenerateRequest,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
        // Each lock is released before the next one is taken.
        {
            let mut system_slot = self.captured_system.lock().unwrap();
            *system_slot = Some(req.system);
        }
        {
            let mut image_slot = self.captured_images.lock().unwrap();
            *image_slot = req.images;
        }

        let zero_usage = TokenUsage {
            prompt_tokens: 0,
            completion_tokens: 0,
            latency_ms: 0,
        };
        let reply: Vec<anyhow::Result<TokenChunk>> = vec![
            Ok(TokenChunk::Token("ok".to_string())),
            Ok(TokenChunk::Done {
                finish_reason: FinishReason::Stop,
                usage: zero_usage,
            }),
        ];
        Ok(Box::new(reply.into_iter()))
    }
}
/// The prepared image must reach the model through the request's
/// `images` field as exactly one decodable base64 string, and a Korean
/// language hint must select the Korean system prompt.
#[test]
fn caption_image_routes_image_into_request_images_field() {
    use base64::Engine as _;

    let cfg = cfg_with_caption_enabled();
    let system_slot = Arc::new(Mutex::new(None::<String>));
    let image_slot = Arc::new(Mutex::new(Vec::<String>::new()));
    let mock = CapturingMock {
        captured_system: Arc::clone(&system_slot),
        captured_images: Arc::clone(&image_slot),
    };
    let png = red_100x50_png();
    let korean = Lang("ko".to_string());

    let _ = caption_image(&mock, &png, Some(&korean), &cfg).expect("caption succeeds");

    // Image routing.
    let imgs = image_slot.lock().unwrap();
    assert_eq!(imgs.len(), 1, "exactly one base64 image routed");
    let decoded = base64::engine::general_purpose::STANDARD
        .decode(&imgs[0])
        .expect("base64 decodes");
    assert!(
        !decoded.is_empty(),
        "decoded image bytes must be non-empty"
    );

    // Prompt language.
    let sys = system_slot.lock().unwrap().clone().unwrap();
    assert!(
        sys.contains("이미지를 한 문장으로"),
        "Korean hint must produce Korean system prompt: {sys}"
    );
}
/// An undetermined language hint (`"und"`) must select the English
/// system prompt.
#[test]
fn caption_image_uses_english_prompt_for_undetermined_lang() {
    let cfg = cfg_with_caption_enabled();
    let system_slot = Arc::new(Mutex::new(None::<String>));
    let mock = CapturingMock {
        captured_system: Arc::clone(&system_slot),
        captured_images: Arc::new(Mutex::new(Vec::new())),
    };
    let png = red_100x50_png();
    let undetermined = Lang("und".to_string());

    let _ = caption_image(&mock, &png, Some(&undetermined), &cfg).expect("caption succeeds");

    let sys = system_slot.lock().unwrap().clone().unwrap();
    assert!(sys.contains("Describe the image"), "{sys}");
}
// ── LM error propagates ──────────────────────────────────────────────────
/// Language model stub whose `generate_stream` fails right away, so
/// no token is ever produced.
struct FailingLm;

impl LanguageModel for FailingLm {
    /// Fixed identity reported by this stub.
    fn model_ref(&self) -> ModelRef {
        let id = String::from("fail");
        let provider = String::from("mock");
        ModelRef {
            id,
            provider,
            dimensions: None,
        }
    }

    /// Reports a zero-sized context window.
    fn context_tokens(&self) -> usize {
        0
    }

    /// Always returns an error instead of a token stream.
    fn generate_stream(
        &self,
        _req: GenerateRequest,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
        let refused = anyhow::anyhow!("simulated LM connection refused");
        Err(refused)
    }
}
/// A failing model must surface as `Err` from `apply_caption` and must
/// leave neither a caption on the block nor a provenance event behind.
#[test]
fn apply_caption_lm_error_leaves_block_untouched() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let broken_lm = FailingLm;
    let mut block = empty_image_block();
    let mut events: Vec<ProvenanceEvent> = Vec::new();

    let r = apply_caption(&broken_lm, &png, &mut block, None, &cfg, &mut events);

    assert!(r.is_err());
    assert!(
        block.caption.is_none(),
        "caption stays None when LM fails — partial state must not leak"
    );
    assert!(events.is_empty(), "no provenance event when LM fails");
}
// ── Determinism — identical mock input → identical caption ───────────────
/// Two independent mocks with the same canned reply, fed the same
/// image and config, must produce equal captions.
#[test]
fn caption_image_deterministic_with_identical_inputs() {
    let cfg = cfg_with_caption_enabled();
    let png = red_100x50_png();
    let fixed_reply = "a deterministic caption";
    let (first_lm, second_lm) = (mk_mock(fixed_reply), mk_mock(fixed_reply));

    let first = caption_image(&first_lm, &png, None, &cfg).unwrap();
    let second = caption_image(&second_lm, &png, None, &cfg).unwrap();

    assert_eq!(first, second);
}
// ── max_pixels clamp ─────────────────────────────────────────────────────
/// An out-of-range `max_pixels` must not reach the model verbatim:
/// `caption_image` is expected to clamp it to
/// `kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE`, so a bad
/// config can't kill ingest. The test decodes the image captured from
/// the request and checks its long edge against that constant.
#[test]
fn caption_image_clamps_oversized_max_pixels() {
    use base64::Engine as _;

    let mut cfg = Config::defaults();
    cfg.image.caption.enabled = true;
    cfg.image.caption.max_pixels = 99_999; // way over MAX_CAPTION_LONG_EDGE
    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
    let mock = CapturingMock {
        captured_system: Arc::new(Mutex::new(None)),
        captured_images: captured_images.clone(),
    };
    // 4000×3000 PNG, larger than the caption cap (1536 per the original
    // note — NOTE(review): confirm against MAX_CAPTION_LONG_EDGE), so an
    // unclamped `max_pixels` would let it through at full size.
    let bytes = common::large_blue_4000x3000_png();
    let _ = caption_image(&mock, &bytes, None, &cfg).expect("caption succeeds");

    let imgs = captured_images
        .lock()
        .expect("captured_images mutex must not be poisoned");
    // Fail with a readable message instead of an index-out-of-bounds
    // panic when the adapter sent no image at all.
    let first = imgs
        .first()
        .expect("caption_image must route a base64 image to the model");
    let decoded = base64::engine::general_purpose::STANDARD
        .decode(first)
        .expect("captured image must be valid base64");
    let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
        .with_guessed_format()
        .expect("captured image header must be readable");
    let (w, h) = reader
        .into_dimensions()
        .expect("captured image must expose its dimensions");
    let long = w.max(h);
    assert!(
        long <= kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE,
        "max_pixels must clamp to MAX_CAPTION_LONG_EDGE={}, got {long}",
        kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE
    );
}
// ── Real Ollama integration (opt-in) ─────────────────────────────────────
/// End-to-end captioning against the workspace's real Ollama daemon
/// via `kebab-llm-local::OllamaLanguageModel` (dev-dep). Skipped by
/// default via `#[ignore]`; opt in with `--ignored`.
///
/// The endpoint and model come from `KEBAB_MODELS_LLM_ENDPOINT` and
/// `KEBAB_MODELS_LLM_MODEL` when set, and fall back to the workspace
/// defaults below otherwise. The model assertion follows whichever
/// model was configured, so an override does not fail the test.
///
/// Run with:
///
/// ```sh
/// KEBAB_MODELS_LLM_ENDPOINT=http://192.168.0.47:11434 \
/// KEBAB_MODELS_LLM_MODEL=gemma4:e4b \
/// cargo test -p kebab-parse-image --test caption \
/// caption_integration -- --ignored --nocapture
/// ```
#[test]
#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"]
fn caption_integration_real_ollama_describes_image() {
    use kebab_llm_local::OllamaLanguageModel;

    let mut cfg = Config::defaults();
    cfg.image.caption.enabled = true;
    cfg.image.caption.max_pixels = 768;
    cfg.models.llm.endpoint = std::env::var("KEBAB_MODELS_LLM_ENDPOINT")
        .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string());
    cfg.models.llm.model = std::env::var("KEBAB_MODELS_LLM_MODEL")
        .unwrap_or_else(|_| "gemma4:e4b".to_string());
    cfg.models.llm.provider = "ollama".to_string();

    let llm = OllamaLanguageModel::new(&cfg).expect("OllamaLanguageModel::new");
    let bytes = red_100x50_png();
    let cap = caption_image(&llm, &bytes, Some(&Lang("en".to_string())), &cfg)
        .expect("real-Ollama caption_image must succeed");
    eprintln!("integration caption: {}", cap.text);

    assert!(!cap.text.is_empty(), "caption must be non-empty");
    // Compare against the model actually configured above; a literal
    // here would break as soon as KEBAB_MODELS_LLM_MODEL is overridden.
    assert_eq!(
        cap.model,
        cfg.models.llm.model.as_str(),
        "caption must report the configured model"
    );
    assert!(cap.model_version.contains("ollama"));
    assert!(cap.model_version.contains("caption-v1"));
}

View File

@@ -195,6 +195,9 @@ impl RagPipeline {
max_tokens: max_completion,
temperature,
seed,
// RAG is text-only — vision inputs only flow when a
// future multimodal pipeline injects images here.
images: Vec::new(),
};
let mut acc = String::new();

View File

@@ -14,6 +14,32 @@ historical contract that was implemented; this file accumulates the
deltas so phase 5+ readers can find the live behavior without diffing
git history.
## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped
**Discovered**: P6-3 implementation start.
**Symptom 1**: `tasks/p6/p6-3-caption-adapter.md` § Public surface declares `caption_image(llm: &dyn kebab_core::LanguageModel, ...)`, but the frozen `LanguageModel` trait + `GenerateRequest` from p4-1 carry no vision input. The spec's behavior contract ("the adapter is responsible for rendering the prompt to wire") implicitly relied on a trait extension that p4-1 never specced.
**Symptom 2**: Spec § Definition of Done asks for `cargo check -p kebab-parse-image --features caption` — i.e. a cargo feature gate. The captioning module's only extra deps are `base64` + `image` + the `kebab-llm` trait, all already pulled in by P6-2. A cargo feature would only complicate the build matrix without saving meaningful binary weight.
**Root cause**: Two small spec gaps that resolve cleanly together — extend the `LanguageModel` trait once for vision routing, and collapse compile-time + runtime gating into a single runtime gate.
**Fix** (PR #34, feat/p6-3-caption-adapter):
- `kebab-core::GenerateRequest` gains an `images: Vec<String>` field (`#[serde(default)]` for backward compat with pre-P6 wire payloads / snapshots). Empty for the text-only RAG path; populated with one or more base64 strings by vision-aware callers.
- `kebab-llm-local::OllamaLanguageModel` routes `req.images` onto the wire as `images: [base64, ...]` (Ollama's vision channel). The wire shape stays byte-identical for empty `images` because the field uses `#[serde(skip_serializing_if = "<[String]>::is_empty")]`.
- `kebab-parse-image::caption` module: `caption_image` / `apply_caption` build `GenerateRequest { images: vec![b64], temperature: 0.0, seed: 0, ... }` and accept any `&dyn LanguageModel`. Korean / English prompt branch picked from `lang_hint`.
- Cargo feature `caption` is **not** introduced — the runtime gate `config.image.caption.enabled = false` (default OFF) suffices.
- All existing `GenerateRequest { ... }` literals (kebab-rag, kebab-llm tests, kebab-llm-local tests) gained `images: Vec::new()` to satisfy the new field.
**Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"<provider>/<prompt_template_version>"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire.
**`model_version` shape deviation**: spec literal says `model_version: llm.model_ref().provider` (provider as a coarse version proxy). We extend to `<provider>/<prompt_template_version>` because prompt template churn is a real regression vector independent of the model — pinning both axes in one string lets `kebab-eval` (P5) detect either drift without a schema bump. Spec already left the door open ("if a vision model exposes a stable revision, prefer that"); the prompt template version is the closest stable revision we have today. Future PaddleOCR / Apple Vision adapters that expose a real model revision string can substitute it for `prompt_template_version` without breaking the wire shape.
**Amends**:
- tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec<String>`).
- tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`).
- tasks/p6/p6-3-caption-adapter.md ("Definition of Done" cargo feature `caption` dropped; runtime gate is the only feature gate).
## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision
**Discovered**: P6-2 implementation start.

View File

@@ -3,7 +3,7 @@ phase: P6
component: kebab-parse-image (caption adapter)
task_id: p6-3
title: "ModelCaption adapter (LanguageModel-driven, feature-gated)"
status: planned
status: completed
depends_on: [p6-1, p4-2]
unblocks: []
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md