2026-05-02 06:22:19 +00:00 · 2026-05-02 06:08:33 +00:00 · 2026-05-02 06:08:33 +00:00 · 2026-05-02 06:15:36 +00:00 · 2026-05-02 06:08:33 +00:00 · 2026-05-02 06:08:33 +00:00
17 changed files with 954 additions and 71 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3576,6 +3576,8 @@ dependencies = [
 "kamadak-exif",
 "kebab-config",
 "kebab-core",
+ "kebab-llm",
+ "kebab-llm-local",
 "reqwest",
 "serde",
 "serde_json",
--- a/crates/kebab-config/src/lib.rs
+++ b/crates/kebab-config/src/lib.rs
@@ -105,18 +105,20 @@ pub struct RagCfg {
 }

 /// Settings for the image ingest pipeline (P6). `ocr` controls OCR
-/// behaviour; future fields (e.g. `caption`) will join here as P6-3
-/// lands.
+/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct ImageCfg {
    #[serde(default = "OcrCfg::defaults")]
    pub ocr: OcrCfg,
+    #[serde(default = "CaptionCfg::defaults")]
+    pub caption: CaptionCfg,
 }

 impl ImageCfg {
    pub fn defaults() -> Self {
        Self {
            ocr: OcrCfg::defaults(),
+            caption: CaptionCfg::defaults(),
        }
    }
 }
@@ -162,6 +164,36 @@ impl OcrCfg {
    }
 }

+/// Caption settings (P6-3). Caption uses the same Ollama-vision /
+/// `LanguageModel` pipeline as the rest of the workspace; the trait
+/// abstraction is the part the spec demands. `enabled` defaults to
+/// `false` because captioning costs one model call per asset and the
+/// output is model-generated (low trust).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct CaptionCfg {
+    /// Run captioning on every image during ingest. Default `false`.
+    pub enabled: bool,
+    /// Cap the long edge of the image (in pixels) before sending. The
+    /// spec recommends an aggressive 768×768 cap because larger
+    /// vision-LM inputs translate directly into prompt cost. Default
+    /// `768`.
+    pub max_pixels: u32,
+    /// Caption prompt template version pinned into wire output via
+    /// `ModelCaption.model_version`. Bump when the prompt changes so
+    /// downstream eval can detect regressions.
+    pub prompt_template_version: String,
+}
+
+impl CaptionCfg {
+    pub fn defaults() -> Self {
+        Self {
+            enabled: false,
+            max_pixels: 768,
+            prompt_template_version: "caption-v1".to_string(),
+        }
+    }
+}
+
 impl Config {
    /// Defaults per design §6.4.
    pub fn defaults() -> Self {
@@ -417,6 +449,19 @@ impl Config {
                    }
                }

+                // image.caption (P6-3)
+                "KEBAB_IMAGE_CAPTION_ENABLED" => {
+                    self.image.caption.enabled = parse_bool(v);
+                }
+                "KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
+                    if let Ok(n) = v.parse::<u32>() {
+                        self.image.caption.max_pixels = n;
+                    }
+                }
+                "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
+                    self.image.caption.prompt_template_version = v.clone();
+                }
+
                // Unknown KEBAB_* keys are silently ignored — see
                // `env_unknown_key_is_ignored` test.
                _ => {}
@@ -608,6 +653,35 @@ mod tests {
    /// Pre-P6 config files don't have an `[image]` section. The
    /// `#[serde(default)]` attribute on `Config::image` must let those
    /// files load with `ImageCfg::defaults()` instead of erroring.
+    #[test]
+    fn image_caption_defaults_disabled() {
+        let c = Config::defaults();
+        assert!(!c.image.caption.enabled);
+        assert_eq!(c.image.caption.max_pixels, 768);
+        assert_eq!(c.image.caption.prompt_template_version, "caption-v1");
+    }
+
+    #[test]
+    fn image_caption_env_overrides() {
+        let mut env = HashMap::new();
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_ENABLED".to_string(),
+            "true".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_MAX_PIXELS".to_string(),
+            "1024".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION".to_string(),
+            "caption-v2".to_string(),
+        );
+        let c = Config::defaults().apply_env(&env);
+        assert!(c.image.caption.enabled);
+        assert_eq!(c.image.caption.max_pixels, 1024);
+        assert_eq!(c.image.caption.prompt_template_version, "caption-v2");
+    }
+
    /// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
    /// rather than to `Some("")` so the fallback to `models.llm.endpoint`
    /// kicks in. Covers the env-equivalent of a missing TOML key.
--- a/crates/kebab-core/src/traits.rs
+++ b/crates/kebab-core/src/traits.rs
@@ -69,6 +69,17 @@ pub struct GenerateRequest {
    pub max_tokens: usize,
    pub temperature: f32,
    pub seed: Option<u64>,
+    /// Vision inputs (base64-encoded, one per image). Empty for the
+    /// text-only path that P4-2 / P4-3 / RAG uses; non-empty when a
+    /// vision-capable adapter (P6-3 caption, future multimodal RAG)
+    /// drives the call. The LM adapter is responsible for routing
+    /// these onto the wire — Ollama uses `images: [base64, ...]`,
+    /// other backends may differ.
+    ///
+    /// Defaulted on deserialization so older `*.json` payloads /
+    /// snapshots that predate the field still parse.
+    #[serde(default)]
+    pub images: Vec<String>,
 }

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
--- a/crates/kebab-llm-local/src/ollama.rs
+++ b/crates/kebab-llm-local/src/ollama.rs
@@ -140,9 +140,15 @@ impl LanguageModel for OllamaLanguageModel {
            format!("{}\n\n{}", req.system, req.user)
        };

+        // Vision inputs (P6-3) flow through the request via Ollama's
+        // `images: [base64, ...]` field. Empty for the text-only RAG
+        // path so older snapshots and JSON dumps stay byte-identical
+        // (the field is `#[serde(default)]` here so it's omitted from
+        // the wire when empty).
        let body = OllamaRequest {
            model: &self.model_id,
            prompt,
+            images: &req.images,
            stream: true,
            options: OllamaOptions {
                temperature: effective_temperature,
@@ -188,6 +194,13 @@ impl LanguageModel for OllamaLanguageModel {
 struct OllamaRequest<'a> {
    model: &'a str,
    prompt: String,
+    /// Skipped from the JSON when empty so the text-only path keeps
+    /// the same on-the-wire shape it had pre-P6-3 (`{"model": ...,
+    /// "prompt": ..., "stream": ..., "options": ...}` — no `images`
+    /// key). Vision-capable callers populate this with one or more
+    /// base64-encoded images.
+    #[serde(skip_serializing_if = "<[String]>::is_empty")]
+    images: &'a [String],
    stream: bool,
    options: OllamaOptions<'a>,
 }
--- a/crates/kebab-llm-local/tests/integration.rs
+++ b/crates/kebab-llm-local/tests/integration.rs
@@ -31,6 +31,7 @@ fn real_ollama_streams_non_empty_response() {
        max_tokens: 8,
        temperature: 0.0,
        seed: Some(0),
+        images: Vec::new(),
    };

    let stream = llm.generate_stream(req).expect("stream should start");
--- a/crates/kebab-llm-local/tests/streaming.rs
+++ b/crates/kebab-llm-local/tests/streaming.rs
@@ -35,6 +35,7 @@ fn sample_request() -> GenerateRequest {
        max_tokens: 64,
        temperature: 0.0,
        seed: Some(0),
+        images: Vec::new(),
    }
 }

--- a/crates/kebab-llm/tests/mock.rs
+++ b/crates/kebab-llm/tests/mock.rs
@@ -26,6 +26,7 @@ fn req_with_stop(stop: Vec<&str>) -> GenerateRequest {
        max_tokens: 64,
        temperature: 0.0,
        seed: None,
+        images: Vec::new(),
    }
 }

--- a/crates/kebab-llm/tests/reexports.rs
+++ b/crates/kebab-llm/tests/reexports.rs
@@ -55,6 +55,7 @@ fn dyn_dispatch_via_box_works() {
        max_tokens: 16,
        temperature: 0.0,
        seed: None,
+        images: Vec::new(),
    };
    let stream = m.generate_stream(req).expect("stream");
    let chunks: Vec<TokenChunk> = stream.map(|r| r.expect("ok chunk")).collect();
--- a/crates/kebab-parse-image/Cargo.toml
+++ b/crates/kebab-parse-image/Cargo.toml
@@ -10,6 +10,12 @@ description   = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipe
 [dependencies]
 kebab-core   = { path = "../kebab-core" }
 kebab-config = { path = "../kebab-config" }
+# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`)
+# under a stable surface; the caption adapter consumes any
+# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` (forbidden
+# by p6-3 design §8) — the trait abstraction is exactly what spec
+# requires.
+kebab-llm    = { path = "../kebab-llm" }
 anyhow       = { workspace = true }
 serde        = { workspace = true }
 serde_json   = { workspace = true }
@@ -42,3 +48,10 @@ tokio        = { workspace = true, features = ["rt-multi-thread"] }
 # font rendering.
 ab_glyph     = "0.2"
 base64       = "0.22"
+# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption
+# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is
+# only allowed at the dev-dep level here — the runtime crate stays
+# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local`
+# at runtime) is preserved.
+kebab-llm        = { path = "../kebab-llm", features = ["mock"] }
+kebab-llm-local  = { path = "../kebab-llm-local" }
--- a/crates/kebab-parse-image/src/caption.rs
+++ b/crates/kebab-parse-image/src/caption.rs
@@ -0,0 +1,236 @@
+//! Caption adapter (P6-3).
+//!
+//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
+//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
+//! helper that mutates an [`ImageRefBlock`] in place and emits a
+//! [`ProvenanceKind::CaptionApplied`] event.
+//!
+//! ## Trust note
+//!
+//! Captions are **model-generated** (`TrustLevel::Generated`), not
+//! observed text. Vision LMs hallucinate; the system prompt explicitly
+//! forbids guessing but expect false captions. Downstream UI / RAG
+//! must label captions as model-generated and surface the model id +
+//! prompt template version (carried in `ModelCaption.model_version`)
+//! so a regression in either is auditable.
+//!
+//! ## Spec deviation (cargo `caption` feature dropped)
+//!
+//! The original P6-3 spec asked for a cargo feature `caption` (default
+//! OFF at compile time). We collapse this into a single runtime gate
+//! (`config.image.caption.enabled = false`, default OFF). Reasoning:
+//! the captioning module's only extra deps are `base64` + `image` +
+//! `kebab-llm` trait — all already pulled in by the rest of the
+//! crate. A cargo feature would only complicate the build matrix
+//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
+//! (2026-05-02) for the deviation log.
+
+use anyhow::{Context, Result};
+use base64::Engine as _;
+use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
+use kebab_core::{
+    FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
+    ProvenanceEvent, ProvenanceKind, TokenChunk,
+};
+use time::OffsetDateTime;
+
+use crate::image_prep;
+
+/// Long-edge clamp range for caption inputs. Smaller than OCR's
+/// `[256, 4096]` because vision LMs charge proportionally to input
+/// dimension — captions tolerate aggressive downscale better than
+/// OCR.
+pub const MIN_CAPTION_LONG_EDGE: u32 = 128;
+pub const MAX_CAPTION_LONG_EDGE: u32 = 1536;
+
+/// Token budget for captions. Captions are one-sentence by spec — 96
+/// tokens covers a 50-word English sentence or a 30-token Korean one
+/// with headroom for the LM's preamble before the stop sequence.
+const CAPTION_MAX_TOKENS: usize = 96;
+
+/// Run a caption pass and return the resulting `ModelCaption`.
+///
+/// Pure raw operation — does **not** consult `config.image.caption.enabled`.
+/// The runtime feature gate lives in [`apply_caption`]; this entry
+/// always invokes the LM. Tests pinning the produced `ModelCaption`
+/// shape can call this directly without flipping the config flag.
+///
+/// Honours the `[MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE]` clamp
+/// on `config.image.caption.max_pixels` so a hostile config cannot
+/// blow up prompt cost.
+pub fn caption_image(
+    llm: &dyn LanguageModel,
+    image_bytes: &[u8],
+    lang_hint: Option<&Lang>,
+    cfg: &kebab_config::Config,
+) -> Result<ModelCaption> {
+    let max_pixels = cfg
+        .image
+        .caption
+        .max_pixels
+        .clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
+    if max_pixels != cfg.image.caption.max_pixels {
+        tracing::warn!(
+            target: "kebab-parse-image",
+            "image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
+            cfg.image.caption.max_pixels,
+            max_pixels,
+            MIN_CAPTION_LONG_EDGE,
+            MAX_CAPTION_LONG_EDGE
+        );
+    }
+
+    let (prepared, _w, _h) = image_prep::downscale_to_png(image_bytes, max_pixels)
+        .context("preparing image for caption")?;
+    let b64 = BASE64_STANDARD.encode(&prepared);
+
+    let lang = lang_hint
+        .map(|l| l.0.as_str())
+        .filter(|s| !s.is_empty() && *s != "und");
+    let (system, user) = build_prompt(lang);
+
+    // Determinism — temperature 0.0 + seed 0, same convention as RAG
+    // and OCR. The LM adapter routes the base64 image via its
+    // provider-specific channel (Ollama: `images: [base64]`).
+    let req = GenerateRequest {
+        system,
+        user,
+        stop: vec!["\n\n".to_string()],
+        max_tokens: CAPTION_MAX_TOKENS,
+        temperature: 0.0,
+        seed: Some(0),
+        images: vec![b64],
+    };
+
+    let stream = llm
+        .generate_stream(req)
+        .context("captioning LM call failed")?;
+
+    let mut text = String::new();
+    let mut saw_done = false;
+    for chunk in stream {
+        match chunk? {
+            TokenChunk::Token(t) => {
+                text.push_str(&t);
+            }
+            TokenChunk::Done { finish_reason, .. } => {
+                saw_done = true;
+                if let FinishReason::Error(e) = finish_reason {
+                    anyhow::bail!("captioning LM ended with error: {e}");
+                }
+                break;
+            }
+        }
+    }
+    if !saw_done {
+        anyhow::bail!("captioning LM stream ended without a Done frame");
+    }
+
+    let caption_text = text.trim().to_string();
+
+    let model_ref = llm.model_ref();
+    let prompt_v = &cfg.image.caption.prompt_template_version;
+    let model_version = format!(
+        "{provider}/{prompt}",
+        provider = model_ref.provider,
+        prompt = prompt_v
+    );
+
+    tracing::debug!(
+        target: "kebab-parse-image",
+        "caption ok (model={}, prompt={}, chars={})",
+        model_ref.id,
+        prompt_v,
+        caption_text.chars().count()
+    );
+
+    Ok(ModelCaption {
+        text: caption_text,
+        model: model_ref.id,
+        model_version,
+    })
+}
+
+/// Pipeline entry point — gate-checks `config.image.caption.enabled`
+/// then mutates `block.caption` in place via [`caption_image`].
+///
+/// When `enabled = false` the function is a clean no-op (returns
+/// `Ok(())` without invoking the LM and without writing a Provenance
+/// event). On LM failure `block.caption` stays `None` — partial state
+/// is never written. The caller decides whether to skip the asset or
+/// surface the error.
+pub fn apply_caption(
+    llm: &dyn LanguageModel,
+    image_bytes: &[u8],
+    block: &mut ImageRefBlock,
+    lang_hint: Option<&Lang>,
+    cfg: &kebab_config::Config,
+    events: &mut Vec<ProvenanceEvent>,
+) -> Result<()> {
+    if !cfg.image.caption.enabled {
+        tracing::debug!(
+            target: "kebab-parse-image",
+            "captioning skipped — image.caption.enabled = false"
+        );
+        return Ok(());
+    }
+    let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
+    // Build the Provenance note BEFORE moving `caption` into
+    // `block.caption` so we sidestep the per-call `String::clone` of
+    // `caption.model` + `caption.model_version`. Tight ingest loops
+    // (thousands of images) save two allocations per asset.
+    let note = format!(
+        "model={} model_version={}",
+        caption.model, caption.model_version
+    );
+    block.caption = Some(caption);
+    events.push(ProvenanceEvent {
+        at: OffsetDateTime::now_utc(),
+        agent: "kb-parse-image".to_string(),
+        kind: ProvenanceKind::CaptionApplied,
+        note: Some(note),
+    });
+    Ok(())
+}
+
+/// Compose the `(system, user)` prompt pair for the caption call.
+/// Korean / English split keeps the model on the requested output
+/// language; everything else falls through to English.
+fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
+    match lang_hint {
+        Some("ko") | Some("kor") => (
+            "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
+             보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
+             한 문장만 출력."
+                .to_string(),
+            "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(),
+        ),
+        _ => (
+            "Describe the image in one objective sentence. Do not \
+             speculate; describe only what is visible. No markdown, \
+             no quotes, no commentary — output a single sentence."
+                .to_string(),
+            "Describe the image above in one English sentence.".to_string(),
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn build_prompt_korean_for_ko_hint() {
+        let (sys, user) = build_prompt(Some("ko"));
+        assert!(sys.contains("이미지를 한 문장으로"));
+        assert!(user.contains("한국어로"));
+    }
+
+    #[test]
+    fn build_prompt_english_for_no_hint_or_und() {
+        let (sys, _) = build_prompt(None);
+        assert!(sys.contains("Describe the image"));
+        let (sys2, _) = build_prompt(Some("en"));
+        assert!(sys2.contains("Describe the image"));
+    }
+}
--- a/crates/kebab-parse-image/src/image_prep.rs
+++ b/crates/kebab-parse-image/src/image_prep.rs
@@ -0,0 +1,189 @@
+//! Shared image preparation for any image-to-LM pipeline.
+//!
+//! P6-2 OCR and P6-3 caption both need the same pre-LM step: clamp
+//! the long edge to a configured max, re-encode as PNG (the wire
+//! format vision channels expect — Ollama's `images: [base64, ...]`
+//! takes PNG/JPEG, but PNG keeps the alpha + lossless invariant we
+//! prefer for hand-drawn / screenshot inputs), pass through the
+//! source bytes when they already satisfy both constraints.
+//! Centralising this here keeps the 1px-rounding fix, the PNG
+//! passthrough hot path, and the error messages in one place —
+//! future image-to-LM channels (PDF page thumbnails, video
+//! keyframes, …) plug in without re-deriving the algorithm.
+
+use std::io::Cursor;
+
+use anyhow::{Context, Result};
+use image::{ImageFormat, ImageReader};
+
+/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
+/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)` so
+/// callers that care about the final dimensions (e.g. OCR's
+/// `SourceSpan::Region`) get them without re-decoding.
+///
+/// PNG sources that already fit the cap pass through (zero decodes,
+/// just a `Vec` clone). Every other path decodes the image exactly
+/// once: a cheap header sniff peeks at the format / dimensions before
+/// committing to a decode, so non-PNG passthrough and downscale share
+/// the same `decode → optionally resize → re-encode` tail.
+pub(crate) fn downscale_to_png(
+    bytes: &[u8],
+    max_long_edge: u32,
+) -> Result<(Vec<u8>, u32, u32)> {
+    let reader = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("reading image header")?;
+    let format = reader.format();
+    let (w, h) = reader
+        .into_dimensions()
+        .context("reading image dimensions")?;
+
+    let long = w.max(h);
+
+    // Hot path — PNG within budget already matches the wire format we
+    // send to vision models, so we ship the bytes verbatim without
+    // paying for a decode + re-encode round-trip.
+    if long <= max_long_edge && format == Some(ImageFormat::Png) {
+        return Ok((bytes.to_vec(), w, h));
+    }
+
+    // Every remaining branch needs the pixels — either to re-encode as
+    // PNG (non-PNG within budget) or to resize first (over budget).
+    // One decode covers both.
+    let img = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("re-reading image for decode")?
+        .decode()
+        .context("decoding image")?;
+
+    let (final_w, final_h, final_img) = if long <= max_long_edge {
+        (w, h, img)
+    } else {
+        let scale = max_long_edge as f32 / long as f32;
+        let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
+        let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
+        // Independent rounding of the two axes can let `f32`'s
+        // round-to-nearest push the long axis one pixel past
+        // `max_long_edge` for irrational scales (e.g. `max=1601,
+        // long=4001`). Pin the long axis to exactly `max_long_edge`
+        // so the doc-comment's "long edge is at most max_long_edge"
+        // stays a strict bound.
+        if w >= h {
+            new_w = new_w.min(max_long_edge);
+        } else {
+            new_h = new_h.min(max_long_edge);
+        }
+        let resized =
+            img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
+        (new_w, new_h, resized)
+    };
+
+    let mut out = Cursor::new(Vec::new());
+    final_img
+        .write_to(&mut out, ImageFormat::Png)
+        .context("encoding image as PNG")?;
+    Ok((out.into_inner(), final_w, final_h))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::io::Cursor;
+
+    use image::{ImageBuffer, Rgb};
+
+    /// Solid-colour PNG of the given dimensions. Solid colour
+    /// compresses aggressively so even 4001×3001 stays under a few
+    /// kilobytes.
+    fn solid_png(w: u32, h: u32) -> Vec<u8> {
+        let img: ImageBuffer<Rgb<u8>, _> =
+            ImageBuffer::from_pixel(w, h, Rgb([0, 0, 255]));
+        let mut buf = Cursor::new(Vec::new());
+        img.write_to(&mut buf, ImageFormat::Png)
+            .expect("encoding solid PNG must not fail");
+        buf.into_inner()
+    }
+
+    fn solid_jpeg(w: u32, h: u32) -> Vec<u8> {
+        let img: ImageBuffer<Rgb<u8>, _> =
+            ImageBuffer::from_pixel(w, h, Rgb([255, 255, 255]));
+        let mut buf = Cursor::new(Vec::new());
+        img.write_to(&mut buf, ImageFormat::Jpeg)
+            .expect("encoding solid JPEG must not fail");
+        buf.into_inner()
+    }
+
+    /// PNG within budget skips the decode + re-encode round-trip
+    /// entirely. Source bytes survive byte-for-byte.
+    #[test]
+    fn png_within_cap_passes_through_zero_decode() {
+        let bytes = solid_png(100, 50);
+        let (out, w, h) =
+            downscale_to_png(&bytes, 1024).expect("PNG passthrough must succeed");
+        assert_eq!((w, h), (100, 50));
+        assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim");
+    }
+
+    /// JPEG within budget gets re-encoded as PNG (the wire format)
+    /// while preserving dimensions.
+    #[test]
+    fn jpeg_within_cap_reencodes_as_png() {
+        let bytes = solid_jpeg(100, 50);
+        let (out, w, h) =
+            downscale_to_png(&bytes, 1024).expect("JPEG re-encode must succeed");
+        assert_eq!((w, h), (100, 50));
+        // Byte stream must now start with the PNG magic.
+        assert_eq!(
+            &out[..8],
+            &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
+            "output must be PNG-encoded after JPEG input"
+        );
+    }
+
+    /// Pathological irrational scale — `max=1601, long=4001` would let
+    /// independent f32 round-to-nearest push the long axis to 1602.
+    /// The post-resize clamp pins it back to `max_long_edge`.
+    #[test]
+    fn long_edge_clamped_strictly_to_max_for_irrational_scale() {
+        let bytes = solid_png(4001, 3001);
+        let (_out, w, h) =
+            downscale_to_png(&bytes, 1601).expect("downscale must succeed");
+        let long = w.max(h);
+        assert!(long <= 1601, "long edge must be ≤ max, got {long}");
+    }
+
+    /// Aspect ratio survives the downscale within 2%.
+    #[test]
+    fn aspect_ratio_preserved_within_rounding() {
+        let bytes = solid_png(4000, 3000);
+        let (_out, w, h) =
+            downscale_to_png(&bytes, 1024).expect("downscale must succeed");
+        let ratio = w as f32 / h as f32;
+        assert!(
+            (ratio - 4.0 / 3.0).abs() < 0.02,
+            "aspect drift: in=4/3 out={}/{}={ratio}",
+            w,
+            h
+        );
+    }
+
+    /// Truncated PNG header — format guess succeeds (8-byte signature
+    /// intact) but `into_dimensions` fails. Surfaced as Err so
+    /// callers can route to "skip + warning" without confusing the
+    /// downstream pipeline with a zero-size image.
+    #[test]
+    fn corrupt_bytes_return_err() {
+        let truncated = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
+        let r = downscale_to_png(&truncated, 1024);
+        assert!(r.is_err(), "corrupt PNG must surface as Err");
+    }
+
+    /// Unrecognised bytes (not any image format) — header sniff fails
+    /// before dimension read.
+    #[test]
+    fn unrecognised_bytes_return_err() {
+        let r = downscale_to_png(b"definitely not an image", 1024);
+        assert!(r.is_err(), "non-image bytes must surface as Err");
+    }
+}
--- a/crates/kebab-parse-image/src/lib.rs
+++ b/crates/kebab-parse-image/src/lib.rs
@@ -13,14 +13,25 @@
 //! consumers can branch trust by engine (Tesseract / Apple Vision
 //! adapters, when added, will write a different `engine` string).
 //!
+//! P6-3 adds the [`caption`] module: [`caption_image`] /
+//! [`apply_caption`] route an image through any vision-capable
+//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
+//! and will surface a model-side error). Captions are explicitly
+//! marked **model-generated** — the trust gap between OCR (observed,
+//! engine-tagged) and caption (generated, prompt-tagged) is the
+//! workspace's central trust contract.
+//!
 //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
 //! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
 //! provenance), §9 (versioning).

 mod dims;
 mod exif_extract;
+mod image_prep;
+pub mod caption;
 pub mod ocr;

+pub use caption::{apply_caption, caption_image};
 pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};

 use anyhow::{Context, Result};
--- a/crates/kebab-parse-image/src/ocr.rs
+++ b/crates/kebab-parse-image/src/ocr.rs
@@ -25,17 +25,17 @@
 //! field on [`OcrText`] makes the source explicit, so a caller can
 //! decide whether to trust based on which engine produced the text.

-use std::io::Cursor;
 use std::time::Duration;

 use anyhow::{Context, Result};
 use base64::Engine as _;
 use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
-use image::{ImageFormat, ImageReader};
 use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;

+use crate::image_prep;
+
 /// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
 pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";

@@ -239,7 +239,7 @@ impl OcrEngine for OllamaVisionOcr {
        image_bytes: &[u8],
        lang_hint: Option<&Lang>,
    ) -> Result<OcrText> {
-        let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
+        let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
            .context("preparing image for OCR")?;
        let b64 = BASE64_STANDARD.encode(&prepared);

@@ -311,71 +311,6 @@ impl OcrEngine for OllamaVisionOcr {
    }
 }

-// ── Image preparation ─────────────────────────────────────────────────────
-
-/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
-/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`.
-///
-/// PNG sources that already fit the cap are passthrough (zero decodes,
-/// just a `Vec` clone). Every other path decodes the image exactly
-/// once: the cheap header sniff peeks at the format / dimensions before
-/// committing to a decode, so non-PNG passthrough and downscale share
-/// the same `decode → optionally resize → re-encode` tail.
-fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
-    let reader = ImageReader::new(Cursor::new(bytes))
-        .with_guessed_format()
-        .context("reading image header for OCR")?;
-    let format = reader.format();
-    let (w, h) = reader
-        .into_dimensions()
-        .context("reading image dimensions for OCR")?;
-
-    let long = w.max(h);
-
-    // Hot path — PNG within budget already matches the wire format we
-    // send Ollama, so we ship the bytes verbatim without paying for a
-    // decode + re-encode round-trip.
-    if long <= max_long_edge && format == Some(ImageFormat::Png) {
-        return Ok((bytes.to_vec(), w, h));
-    }
-
-    // Every remaining branch needs the pixels — either to re-encode as
-    // PNG (non-PNG within budget) or to resize first (over budget).
-    // One decode covers both.
-    let img = ImageReader::new(Cursor::new(bytes))
-        .with_guessed_format()
-        .context("re-reading image for OCR decode")?
-        .decode()
-        .context("decoding image for OCR")?;
-
-    let (final_w, final_h, final_img) = if long <= max_long_edge {
-        (w, h, img)
-    } else {
-        let scale = max_long_edge as f32 / long as f32;
-        let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
-        let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
-        // Independent rounding of the two axes can let `f32`'s nearest
-        // round push the long axis one pixel past `max_long_edge` for
-        // irrational scales (e.g. `max=1601, long=4001`). Pin the long
-        // axis to exactly `max_long_edge` so the doc-comment's
-        // "long edge is at most max_long_edge" stays a strict bound.
-        if w >= h {
-            new_w = new_w.min(max_long_edge);
-        } else {
-            new_h = new_h.min(max_long_edge);
-        }
-        let resized =
-            img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
-        (new_w, new_h, resized)
-    };
-
-    let mut out = Cursor::new(Vec::new());
-    final_img
-        .write_to(&mut out, ImageFormat::Png)
-        .context("encoding image as PNG for OCR")?;
-    Ok((out.into_inner(), final_w, final_h))
-}
-
 fn truncate(s: &str, n: usize) -> String {
    if s.chars().count() <= n {
        return s.to_string();
--- a/crates/kebab-parse-image/tests/caption.rs
+++ b/crates/kebab-parse-image/tests/caption.rs
@@ -0,0 +1,366 @@
+//! Integration tests for the caption adapter (P6-3).
+//!
+//! All hermetic tests use `MockLanguageModel` from `kebab-llm/mock`
+//! which captures `req.images` indirectly via the canned response. A
+//! single opt-in test (`#[ignore]`) wires the real
+//! `kebab-llm-local::OllamaLanguageModel` against the workspace's
+//! Ollama daemon to verify the `images: [base64]` round-trip.
+
+mod common;
+
+use std::sync::{Arc, Mutex};
+
+use kebab_config::Config;
+use kebab_core::{
+    AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang,
+    LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk,
+    TokenUsage,
+};
+use kebab_llm::MockLanguageModel;
+use kebab_parse_image::{apply_caption, caption_image};
+
+use crate::common::red_100x50_png;
+
+fn cfg_with_caption_enabled() -> Config {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 512;
+    cfg
+}
+
+fn empty_image_block() -> ImageRefBlock {
+    ImageRefBlock {
+        common: CommonBlock {
+            block_id: BlockId("0".repeat(32)),
+            heading_path: Vec::new(),
+            source_span: SourceSpan::Region {
+                x: 0,
+                y: 0,
+                w: 100,
+                h: 50,
+            },
+        },
+        asset_id: Some(AssetId("a".repeat(32))),
+        src: "img/x.png".to_string(),
+        alt: "x.png".to_string(),
+        ocr: None,
+        caption: None,
+    }
+}
+
+fn mk_mock(canned: &str) -> MockLanguageModel {
+    MockLanguageModel {
+        model_id: "vision-mock:1b".to_string(),
+        provider: "mock".to_string(),
+        context_tokens: 4096,
+        canned_response: canned.to_string(),
+        canned_finish: FinishReason::Stop,
+        canned_usage: TokenUsage {
+            prompt_tokens: 0,
+            completion_tokens: 0,
+            latency_ms: 0,
+        },
+    }
+}
+
+// ── Disabled feature gate ─────────────────────────────────────────────────
+
+#[test]
+fn apply_caption_no_op_when_feature_disabled() {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = false;
+    let mock = mk_mock("ignored");
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    apply_caption(&mock, &bytes, &mut block, None, &cfg, &mut events)
+        .expect("disabled apply_caption must return Ok(())");
+    assert!(
+        block.caption.is_none(),
+        "disabled apply_caption must not write caption"
+    );
+    assert!(
+        events.is_empty(),
+        "disabled apply_caption must not append a Provenance event"
+    );
+}
+
+#[test]
+fn caption_image_runs_regardless_of_enabled_flag() {
+    // Feature gate lives in `apply_caption`; `caption_image` is the
+    // raw operation. Calling it directly with enabled = false must
+    // still produce a `ModelCaption` so tests can pin the produced
+    // shape independent of pipeline gating.
+    let cfg = Config::defaults(); // enabled = false (default)
+    let mock = mk_mock("hi");
+    let bytes = red_100x50_png();
+    let cap = caption_image(&mock, &bytes, None, &cfg)
+        .expect("caption_image runs even when enabled = false");
+    assert_eq!(cap.text, "hi");
+}
+
+// ── Happy path ────────────────────────────────────────────────────────────
+
+#[test]
+fn apply_caption_sets_block_caption_and_appends_provenance() {
+    let cfg = cfg_with_caption_enabled();
+    let mock = mk_mock("사진 한 장");
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    apply_caption(
+        &mock,
+        &bytes,
+        &mut block,
+        Some(&Lang("ko".to_string())),
+        &cfg,
+        &mut events,
+    )
+    .expect("apply_caption must succeed");
+
+    let cap = block.caption.as_ref().expect("caption Some");
+    assert_eq!(cap.text, "사진 한 장");
+    assert_eq!(cap.model, "vision-mock:1b");
+    assert_eq!(cap.model_version, "mock/caption-v1");
+
+    assert_eq!(events.len(), 1);
+    assert_eq!(events[0].kind, ProvenanceKind::CaptionApplied);
+    assert_eq!(events[0].agent, "kb-parse-image");
+    let note = events[0].note.as_deref().unwrap_or("");
+    assert!(note.contains("vision-mock:1b") && note.contains("caption-v1"), "{note}");
+}
+
+// ── Empty token stream → empty caption text ──────────────────────────────
+
+#[test]
+fn caption_image_empty_stream_yields_empty_text() {
+    let cfg = cfg_with_caption_enabled();
+    let mock = mk_mock("");
+    let bytes = red_100x50_png();
+    let cap = caption_image(&mock, &bytes, None, &cfg).expect("empty stream must succeed");
+    assert_eq!(cap.text, "");
+    // Spec contract: caller can distinguish "captioning attempted, no
+    // result" from "captioning never attempted" by `caption.is_some()`.
+    // The text being empty does not erase the attempt.
+    assert!(!cap.model.is_empty());
+}
+
+// ── Korean vs English prompt selection ───────────────────────────────────
+
+/// `LanguageModel` impl that captures the `system` prompt sent to it
+/// so tests can verify the language branch picked by `build_prompt`
+/// (the function is private; this is the cleanest observable signal).
+struct CapturingMock {
+    captured_system: Arc<Mutex<Option<String>>>,
+    captured_images: Arc<Mutex<Vec<String>>>,
+}
+
+impl LanguageModel for CapturingMock {
+    fn model_ref(&self) -> ModelRef {
+        ModelRef {
+            id: "capture:1".to_string(),
+            provider: "mock".to_string(),
+            dimensions: None,
+        }
+    }
+    fn context_tokens(&self) -> usize {
+        4096
+    }
+    fn generate_stream(
+        &self,
+        req: GenerateRequest,
+    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
+        *self.captured_system.lock().unwrap() = Some(req.system);
+        *self.captured_images.lock().unwrap() = req.images;
+        let chunks: Vec<TokenChunk> = vec![
+            TokenChunk::Token("ok".to_string()),
+            TokenChunk::Done {
+                finish_reason: FinishReason::Stop,
+                usage: TokenUsage {
+                    prompt_tokens: 0,
+                    completion_tokens: 0,
+                    latency_ms: 0,
+                },
+            },
+        ];
+        Ok(Box::new(chunks.into_iter().map(Ok)))
+    }
+}
+
+#[test]
+fn caption_image_routes_image_into_request_images_field() {
+    let cfg = cfg_with_caption_enabled();
+    let captured_system: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+    let mock = CapturingMock {
+        captured_system: captured_system.clone(),
+        captured_images: captured_images.clone(),
+    };
+    let bytes = red_100x50_png();
+    let _ = caption_image(&mock, &bytes, Some(&Lang("ko".to_string())), &cfg)
+        .expect("caption succeeds");
+
+    let imgs = captured_images.lock().unwrap();
+    assert_eq!(imgs.len(), 1, "exactly one base64 image routed");
+    use base64::Engine as _;
+    let decoded = base64::engine::general_purpose::STANDARD
+        .decode(&imgs[0])
+        .expect("base64 decodes");
+    assert!(
+        !decoded.is_empty(),
+        "decoded image bytes must be non-empty"
+    );
+
+    let sys = captured_system.lock().unwrap().clone().unwrap();
+    assert!(
+        sys.contains("이미지를 한 문장으로"),
+        "Korean hint must produce Korean system prompt: {sys}"
+    );
+}
+
+#[test]
+fn caption_image_uses_english_prompt_for_undetermined_lang() {
+    let cfg = cfg_with_caption_enabled();
+    let captured_system: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+    let mock = CapturingMock {
+        captured_system: captured_system.clone(),
+        captured_images: Arc::new(Mutex::new(Vec::new())),
+    };
+    let bytes = red_100x50_png();
+    let _ = caption_image(&mock, &bytes, Some(&Lang("und".to_string())), &cfg)
+        .expect("caption succeeds");
+    let sys = captured_system.lock().unwrap().clone().unwrap();
+    assert!(sys.contains("Describe the image"), "{sys}");
+}
+
+// ── LM error propagates ──────────────────────────────────────────────────
+
+/// LM that returns Err immediately from `generate_stream` (before any
+/// token).
+struct FailingLm;
+impl LanguageModel for FailingLm {
+    fn model_ref(&self) -> ModelRef {
+        ModelRef {
+            id: "fail".into(),
+            provider: "mock".into(),
+            dimensions: None,
+        }
+    }
+    fn context_tokens(&self) -> usize {
+        0
+    }
+    fn generate_stream(
+        &self,
+        _req: GenerateRequest,
+    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
+        Err(anyhow::anyhow!("simulated LM connection refused"))
+    }
+}
+
+#[test]
+fn apply_caption_lm_error_leaves_block_untouched() {
+    let cfg = cfg_with_caption_enabled();
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    let r = apply_caption(&FailingLm, &bytes, &mut block, None, &cfg, &mut events);
+    assert!(r.is_err());
+    assert!(
+        block.caption.is_none(),
+        "caption stays None when LM fails — partial state must not leak"
+    );
+    assert!(events.is_empty(), "no provenance event when LM fails");
+}
+
+// ── Determinism — identical mock input → identical caption ───────────────
+
+#[test]
+fn caption_image_deterministic_with_identical_inputs() {
+    let cfg = cfg_with_caption_enabled();
+    let bytes = red_100x50_png();
+    let mock1 = mk_mock("a deterministic caption");
+    let mock2 = mk_mock("a deterministic caption");
+    let cap1 = caption_image(&mock1, &bytes, None, &cfg).unwrap();
+    let cap2 = caption_image(&mock2, &bytes, None, &cfg).unwrap();
+    assert_eq!(cap1, cap2);
+}
+
+// ── max_pixels clamp ─────────────────────────────────────────────────────
+
+/// Out-of-range `max_pixels` is silently clamped at construction so a
+/// bad config can't kill ingest. The captured `images` field's
+/// decoded long edge confirms the clamp engaged.
+#[test]
+fn caption_image_clamps_oversized_max_pixels() {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 99_999; // way over MAX_CAPTION_LONG_EDGE
+    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+    let mock = CapturingMock {
+        captured_system: Arc::new(Mutex::new(None)),
+        captured_images: captured_images.clone(),
+    };
+    // 4000×3000 PNG well above the 1536 cap.
+    let bytes = common::large_blue_4000x3000_png();
+    let _ = caption_image(&mock, &bytes, None, &cfg).expect("caption succeeds");
+    let imgs = captured_images.lock().unwrap();
+    use base64::Engine as _;
+    let decoded = base64::engine::general_purpose::STANDARD
+        .decode(&imgs[0])
+        .unwrap();
+    let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
+        .with_guessed_format()
+        .unwrap();
+    let (w, h) = reader.into_dimensions().unwrap();
+    let long = w.max(h);
+    assert!(
+        long <= kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE,
+        "max_pixels must clamp to MAX_CAPTION_LONG_EDGE={}, got {long}",
+        kebab_parse_image::caption::MAX_CAPTION_LONG_EDGE
+    );
+}
+
+// ── Real Ollama integration (opt-in) ─────────────────────────────────────
+
+/// End-to-end captioning against the workspace's real Ollama daemon
+/// via `kebab-llm-local::OllamaLanguageModel` (dev-dep). Skipped by
+/// default via `#[ignore]`; opt in with `--ignored`.
+///
+/// Run with:
+///
+/// ```sh
+/// KEBAB_MODELS_LLM_ENDPOINT=http://192.168.0.47:11434 \
+/// KEBAB_MODELS_LLM_MODEL=gemma4:e4b \
+/// cargo test -p kebab-parse-image --test caption \
+///   caption_integration -- --ignored --nocapture
+/// ```
+#[test]
+#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"]
+fn caption_integration_real_ollama_describes_image() {
+    use kebab_llm_local::OllamaLanguageModel;
+
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 768;
+    if let Ok(ep) = std::env::var("KEBAB_MODELS_LLM_ENDPOINT") {
+        cfg.models.llm.endpoint = ep;
+    } else {
+        cfg.models.llm.endpoint = "http://192.168.0.47:11434".to_string();
+    }
+    if let Ok(m) = std::env::var("KEBAB_MODELS_LLM_MODEL") {
+        cfg.models.llm.model = m;
+    } else {
+        cfg.models.llm.model = "gemma4:e4b".to_string();
+    }
+    cfg.models.llm.provider = "ollama".to_string();
+
+    let llm = OllamaLanguageModel::new(&cfg).expect("OllamaLanguageModel::new");
+    let bytes = red_100x50_png();
+    let cap = caption_image(&llm, &bytes, Some(&Lang("en".to_string())), &cfg)
+        .expect("real-Ollama caption_image must succeed");
+    eprintln!("integration caption: {}", cap.text);
+    assert!(!cap.text.is_empty(), "caption must be non-empty");
+    assert_eq!(cap.model, "gemma4:e4b");
+    assert!(cap.model_version.contains("ollama"));
+    assert!(cap.model_version.contains("caption-v1"));
+}
--- a/crates/kebab-rag/src/pipeline.rs
+++ b/crates/kebab-rag/src/pipeline.rs
@@ -195,6 +195,9 @@ impl RagPipeline {
            max_tokens: max_completion,
            temperature,
            seed,
+            // RAG is text-only — vision inputs only flow when a
+            // future multimodal pipeline injects images here.
+            images: Vec::new(),
        };

        let mut acc = String::new();
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -14,6 +14,32 @@ historical contract that was implemented; this file accumulates the
 deltas so phase 5+ readers can find the live behavior without diffing
 git history.

+## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped
+
+**Discovered**: P6-3 implementation start.
+
+**Symptom 1**: `tasks/p6/p6-3-caption-adapter.md` § Public surface declares `caption_image(llm: &dyn kebab_core::LanguageModel, ...)`, but the frozen `LanguageModel` trait + `GenerateRequest` from p4-1 carry no vision input. The spec's behavior contract ("the adapter is responsible for rendering the prompt to wire") implicitly relied on a trait extension that p4-1 never specced.
+
+**Symptom 2**: Spec § Definition of Done asks for `cargo check -p kebab-parse-image --features caption` — i.e. a cargo feature gate. The captioning module's only extra deps are `base64` + `image` + the `kebab-llm` trait, all already pulled in by P6-2. A cargo feature would only complicate the build matrix without saving meaningful binary weight.
+
+**Root cause**: Two small spec gaps that resolve cleanly together — extend the `LanguageModel` trait once for vision routing, and collapse compile-time + runtime gating into a single runtime gate.
+
+**Fix** (PR #34, feat/p6-3-caption-adapter):
+- `kebab-core::GenerateRequest` gains an `images: Vec<String>` field (`#[serde(default)]` for backward compat with pre-P6 wire payloads / snapshots). Empty for the text-only RAG path; populated with one or more base64 strings by vision-aware callers.
+- `kebab-llm-local::OllamaLanguageModel` routes `req.images` onto the wire as `images: [base64, ...]` (Ollama's vision channel). The wire shape stays byte-identical for empty `images` because the field uses `#[serde(skip_serializing_if = "<[String]>::is_empty")]`.
+- `kebab-parse-image::caption` module: `caption_image` / `apply_caption` build `GenerateRequest { images: vec![b64], temperature: 0.0, seed: 0, ... }` and accept any `&dyn LanguageModel`. Korean / English prompt branch picked from `lang_hint`.
+- Cargo feature `caption` is **not** introduced — the runtime gate `config.image.caption.enabled = false` (default OFF) suffices.
+- All existing `GenerateRequest { ... }` literals (kebab-rag, kebab-llm tests, kebab-llm-local tests) gained `images: Vec::new()` to satisfy the new field.
+
+**Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"<provider>/<prompt_template_version>"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire.
+
+**`model_version` shape deviation**: spec literal says `model_version: llm.model_ref().provider` (provider as a coarse version proxy). We extend to `<provider>/<prompt_template_version>` because prompt template churn is a real regression vector independent of the model — pinning both axes in one string lets `kebab-eval` (P5) detect either drift without a schema bump. Spec already left the door open ("if a vision model exposes a stable revision, prefer that"); the prompt template version is the closest stable revision we have today. Future PaddleOCR / Apple Vision adapters that expose a real model revision string can substitute it for `prompt_template_version` without breaking the wire shape.
+
+**Amends**:
+- tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec<String>`).
+- tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`).
+- tasks/p6/p6-3-caption-adapter.md ("Definition of Done" cargo feature `caption` dropped; runtime gate is the only feature gate).
+
 ## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision

 **Discovered**: P6-2 implementation start.
--- a/tasks/p6/p6-3-caption-adapter.md
+++ b/tasks/p6/p6-3-caption-adapter.md
@@ -3,7 +3,7 @@ phase: P6
 component: kebab-parse-image (caption adapter)
 task_id: p6-3
 title: "ModelCaption adapter (LanguageModel-driven, feature-gated)"
-status: planned
+status: completed
 depends_on: [p6-1, p4-2]
 unblocks: []
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md