diff --git a/Cargo.lock b/Cargo.lock
index f595476..0c9ee2a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3576,6 +3576,8 @@ dependencies = [
  "kamadak-exif",
  "kebab-config",
  "kebab-core",
+ "kebab-llm",
+ "kebab-llm-local",
  "reqwest",
  "serde",
  "serde_json",
diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs
index f9a737c..898fd67 100644
--- a/crates/kebab-config/src/lib.rs
+++ b/crates/kebab-config/src/lib.rs
@@ -105,18 +105,20 @@ pub struct RagCfg {
 }
 
 /// Settings for the image ingest pipeline (P6). `ocr` controls OCR
-/// behaviour; future fields (e.g. `caption`) will join here as P6-3
-/// lands.
+/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct ImageCfg {
     #[serde(default = "OcrCfg::defaults")]
     pub ocr: OcrCfg,
+    #[serde(default = "CaptionCfg::defaults")]
+    pub caption: CaptionCfg,
 }
 
 impl ImageCfg {
     pub fn defaults() -> Self {
         Self {
             ocr: OcrCfg::defaults(),
+            caption: CaptionCfg::defaults(),
         }
     }
 }
@@ -162,6 +164,36 @@ impl OcrCfg {
     }
 }
 
+/// Caption settings (P6-3). Caption uses the same Ollama-vision /
+/// `LanguageModel` pipeline as the rest of the workspace; the trait
+/// abstraction is the part the spec demands. `enabled` defaults to
+/// `false` because captioning costs one model call per asset and the
+/// output is model-generated (low trust).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct CaptionCfg {
+    /// Run captioning on every image during ingest. Default `false`.
+    pub enabled: bool,
+    /// Cap the long edge of the image (in pixels) before sending. The
+    /// spec recommends an aggressive 768×768 cap because larger
+    /// vision-LM inputs translate directly into prompt cost. Default
+    /// `768`.
+    pub max_pixels: u32,
+    /// Caption prompt template version pinned into wire output via
+    /// `ModelCaption.model_version`. Bump when the prompt changes so
+    /// downstream eval can detect regressions.
+    pub prompt_template_version: String,
+}
+
+impl CaptionCfg {
+    pub fn defaults() -> Self {
+        Self {
+            enabled: false,
+            max_pixels: 768,
+            prompt_template_version: "caption-v1".to_string(),
+        }
+    }
+}
+
 impl Config {
     /// Defaults per design §6.4.
     pub fn defaults() -> Self {
@@ -417,6 +449,19 @@ impl Config {
                     }
                 }
 
+                // image.caption (P6-3)
+                "KEBAB_IMAGE_CAPTION_ENABLED" => {
+                    self.image.caption.enabled = parse_bool(v);
+                }
+                "KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
+                    if let Ok(n) = v.parse::<u32>() {
+                        self.image.caption.max_pixels = n;
+                    }
+                }
+                "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
+                    self.image.caption.prompt_template_version = v.clone();
+                }
+
                 // Unknown KEBAB_* keys are silently ignored — see
                 // `env_unknown_key_is_ignored` test.
                 _ => {}
@@ -608,6 +653,35 @@ mod tests {
     /// Pre-P6 config files don't have an `[image]` section. The
     /// `#[serde(default)]` attribute on `Config::image` must let those
     /// files load with `ImageCfg::defaults()` instead of erroring.
+    #[test]
+    fn image_caption_defaults_disabled() {
+        let c = Config::defaults();
+        assert!(!c.image.caption.enabled);
+        assert_eq!(c.image.caption.max_pixels, 768);
+        assert_eq!(c.image.caption.prompt_template_version, "caption-v1");
+    }
+
+    #[test]
+    fn image_caption_env_overrides() {
+        let mut env = HashMap::new();
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_ENABLED".to_string(),
+            "true".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_MAX_PIXELS".to_string(),
+            "1024".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION".to_string(),
+            "caption-v2".to_string(),
+        );
+        let c = Config::defaults().apply_env(&env);
+        assert!(c.image.caption.enabled);
+        assert_eq!(c.image.caption.max_pixels, 1024);
+        assert_eq!(c.image.caption.prompt_template_version, "caption-v2");
+    }
+
     /// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
     /// rather than to `Some("")` so the fallback to `models.llm.endpoint`
     /// kicks in. Covers the env-equivalent of a missing TOML key.
diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs
index dcf2024..bbdf663 100644
--- a/crates/kebab-core/src/traits.rs
+++ b/crates/kebab-core/src/traits.rs
@@ -69,6 +69,17 @@ pub struct GenerateRequest {
     pub max_tokens: usize,
     pub temperature: f32,
     pub seed: Option<u64>,
+    /// Vision inputs (base64-encoded, one per image). Empty for the
+    /// text-only path that P4-2 / P4-3 / RAG uses; non-empty when a
+    /// vision-capable adapter (P6-3 caption, future multimodal RAG)
+    /// drives the call. The LM adapter is responsible for routing
+    /// these onto the wire — Ollama uses `images: [base64, ...]`,
+    /// other backends may differ.
+    ///
+    /// Defaulted on deserialization so older `*.json` payloads /
+    /// snapshots that predate the field still parse.
+    #[serde(default)]
+    pub images: Vec<String>,
 }
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
diff --git a/crates/kebab-llm-local/src/ollama.rs b/crates/kebab-llm-local/src/ollama.rs
index 51e2a49..42a9b13 100644
--- a/crates/kebab-llm-local/src/ollama.rs
+++ b/crates/kebab-llm-local/src/ollama.rs
@@ -140,9 +140,15 @@ impl LanguageModel for OllamaLanguageModel {
             format!("{}\n\n{}", req.system, req.user)
         };
 
+        // Vision inputs (P6-3) flow through the request via Ollama's
+        // `images: [base64, ...]` field. Empty for the text-only RAG
+        // path so older snapshots and JSON dumps stay byte-identical
+        // (the field is `#[serde(default)]` here so it's omitted from
+        // the wire when empty).
         let body = OllamaRequest {
             model: &self.model_id,
             prompt,
+            images: &req.images,
             stream: true,
             options: OllamaOptions {
                 temperature: effective_temperature,
@@ -188,6 +194,13 @@ impl LanguageModel for OllamaLanguageModel {
 struct OllamaRequest<'a> {
     model: &'a str,
     prompt: String,
+    /// Skipped from the JSON when empty so the text-only path keeps
+    /// the same on-the-wire shape it had pre-P6-3 (`{"model": ...,
+    /// "prompt": ..., "stream": ..., "options": ...}` — no `images`
+    /// key). Vision-capable callers populate this with one or more
+    /// base64-encoded images.
+    #[serde(skip_serializing_if = "<[String]>::is_empty")]
+    images: &'a [String],
     stream: bool,
     options: OllamaOptions<'a>,
 }
diff --git a/crates/kebab-llm-local/tests/integration.rs b/crates/kebab-llm-local/tests/integration.rs
index a0836e2..66270e6 100644
--- a/crates/kebab-llm-local/tests/integration.rs
+++ b/crates/kebab-llm-local/tests/integration.rs
@@ -31,6 +31,7 @@ fn real_ollama_streams_non_empty_response() {
         max_tokens: 8,
         temperature: 0.0,
         seed: Some(0),
+        images: Vec::new(),
     };
 
     let stream = llm.generate_stream(req).expect("stream should start");
diff --git a/crates/kebab-llm-local/tests/streaming.rs b/crates/kebab-llm-local/tests/streaming.rs
index cc25940..337b434 100644
--- a/crates/kebab-llm-local/tests/streaming.rs
+++ b/crates/kebab-llm-local/tests/streaming.rs
@@ -35,6 +35,7 @@ fn sample_request() -> GenerateRequest {
         max_tokens: 64,
         temperature: 0.0,
         seed: Some(0),
+        images: Vec::new(),
     }
 }
 
diff --git a/crates/kebab-llm/tests/mock.rs b/crates/kebab-llm/tests/mock.rs
index 2d5bbbd..97fe2e0 100644
--- a/crates/kebab-llm/tests/mock.rs
+++ b/crates/kebab-llm/tests/mock.rs
@@ -26,6 +26,7 @@ fn req_with_stop(stop: Vec<&str>) -> GenerateRequest {
         max_tokens: 64,
         temperature: 0.0,
         seed: None,
+        images: Vec::new(),
     }
 }
 
diff --git a/crates/kebab-llm/tests/reexports.rs b/crates/kebab-llm/tests/reexports.rs
index 8df88d5..0e479de 100644
--- a/crates/kebab-llm/tests/reexports.rs
+++ b/crates/kebab-llm/tests/reexports.rs
@@ -55,6 +55,7 @@ fn dyn_dispatch_via_box_works() {
         max_tokens: 16,
         temperature: 0.0,
         seed: None,
+        images: Vec::new(),
     };
     let stream = m.generate_stream(req).expect("stream");
     let chunks: Vec<TokenChunk> = stream.map(|r| r.expect("ok chunk")).collect();
diff --git a/crates/kebab-parse-image/Cargo.toml b/crates/kebab-parse-image/Cargo.toml
index e596d6c..4e78465 100644
--- a/crates/kebab-parse-image/Cargo.toml
+++ b/crates/kebab-parse-image/Cargo.toml
@@ -10,6 +10,12 @@ description   = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipe
 [dependencies]
 kebab-core   = { path = "../kebab-core" }
 kebab-config = { path = "../kebab-config" }
+# `kebab-llm` re-exports the trait crate (`kebab-core::LanguageModel`)
+# under a stable surface; the caption adapter consumes any
+# `dyn LanguageModel`. We do NOT depend on `kebab-llm-local` (forbidden
+# by p6-3 design §8) — the trait abstraction is exactly what spec
+# requires.
+kebab-llm    = { path = "../kebab-llm" }
 anyhow       = { workspace = true }
 serde        = { workspace = true }
 serde_json   = { workspace = true }
@@ -42,3 +48,10 @@ tokio        = { workspace = true, features = ["rt-multi-thread"] }
 # font rendering.
 ab_glyph     = "0.2"
 base64       = "0.22"
+# `kebab-llm/mock` exposes `MockLanguageModel` for hermetic caption
+# tests. Real adapters (Ollama) live in `kebab-llm-local`, which is
+# only allowed at the dev-dep level here — the runtime crate stays
+# trait-only, so the §8 forbidden-deps rule (no `kebab-llm-local`
+# at runtime) is preserved.
+kebab-llm        = { path = "../kebab-llm", features = ["mock"] }
+kebab-llm-local  = { path = "../kebab-llm-local" }
diff --git a/crates/kebab-parse-image/src/caption.rs b/crates/kebab-parse-image/src/caption.rs
new file mode 100644
index 0000000..3e3597b
--- /dev/null
+++ b/crates/kebab-parse-image/src/caption.rs
@@ -0,0 +1,281 @@
+//! Caption adapter (P6-3).
+//!
+//! [`caption_image`] runs a vision-capable [`LanguageModel`] over an
+//! image and produces a [`ModelCaption`]. [`apply_caption`] is the
+//! helper that mutates an [`ImageRefBlock`] in place and emits a
+//! [`ProvenanceKind::CaptionApplied`] event.
+//!
+//! ## Trust note
+//!
+//! Captions are **model-generated** (`TrustLevel::Generated`), not
+//! observed text. Vision LMs hallucinate; the system prompt explicitly
+//! forbids guessing but expect false captions. Downstream UI / RAG
+//! must label captions as model-generated and surface the model id +
+//! prompt template version (carried in `ModelCaption.model_version`)
+//! so a regression in either is auditable.
+//!
+//! ## Spec deviation (cargo `caption` feature dropped)
+//!
+//! The original P6-3 spec asked for a cargo feature `caption` (default
+//! OFF at compile time). We collapse this into a single runtime gate
+//! (`config.image.caption.enabled = false`, default OFF). Reasoning:
+//! the captioning module's only extra deps are `base64` + `image` +
+//! `kebab-llm` trait — all already pulled in by the rest of the
+//! crate. A cargo feature would only complicate the build matrix
+//! without saving meaningful binary weight. See `tasks/HOTFIXES.md`
+//! (2026-05-02) for the deviation log.
+
+use std::io::Cursor;
+
+use anyhow::{Context, Result};
+use base64::Engine as _;
+use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
+use image::{ImageFormat, ImageReader};
+use kebab_core::{
+    FinishReason, GenerateRequest, ImageRefBlock, Lang, LanguageModel, ModelCaption,
+    ProvenanceEvent, ProvenanceKind, TokenChunk,
+};
+use time::OffsetDateTime;
+
+/// Long-edge clamp range for caption inputs. Smaller than OCR's
+/// `[256, 4096]` because vision LMs charge proportionally to input
+/// dimension — captions tolerate aggressive downscale better than
+/// OCR.
+const MIN_CAPTION_LONG_EDGE: u32 = 128;
+const MAX_CAPTION_LONG_EDGE: u32 = 1536;
+
+/// Token budget for captions. Captions are one-sentence by spec — 96
+/// tokens covers a 50-word English sentence or a 30-token Korean one
+/// with headroom for the LM's preamble before the stop sequence.
+const CAPTION_MAX_TOKENS: usize = 96;
+
+/// Run a caption pass and return the resulting `ModelCaption`. Honours
+/// `config.image.caption.enabled` — when disabled the function is a
+/// no-op and returns an `Err` so the caller can route the asset
+/// through `apply_caption` instead, which knows to short-circuit.
+///
+/// Direct callers should prefer [`apply_caption`] for end-to-end
+/// pipeline integration; this lower-level entry exists so tests can
+/// pin the produced `ModelCaption` independent of block mutation.
+pub fn caption_image(
+    llm: &dyn LanguageModel,
+    image_bytes: &[u8],
+    lang_hint: Option<&Lang>,
+    cfg: &kebab_config::Config,
+) -> Result<ModelCaption> {
+    if !cfg.image.caption.enabled {
+        anyhow::bail!(
+            "captioning is disabled (set image.caption.enabled = true in config to enable)"
+        );
+    }
+
+    let max_pixels = cfg
+        .image
+        .caption
+        .max_pixels
+        .clamp(MIN_CAPTION_LONG_EDGE, MAX_CAPTION_LONG_EDGE);
+    if max_pixels != cfg.image.caption.max_pixels {
+        tracing::warn!(
+            target: "kebab-parse-image",
+            "image.caption.max_pixels = {} clamped to {} (legal range [{}, {}])",
+            cfg.image.caption.max_pixels,
+            max_pixels,
+            MIN_CAPTION_LONG_EDGE,
+            MAX_CAPTION_LONG_EDGE
+        );
+    }
+
+    let prepared = downscale_to_png(image_bytes, max_pixels)
+        .context("preparing image for caption")?;
+    let b64 = BASE64_STANDARD.encode(&prepared);
+
+    let lang = lang_hint
+        .map(|l| l.0.as_str())
+        .filter(|s| !s.is_empty() && *s != "und");
+    let (system, user) = build_prompt(lang);
+
+    // Determinism — temperature 0.0 + seed 0, same convention as RAG
+    // and OCR. The LM adapter routes the base64 image via its
+    // provider-specific channel (Ollama: `images: [base64]`).
+    let req = GenerateRequest {
+        system,
+        user,
+        stop: vec!["\n\n".to_string()],
+        max_tokens: CAPTION_MAX_TOKENS,
+        temperature: 0.0,
+        seed: Some(0),
+        images: vec![b64],
+    };
+
+    let stream = llm
+        .generate_stream(req)
+        .context("captioning LM call failed")?;
+
+    let mut text = String::new();
+    let mut saw_done = false;
+    for chunk in stream {
+        match chunk? {
+            TokenChunk::Token(t) => {
+                text.push_str(&t);
+            }
+            TokenChunk::Done { finish_reason, .. } => {
+                saw_done = true;
+                if let FinishReason::Error(e) = finish_reason {
+                    anyhow::bail!("captioning LM ended with error: {e}");
+                }
+                break;
+            }
+        }
+    }
+    if !saw_done {
+        anyhow::bail!("captioning LM stream ended without a Done frame");
+    }
+
+    let caption_text = text.trim().to_string();
+
+    let model_ref = llm.model_ref();
+    let prompt_v = &cfg.image.caption.prompt_template_version;
+    let model_version = format!(
+        "{provider}/{prompt}",
+        provider = model_ref.provider,
+        prompt = prompt_v
+    );
+
+    tracing::debug!(
+        target: "kebab-parse-image",
+        "caption ok (model={}, prompt={}, chars={})",
+        model_ref.id,
+        prompt_v,
+        caption_text.chars().count()
+    );
+
+    Ok(ModelCaption {
+        text: caption_text,
+        model: model_ref.id,
+        model_version,
+    })
+}
+
+/// Mutate `block.caption` in place by running `caption_image` over
+/// `image_bytes`. When `config.image.caption.enabled = false` the
+/// function is a clean no-op (returns `Ok(())` without invoking the
+/// LM and without writing a Provenance event).
+///
+/// On LM failure, `block.caption` stays `None` — partial state is
+/// never written. The caller decides whether to skip the asset or
+/// surface the error.
+pub fn apply_caption(
+    llm: &dyn LanguageModel,
+    image_bytes: &[u8],
+    block: &mut ImageRefBlock,
+    lang_hint: Option<&Lang>,
+    cfg: &kebab_config::Config,
+    events: &mut Vec<ProvenanceEvent>,
+) -> Result<()> {
+    if !cfg.image.caption.enabled {
+        tracing::debug!(
+            target: "kebab-parse-image",
+            "captioning skipped — image.caption.enabled = false"
+        );
+        return Ok(());
+    }
+    let caption = caption_image(llm, image_bytes, lang_hint, cfg)?;
+    let model_label = caption.model.clone();
+    let model_version_label = caption.model_version.clone();
+    block.caption = Some(caption);
+    events.push(ProvenanceEvent {
+        at: OffsetDateTime::now_utc(),
+        agent: "kb-parse-image".to_string(),
+        kind: ProvenanceKind::CaptionApplied,
+        note: Some(format!(
+            "model={model_label} model_version={model_version_label}"
+        )),
+    });
+    Ok(())
+}
+
+/// Compose the `(system, user)` prompt pair for the caption call.
+/// Korean / English split keeps the model on the requested output
+/// language; everything else falls through to English.
+fn build_prompt(lang_hint: Option<&str>) -> (String, String) {
+    match lang_hint {
+        Some("ko") | Some("kor") => (
+            "이미지를 한 문장으로 객관적으로 설명한다. 추측은 피하고, \
+             보이는 것만 적는다. 마크다운 / 따옴표 / 부가 설명 없이 \
+             한 문장만 출력."
+                .to_string(),
+            "위 이미지를 한국어로 한 문장으로 설명하라.".to_string(),
+        ),
+        _ => (
+            "Describe the image in one objective sentence. Do not \
+             speculate; describe only what is visible. No markdown, \
+             no quotes, no commentary — output a single sentence."
+                .to_string(),
+            "Describe the image above in one English sentence.".to_string(),
+        ),
+    }
+}
+
+/// Decode `bytes`, downscale long-edge to `max_long_edge`, re-encode as
+/// PNG. Mirrors the OCR pipeline's pattern but with the caption-side
+/// long-edge bounds. PNG sources within the cap pass through without
+/// re-encode.
+fn downscale_to_png(bytes: &[u8], max_long_edge: u32) -> Result<Vec<u8>> {
+    let reader = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("reading image header for caption")?;
+    let format = reader.format();
+    let (w, h) = reader
+        .into_dimensions()
+        .context("reading image dimensions for caption")?;
+
+    let long = w.max(h);
+    if long <= max_long_edge && format == Some(ImageFormat::Png) {
+        return Ok(bytes.to_vec());
+    }
+
+    let img = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("re-reading image for caption decode")?
+        .decode()
+        .context("decoding image for caption")?;
+    let final_img = if long <= max_long_edge {
+        img
+    } else {
+        let scale = max_long_edge as f32 / long as f32;
+        let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
+        let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
+        if w >= h {
+            new_w = new_w.min(max_long_edge);
+        } else {
+            new_h = new_h.min(max_long_edge);
+        }
+        img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle)
+    };
+
+    let mut out = Cursor::new(Vec::new());
+    final_img
+        .write_to(&mut out, ImageFormat::Png)
+        .context("encoding image as PNG for caption")?;
+    Ok(out.into_inner())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn build_prompt_korean_for_ko_hint() {
+        let (sys, user) = build_prompt(Some("ko"));
+        assert!(sys.contains("이미지를 한 문장으로"));
+        assert!(user.contains("한국어로"));
+    }
+
+    #[test]
+    fn build_prompt_english_for_no_hint_or_und() {
+        let (sys, _) = build_prompt(None);
+        assert!(sys.contains("Describe the image"));
+        let (sys2, _) = build_prompt(Some("en"));
+        assert!(sys2.contains("Describe the image"));
+    }
+}
diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs
index b5d826b..c5e1b87 100644
--- a/crates/kebab-parse-image/src/lib.rs
+++ b/crates/kebab-parse-image/src/lib.rs
@@ -13,14 +13,24 @@
 //! consumers can branch trust by engine (Tesseract / Apple Vision
 //! adapters, when added, will write a different `engine` string).
 //!
+//! P6-3 adds the [`caption`] module: [`caption_image`] /
+//! [`apply_caption`] route an image through any vision-capable
+//! [`kebab_core::LanguageModel`] (text-only LMs are not vision-aware
+//! and will surface a model-side error). Captions are explicitly
+//! marked **model-generated** — the trust gap between OCR (observed,
+//! engine-tagged) and caption (generated, prompt-tagged) is the
+//! workspace's central trust contract.
+//!
 //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
 //! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
 //! provenance), §9 (versioning).
 
 mod dims;
 mod exif_extract;
+pub mod caption;
 pub mod ocr;
 
+pub use caption::{apply_caption, caption_image};
 pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
 
 use anyhow::{Context, Result};
diff --git a/crates/kebab-parse-image/tests/caption.rs b/crates/kebab-parse-image/tests/caption.rs
new file mode 100644
index 0000000..dc8c78e
--- /dev/null
+++ b/crates/kebab-parse-image/tests/caption.rs
@@ -0,0 +1,368 @@
+//! Integration tests for the caption adapter (P6-3).
+//!
+//! All hermetic tests use `MockLanguageModel` from `kebab-llm/mock`
+//! which captures `req.images` indirectly via the canned response. A
+//! single opt-in test (`#[ignore]`) wires the real
+//! `kebab-llm-local::OllamaLanguageModel` against the workspace's
+//! Ollama daemon to verify the `images: [base64]` round-trip.
+
+mod common;
+
+use std::sync::{Arc, Mutex};
+
+use kebab_config::Config;
+use kebab_core::{
+    AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang,
+    LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk,
+    TokenUsage,
+};
+use kebab_llm::MockLanguageModel;
+use kebab_parse_image::{apply_caption, caption_image};
+
+use crate::common::red_100x50_png;
+
+fn cfg_with_caption_enabled() -> Config {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 512;
+    cfg
+}
+
+fn empty_image_block() -> ImageRefBlock {
+    ImageRefBlock {
+        common: CommonBlock {
+            block_id: BlockId("0".repeat(32)),
+            heading_path: Vec::new(),
+            source_span: SourceSpan::Region {
+                x: 0,
+                y: 0,
+                w: 100,
+                h: 50,
+            },
+        },
+        asset_id: Some(AssetId("a".repeat(32))),
+        src: "img/x.png".to_string(),
+        alt: "x.png".to_string(),
+        ocr: None,
+        caption: None,
+    }
+}
+
+fn mk_mock(canned: &str) -> MockLanguageModel {
+    MockLanguageModel {
+        model_id: "vision-mock:1b".to_string(),
+        provider: "mock".to_string(),
+        context_tokens: 4096,
+        canned_response: canned.to_string(),
+        canned_finish: FinishReason::Stop,
+        canned_usage: TokenUsage {
+            prompt_tokens: 0,
+            completion_tokens: 0,
+            latency_ms: 0,
+        },
+    }
+}
+
+// ── Disabled feature gate ─────────────────────────────────────────────────
+
+#[test]
+fn apply_caption_no_op_when_feature_disabled() {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = false;
+    let mock = mk_mock("ignored");
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    apply_caption(&mock, &bytes, &mut block, None, &cfg, &mut events)
+        .expect("disabled apply_caption must return Ok(())");
+    assert!(
+        block.caption.is_none(),
+        "disabled apply_caption must not write caption"
+    );
+    assert!(
+        events.is_empty(),
+        "disabled apply_caption must not append a Provenance event"
+    );
+}
+
+#[test]
+fn caption_image_errors_when_feature_disabled() {
+    let cfg = Config::defaults(); // enabled = false
+    let mock = mk_mock("ignored");
+    let bytes = red_100x50_png();
+    let r = caption_image(&mock, &bytes, None, &cfg);
+    assert!(
+        r.is_err(),
+        "caption_image must Err when image.caption.enabled = false"
+    );
+    let msg = format!("{:#}", r.unwrap_err());
+    assert!(
+        msg.contains("disabled"),
+        "error must mention disabled state: {msg}"
+    );
+}
+
+// ── Happy path ────────────────────────────────────────────────────────────
+
+#[test]
+fn apply_caption_sets_block_caption_and_appends_provenance() {
+    let cfg = cfg_with_caption_enabled();
+    let mock = mk_mock("사진 한 장");
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    apply_caption(
+        &mock,
+        &bytes,
+        &mut block,
+        Some(&Lang("ko".to_string())),
+        &cfg,
+        &mut events,
+    )
+    .expect("apply_caption must succeed");
+
+    let cap = block.caption.as_ref().expect("caption Some");
+    assert_eq!(cap.text, "사진 한 장");
+    assert_eq!(cap.model, "vision-mock:1b");
+    assert_eq!(cap.model_version, "mock/caption-v1");
+
+    assert_eq!(events.len(), 1);
+    assert_eq!(events[0].kind, ProvenanceKind::CaptionApplied);
+    assert_eq!(events[0].agent, "kb-parse-image");
+    let note = events[0].note.as_deref().unwrap_or("");
+    assert!(note.contains("vision-mock:1b") && note.contains("caption-v1"), "{note}");
+}
+
+// ── Empty token stream → empty caption text ──────────────────────────────
+
+#[test]
+fn caption_image_empty_stream_yields_empty_text() {
+    let cfg = cfg_with_caption_enabled();
+    let mock = mk_mock("");
+    let bytes = red_100x50_png();
+    let cap = caption_image(&mock, &bytes, None, &cfg).expect("empty stream must succeed");
+    assert_eq!(cap.text, "");
+    // Spec contract: caller can distinguish "captioning attempted, no
+    // result" from "captioning never attempted" by `caption.is_some()`.
+    // The text being empty does not erase the attempt.
+    assert!(!cap.model.is_empty());
+}
+
+// ── Korean vs English prompt selection ───────────────────────────────────
+
+/// `LanguageModel` impl that captures the `system` prompt sent to it
+/// so tests can verify the language branch picked by `build_prompt`
+/// (the function is private; this is the cleanest observable signal).
+struct CapturingMock {
+    captured_system: Arc<Mutex<Option<String>>>,
+    captured_images: Arc<Mutex<Vec<String>>>,
+}
+
+impl LanguageModel for CapturingMock {
+    fn model_ref(&self) -> ModelRef {
+        ModelRef {
+            id: "capture:1".to_string(),
+            provider: "mock".to_string(),
+            dimensions: None,
+        }
+    }
+    fn context_tokens(&self) -> usize {
+        4096
+    }
+    fn generate_stream(
+        &self,
+        req: GenerateRequest,
+    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
+        *self.captured_system.lock().unwrap() = Some(req.system);
+        *self.captured_images.lock().unwrap() = req.images;
+        let chunks: Vec<TokenChunk> = vec![
+            TokenChunk::Token("ok".to_string()),
+            TokenChunk::Done {
+                finish_reason: FinishReason::Stop,
+                usage: TokenUsage {
+                    prompt_tokens: 0,
+                    completion_tokens: 0,
+                    latency_ms: 0,
+                },
+            },
+        ];
+        Ok(Box::new(chunks.into_iter().map(Ok)))
+    }
+}
+
+#[test]
+fn caption_image_routes_image_into_request_images_field() {
+    let cfg = cfg_with_caption_enabled();
+    let captured_system: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+    let mock = CapturingMock {
+        captured_system: captured_system.clone(),
+        captured_images: captured_images.clone(),
+    };
+    let bytes = red_100x50_png();
+    let _ = caption_image(&mock, &bytes, Some(&Lang("ko".to_string())), &cfg)
+        .expect("caption succeeds");
+
+    let imgs = captured_images.lock().unwrap();
+    assert_eq!(imgs.len(), 1, "exactly one base64 image routed");
+    use base64::Engine as _;
+    let decoded = base64::engine::general_purpose::STANDARD
+        .decode(&imgs[0])
+        .expect("base64 decodes");
+    assert!(
+        !decoded.is_empty(),
+        "decoded image bytes must be non-empty"
+    );
+
+    let sys = captured_system.lock().unwrap().clone().unwrap();
+    assert!(
+        sys.contains("이미지를 한 문장으로"),
+        "Korean hint must produce Korean system prompt: {sys}"
+    );
+}
+
+#[test]
+fn caption_image_uses_english_prompt_for_undetermined_lang() {
+    let cfg = cfg_with_caption_enabled();
+    let captured_system: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+    let mock = CapturingMock {
+        captured_system: captured_system.clone(),
+        captured_images: Arc::new(Mutex::new(Vec::new())),
+    };
+    let bytes = red_100x50_png();
+    let _ = caption_image(&mock, &bytes, Some(&Lang("und".to_string())), &cfg)
+        .expect("caption succeeds");
+    let sys = captured_system.lock().unwrap().clone().unwrap();
+    assert!(sys.contains("Describe the image"), "{sys}");
+}
+
+// ── LM error propagates ──────────────────────────────────────────────────
+
+/// LM that returns Err immediately from `generate_stream` (before any
+/// token).
+struct FailingLm;
+impl LanguageModel for FailingLm {
+    fn model_ref(&self) -> ModelRef {
+        ModelRef {
+            id: "fail".into(),
+            provider: "mock".into(),
+            dimensions: None,
+        }
+    }
+    fn context_tokens(&self) -> usize {
+        0
+    }
+    fn generate_stream(
+        &self,
+        _req: GenerateRequest,
+    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>> {
+        Err(anyhow::anyhow!("simulated LM connection refused"))
+    }
+}
+
+#[test]
+fn apply_caption_lm_error_leaves_block_untouched() {
+    let cfg = cfg_with_caption_enabled();
+    let mut block = empty_image_block();
+    let mut events: Vec<ProvenanceEvent> = Vec::new();
+    let bytes = red_100x50_png();
+    let r = apply_caption(&FailingLm, &bytes, &mut block, None, &cfg, &mut events);
+    assert!(r.is_err());
+    assert!(
+        block.caption.is_none(),
+        "caption stays None when LM fails — partial state must not leak"
+    );
+    assert!(events.is_empty(), "no provenance event when LM fails");
+}
+
+// ── Determinism — identical mock input → identical caption ───────────────
+
+#[test]
+fn caption_image_deterministic_with_identical_inputs() {
+    let cfg = cfg_with_caption_enabled();
+    let bytes = red_100x50_png();
+    let mock1 = mk_mock("a deterministic caption");
+    let mock2 = mk_mock("a deterministic caption");
+    let cap1 = caption_image(&mock1, &bytes, None, &cfg).unwrap();
+    let cap2 = caption_image(&mock2, &bytes, None, &cfg).unwrap();
+    assert_eq!(cap1, cap2);
+}
+
+// ── max_pixels clamp ─────────────────────────────────────────────────────
+
+/// Out-of-range `max_pixels` is silently clamped at construction so a
+/// bad config can't kill ingest. The captured `images` field's
+/// decoded long edge confirms the clamp engaged.
+#[test]
+fn caption_image_clamps_oversized_max_pixels() {
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 99_999; // way over MAX_CAPTION_LONG_EDGE
+    let captured_images: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
+    let mock = CapturingMock {
+        captured_system: Arc::new(Mutex::new(None)),
+        captured_images: captured_images.clone(),
+    };
+    // 4000×3000 PNG well above the 1536 cap.
+    let bytes = common::large_blue_4000x3000_png();
+    let _ = caption_image(&mock, &bytes, None, &cfg).expect("caption succeeds");
+    let imgs = captured_images.lock().unwrap();
+    use base64::Engine as _;
+    let decoded = base64::engine::general_purpose::STANDARD
+        .decode(&imgs[0])
+        .unwrap();
+    let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
+        .with_guessed_format()
+        .unwrap();
+    let (w, h) = reader.into_dimensions().unwrap();
+    let long = w.max(h);
+    assert!(
+        long <= 1536,
+        "max_pixels must clamp to MAX_CAPTION_LONG_EDGE=1536, got {long}"
+    );
+}
+
+// ── Real Ollama integration (opt-in) ─────────────────────────────────────
+
+/// End-to-end captioning against the workspace's real Ollama daemon
+/// via `kebab-llm-local::OllamaLanguageModel` (dev-dep). Skipped by
+/// default via `#[ignore]`; opt in with `--ignored`.
+///
+/// Run with:
+///
+/// ```sh
+/// KEBAB_MODELS_LLM_ENDPOINT=http://192.168.0.47:11434 \
+/// KEBAB_MODELS_LLM_MODEL=gemma4:e4b \
+/// cargo test -p kebab-parse-image --test caption \
+///   caption_integration -- --ignored --nocapture
+/// ```
+#[test]
+#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"]
+fn caption_integration_real_ollama_describes_image() {
+    use kebab_llm_local::OllamaLanguageModel;
+
+    let mut cfg = Config::defaults();
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 768;
+    if let Ok(ep) = std::env::var("KEBAB_MODELS_LLM_ENDPOINT") {
+        cfg.models.llm.endpoint = ep;
+    } else {
+        cfg.models.llm.endpoint = "http://192.168.0.47:11434".to_string();
+    }
+    if let Ok(m) = std::env::var("KEBAB_MODELS_LLM_MODEL") {
+        cfg.models.llm.model = m;
+    } else {
+        cfg.models.llm.model = "gemma4:e4b".to_string();
+    }
+    cfg.models.llm.provider = "ollama".to_string();
+
+    let llm = OllamaLanguageModel::new(&cfg).expect("OllamaLanguageModel::new");
+    let bytes = red_100x50_png();
+    let cap = caption_image(&llm, &bytes, Some(&Lang("en".to_string())), &cfg)
+        .expect("real-Ollama caption_image must succeed");
+    eprintln!("integration caption: {}", cap.text);
+    assert!(!cap.text.is_empty(), "caption must be non-empty");
+    assert_eq!(cap.model, "gemma4:e4b");
+    assert!(cap.model_version.contains("ollama"));
+    assert!(cap.model_version.contains("caption-v1"));
+}
diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs
index 3271f63..af53265 100644
--- a/crates/kebab-rag/src/pipeline.rs
+++ b/crates/kebab-rag/src/pipeline.rs
@@ -195,6 +195,9 @@ impl RagPipeline {
             max_tokens: max_completion,
             temperature,
             seed,
+            // RAG is text-only — vision inputs only flow when a
+            // future multimodal pipeline injects images here.
+            images: Vec::new(),
         };
 
         let mut acc = String::new();
diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md
index e11346a..d1863b0 100644
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -14,6 +14,30 @@ historical contract that was implemented; this file accumulates the
 deltas so phase 5+ readers can find the live behavior without diffing
 git history.
 
+## 2026-05-02 — P6-3 caption: GenerateRequest.images + cargo feature dropped
+
+**Discovered**: P6-3 implementation start.
+
+**Symptom 1**: `tasks/p6/p6-3-caption-adapter.md` § Public surface declares `caption_image(llm: &dyn kebab_core::LanguageModel, ...)`, but the frozen `LanguageModel` trait + `GenerateRequest` from p4-1 carry no vision input. The spec's behavior contract ("the adapter is responsible for rendering the prompt to wire") implicitly relied on a trait extension that p4-1 never specced.
+
+**Symptom 2**: Spec § Definition of Done asks for `cargo check -p kebab-parse-image --features caption` — i.e. a cargo feature gate. The captioning module's only extra deps are `base64` + `image` + the `kebab-llm` trait, all already pulled in by P6-2. A cargo feature would only complicate the build matrix without saving meaningful binary weight.
+
+**Root cause**: Two small spec gaps that resolve cleanly together — extend the `LanguageModel` trait once for vision routing, and collapse compile-time + runtime gating into a single runtime gate.
+
+**Fix** (PR #34, feat/p6-3-caption-adapter):
+- `kebab-core::GenerateRequest` gains an `images: Vec<String>` field (`#[serde(default)]` for backward compat with pre-P6 wire payloads / snapshots). Empty for the text-only RAG path; populated with one or more base64 strings by vision-aware callers.
+- `kebab-llm-local::OllamaLanguageModel` routes `req.images` onto the wire as `images: [base64, ...]` (Ollama's vision channel). The wire shape stays byte-identical for empty `images` because the field uses `#[serde(skip_serializing_if = "<[String]>::is_empty")]`.
+- `kebab-parse-image::caption` module: `caption_image` / `apply_caption` build `GenerateRequest { images: vec![b64], temperature: 0.0, seed: 0, ... }` and accept any `&dyn LanguageModel`. Korean / English prompt branch picked from `lang_hint`.
+- Cargo feature `caption` is **not** introduced — the runtime gate `config.image.caption.enabled = false` (default OFF) suffices.
+- All existing `GenerateRequest { ... }` literals (kebab-rag, kebab-llm tests, kebab-llm-local tests) gained `images: Vec::new()` to satisfy the new field.
+
+**Trust note**: Captions stay explicitly model-generated. `ModelCaption.model_version` carries `"<provider>/<prompt_template_version>"` (e.g. `"ollama/caption-v1"`) so a regression in either prompt or model is auditable from the wire.
+
+**Amends**:
+- tasks/p4/p4-1-llm-trait.md (`GenerateRequest` schema gained `images: Vec<String>`).
+- tasks/p4/p4-2-ollama-adapter.md (request body now optionally includes `images: [...]`).
+- tasks/p6/p6-3-caption-adapter.md ("Definition of Done" cargo feature `caption` dropped; runtime gate is the only feature gate).
+
 ## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision
 
 **Discovered**: P6-2 implementation start.
diff --git a/tasks/p6/p6-3-caption-adapter.md b/tasks/p6/p6-3-caption-adapter.md
index c4a99fd..8f8a4fb 100644
--- a/tasks/p6/p6-3-caption-adapter.md
+++ b/tasks/p6/p6-3-caption-adapter.md
@@ -3,7 +3,7 @@ phase: P6
 component: kebab-parse-image (caption adapter)
 task_id: p6-3
 title: "ModelCaption adapter (LanguageModel-driven, feature-gated)"
-status: planned
+status: completed
 depends_on: [p6-1, p4-2]
 unblocks: []
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md