2026-05-02 08:22:27 +00:00 · 2026-05-02 08:14:49 +00:00 · 2026-05-02 07:40:55 +00:00 · 2026-05-02 07:40:55 +00:00 · 2026-05-02 07:40:55 +00:00 · 2026-05-02 07:40:55 +00:00
7 changed files with 917 additions and 33 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3397,6 +3397,7 @@ dependencies = [
 "anyhow",
 "blake3",
 "dirs 5.0.1",
+ "image",
 "kebab-chunk",
 "kebab-config",
 "kebab-core",
@@ -3405,6 +3406,7 @@ dependencies = [
 "kebab-llm",
 "kebab-llm-local",
 "kebab-normalize",
+ "kebab-parse-image",
 "kebab-parse-md",
 "kebab-parse-types",
 "kebab-rag",
@@ -3417,10 +3419,12 @@ dependencies = [
 "serde_json",
 "tempfile",
 "time",
+ "tokio",
 "toml",
 "tracing",
 "tracing-appender",
 "tracing-subscriber",
+ "wiremock",
 ]

 [[package]]
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -23,6 +23,11 @@ kebab-embed-local = { path = "../kebab-embed-local" }
 kebab-llm = { path = "../kebab-llm" }
 kebab-llm-local = { path = "../kebab-llm-local" }
 kebab-rag = { path = "../kebab-rag" }
+# P6-4: image extractor + OCR + caption adapters live here. App
+# threads them into the per-asset dispatch (see `ingest_one_asset`
+# image branch). Public-API-only consumption — no `kebab-parse-image`
+# internals leak into kb-app code.
+kebab-parse-image = { path = "../kebab-parse-image" }
 anyhow               = { workspace = true }
 blake3               = { workspace = true }
 serde                = { workspace = true }
@@ -37,3 +42,9 @@ dirs                 = "5"
 [dev-dependencies]
 rusqlite             = { workspace = true }
 tempfile             = { workspace = true }
+# Image-pipeline integration tests use wiremock to stub Ollama for OCR
+# / caption HTTP calls. Async runtime to host the mock server only;
+# the kb-app code under test stays sync.
+wiremock             = { workspace = true }
+tokio                = { workspace = true, features = ["rt-multi-thread"] }
+image                = { version = "0.25", default-features = false, features = ["png"] }
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -41,12 +41,15 @@ use serde::{Deserialize, Serialize};

 use kebab_chunk::MdHeadingV1Chunker;
 use kebab_core::{
-    Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
+    Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
    DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
-    EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
-    SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
+    EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
+    ParserVersion, RawAsset, SearchHit, SearchQuery, SourceConnector, SourceScope,
+    SourceUri, VectorRecord, VectorStore,
 };
+use kebab_llm_local::OllamaLanguageModel;
 use kebab_normalize::build_canonical_document;
+use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
 use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
 use kebab_source_fs::FsSourceConnector;

@@ -190,6 +193,35 @@ pub fn ingest_with_config(
    let parser_version = ParserVersion(KEBAB_PARSE_MD_VERSION.to_string());
    let chunk_policy = chunk_policy_from_config(&app.config);

+    // P6-4: build OCR / caption adapters once per ingest invocation,
+    // gated on their respective `enabled` flags. `reqwest::blocking::Client`
+    // is internally Arc-shared so reusing one instance across the asset
+    // loop is correct and cheap. Construction failure (e.g. invalid
+    // endpoint) aborts ingest fail-fast — better than silently disabling
+    // OCR/caption mid-run.
+    let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
+        Some(
+            OllamaVisionOcr::new(&app.config)
+                .context("kb-app::ingest: build OllamaVisionOcr")?,
+        )
+    } else {
+        None
+    };
+    let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
+        Some(Box::new(
+            OllamaLanguageModel::new(&app.config)
+                .context("kb-app::ingest: build OllamaLanguageModel for caption")?,
+        ))
+    } else {
+        None
+    };
+    let image_extractor = ImageExtractor::new();
+    let image_pipeline = ImagePipeline {
+        extractor: &image_extractor,
+        ocr_engine: ocr_engine.as_ref(),
+        caption_llm: caption_llm.as_deref(),
+    };
+
    // Pre-load every existing doc_id so we can label `IngestItem.kind`
    // as `New` vs `Updated` correctly. `list_documents` returns one
    // row per `(workspace_path, asset_id)` — index by the deterministic
@@ -230,6 +262,7 @@ pub fn ingest_with_config(
            embedder.as_ref(),
            vector_store.as_ref(),
            &existing_doc_ids,
+            &image_pipeline,
        );

        let item = match item {
@@ -241,7 +274,12 @@ pub fn ingest_with_config(
                    error = %e,
                    "kb-app::ingest: per-file fatal"
                );
-                error_count = error_count.saturating_add(1);
+                // Note: `error_count += 1` happens below in the
+                // `match item.kind { Error => ... }` arm — incrementing
+                // here too would double-count (a regression first
+                // surfaced by P6-4 image dispatch where Err returns
+                // are common; markdown rarely propagated Err so the
+                // bug went unnoticed).
                kebab_core::IngestItem {
                    kind: kebab_core::IngestItemKind::Error,
                    doc_id: None,
@@ -434,10 +472,20 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
 /// `<… as JobRepo>` to be explicit.
 type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;

+/// P6-4: borrowed bundle of the three image-pipeline components built
+/// once per ingest invocation. Threaded through `ingest_one_asset` so
+/// the dispatch does not need ten separate parameters.
+struct ImagePipeline<'a> {
+    extractor: &'a ImageExtractor,
+    ocr_engine: Option<&'a OllamaVisionOcr>,
+    caption_llm: Option<&'a dyn LanguageModel>,
+}
+
 /// Process a single asset: read bytes, parse, normalize, chunk,
 /// persist, embed. Per-asset failures bubble up to the caller for
 /// labelling as `IngestItemKind::Error` — they do NOT abort the
 /// whole run.
+#[allow(clippy::too_many_arguments)]
 fn ingest_one_asset(
    app: &App,
    asset: &RawAsset,
@@ -446,27 +494,47 @@ fn ingest_one_asset(
    embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
    existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
 ) -> anyhow::Result<kebab_core::IngestItem> {
    tracing::debug!(
        target: "kebab-app::ingest",
        path = %asset.workspace_path.0,
+        media_type = ?asset.media_type,
        "processing asset"
    );
-    // Only handle Markdown for now; other media types are P6+ work.
-    if asset.media_type != kebab_core::MediaType::Markdown {
-        return Ok(kebab_core::IngestItem {
-            kind: kebab_core::IngestItemKind::Skipped,
-            doc_id: None,
-            doc_path: asset.workspace_path.clone(),
-            asset_id: Some(asset.asset_id.clone()),
-            byte_len: Some(asset.byte_len),
-            block_count: None,
-            chunk_count: None,
-            parser_version: None,
-            chunker_version: None,
-            warnings: Vec::new(),
-            error: None,
-        });
+    // P6-4: dispatch on media_type. Markdown takes the existing
+    // parse-md / normalize path; image takes the new
+    // ImageExtractor + (optional) OCR + (optional) caption path.
+    // Anything else (PDF, audio, unknown) is skipped — the
+    // respective phases (P7 / P8) wire them in later.
+    match &asset.media_type {
+        MediaType::Markdown => { /* fall through to markdown path */ }
+        MediaType::Image(_) => {
+            return ingest_one_image_asset(
+                app,
+                asset,
+                chunk_policy,
+                embedder,
+                vector_store,
+                existing_doc_ids,
+                image_pipeline,
+            );
+        }
+        _ => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: Vec::new(),
+                error: None,
+            });
+        }
    }

    let path = match &asset.source_uri {
@@ -612,6 +680,277 @@ fn ingest_one_asset(
    })
 }

+/// P6-4: process one `MediaType::Image(_)` asset end-to-end.
+///
+/// Pipeline: read bytes → `ImageExtractor::extract` → optional
+/// `apply_ocr` → optional `apply_caption` → existing chunker / embedder
+/// / store path (the same one markdown uses, which already handles
+/// `Block::ImageRef` per P1-5).
+///
+/// Failure semantics (per P6-4 spec):
+/// - `ImageExtractor::extract` Err → propagate (caller increments
+///   `errors`).
+/// - OCR / caption Err → log + `ProvenanceKind::Warning` event, continue.
+///   `block.ocr` / `block.caption` stay `None`. `errors` NOT incremented.
+#[allow(clippy::too_many_arguments)]
+fn ingest_one_image_asset(
+    app: &App,
+    asset: &RawAsset,
+    chunk_policy: &ChunkPolicy,
+    embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
+    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
+    existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
+) -> anyhow::Result<kebab_core::IngestItem> {
+    let image_extractor = image_pipeline.extractor;
+    let ocr_engine = image_pipeline.ocr_engine;
+    let caption_llm = image_pipeline.caption_llm;
+    let path = match &asset.source_uri {
+        SourceUri::File(p) => p.clone(),
+        SourceUri::Kb(_) => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: vec![
+                    "kb:// source URIs are not supported by the fs ingester".into(),
+                ],
+                error: None,
+            });
+        }
+    };
+    let bytes = std::fs::read(&path)
+        .with_context(|| format!("read image asset bytes from {}", path.display()))?;
+
+    // 1. Decode + EXIF + dimensions. ExtractContext.config carries
+    //    nothing the image extractor reads today; we pass a default
+    //    instance per the trait shape.
+    let extract_config = kebab_core::ExtractConfig::default();
+    let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
+    let ctx = ExtractContext {
+        asset,
+        workspace_root: &workspace_root,
+        config: &extract_config,
+    };
+    let mut canonical = image_extractor
+        .extract(&ctx, &bytes)
+        .context("kb-parse-image::ImageExtractor::extract")?;
+
+    // 2 + 3. Apply OCR / caption when their adapters exist. Both are
+    //        Lenient — failure is captured into Provenance Warning,
+    //        `block.ocr` / `block.caption` stay `None`. P6-4 spec
+    //        explicitly: such partial failures do NOT increment the
+    //        `errors` counter.
+    //
+    //        Determinism stress (per spec Risks): the per-document
+    //        Provenance timestamps for any analysis-stage Warning
+    //        events share a single `now_utc()` reading taken once
+    //        here, mirroring `kb-normalize::build_canonical_document`.
+    let lang_hint = lang_hint_from_doc(&canonical);
+    let now = time::OffsetDateTime::now_utc();
+    let mut warning_notes: Vec<String> = Vec::new();
+    match canonical.blocks.first_mut() {
+        Some(Block::ImageRef(block)) => {
+            if let Some(engine) = ocr_engine
+                && let Err(e) = apply_ocr(
+                    engine,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &mut canonical.provenance.events,
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "OcrFailed",
+                    e,
+                    now,
+                );
+            }
+            if let Some(llm) = caption_llm
+                && let Err(e) = apply_caption(
+                    llm,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &app.config,
+                    &mut canonical.provenance.events,
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "CaptionFailed",
+                    e,
+                    now,
+                );
+            }
+        }
+        // P6-1 contract: image documents always have exactly one
+        // `Block::ImageRef`. If a future task introduces multi-block
+        // image documents the silent-skip would mask a real bug, so
+        // this arm surfaces the divergence loudly.
+        other => {
+            tracing::warn!(
+                target: "kebab-app",
+                path = %asset.workspace_path.0,
+                blocks = canonical.blocks.len(),
+                "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
+                other.map(|b| std::mem::discriminant(b))
+            );
+            canonical.provenance.events.push(kebab_core::ProvenanceEvent {
+                at: now,
+                agent: "kb-app".to_string(),
+                kind: kebab_core::ProvenanceKind::Warning,
+                note: Some(
+                    "image document missing leading ImageRef block — OCR/caption skipped"
+                        .to_string(),
+                ),
+            });
+            warning_notes
+                .push("ImageDispatchAnomaly: missing ImageRef block".to_string());
+        }
+    }
+
+    // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
+    //    `Block::ImageRef` arm already produces a single chunk per
+    //    image (P1-5). The chunk text now follows the (β) plain-concat
+    //    contract per the kebab-chunk render_block_text update.
+    let chunks = MdHeadingV1Chunker
+        .chunk(&canonical, chunk_policy)
+        .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
+
+    // 5. Persist + embed — identical sequence to markdown.
+    app.sqlite
+        .put_asset_with_bytes(asset, &bytes)
+        .context("DocumentStore::put_asset_with_bytes (image)")?;
+    app.sqlite
+        .put_document(&canonical)
+        .context("DocumentStore::put_document (image)")?;
+    app.sqlite
+        .put_blocks(&canonical.doc_id, &canonical.blocks)
+        .context("DocumentStore::put_blocks (image)")?;
+    app.sqlite
+        .put_chunks(&canonical.doc_id, &chunks)
+        .context("DocumentStore::put_chunks (image)")?;
+
+    if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
+        && !chunks.is_empty()
+    {
+        let inputs: Vec<EmbeddingInput<'_>> = chunks
+            .iter()
+            .map(|c| EmbeddingInput {
+                text: c.text.as_str(),
+                kind: EmbeddingKind::Document,
+            })
+            .collect();
+        let vectors = emb
+            .embed(&inputs)
+            .context("Embedder::embed (image chunks)")?;
+        let model_id = emb.model_id();
+        let model_version = emb.model_version();
+        let dimensions = emb.dimensions();
+        let records: Vec<VectorRecord> = chunks
+            .iter()
+            .zip(vectors)
+            .map(|(c, v)| VectorRecord {
+                embedding_id: kebab_core::id_for_embedding(
+                    &c.chunk_id,
+                    &model_id,
+                    &model_version,
+                    dimensions,
+                ),
+                chunk_id: c.chunk_id.clone(),
+                vector: v,
+                doc_id: canonical.doc_id.clone(),
+                text: c.text.clone(),
+                heading_path: c.heading_path.clone(),
+                model_id: model_id.clone(),
+                model_version: model_version.clone(),
+                dimensions,
+            })
+            .collect();
+        vec_store
+            .upsert(&records)
+            .context("VectorStore::upsert (image)")?;
+    }
+
+    let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
+        kebab_core::IngestItemKind::Updated
+    } else {
+        kebab_core::IngestItemKind::New
+    };
+
+    Ok(kebab_core::IngestItem {
+        kind,
+        doc_id: Some(canonical.doc_id.clone()),
+        doc_path: asset.workspace_path.clone(),
+        asset_id: Some(asset.asset_id.clone()),
+        byte_len: Some(asset.byte_len),
+        block_count: u32::try_from(canonical.blocks.len()).ok(),
+        chunk_count: u32::try_from(chunks.len()).ok(),
+        parser_version: Some(canonical.parser_version.clone()),
+        chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
+        warnings: warning_notes,
+        error: None,
+    })
+}
+
+/// Centralised handling for image-analysis (OCR / caption) failures.
+/// Emits a `tracing::warn!`, appends a `ProvenanceKind::Warning`
+/// event sharing the caller's per-document `now`, and pushes a
+/// `<WarningKind>: <err>` note onto the `IngestItem.warnings` slot
+/// using the same shape the markdown path uses (so downstream wire
+/// readers don't have to learn two formats — see kb-normalize's
+/// `warning_agent`).
+fn record_image_analysis_failure(
+    asset: &RawAsset,
+    events: &mut Vec<kebab_core::ProvenanceEvent>,
+    warning_notes: &mut Vec<String>,
+    kind_label: &str,
+    err: anyhow::Error,
+    now: time::OffsetDateTime,
+) {
+    let detail = format!("{err:#}");
+    let note = format!("{kind_label}: {detail}");
+    tracing::warn!(
+        target: "kebab-app",
+        path = %asset.workspace_path.0,
+        "image analysis stage {} failed: {}",
+        kind_label,
+        detail
+    );
+    events.push(kebab_core::ProvenanceEvent {
+        at: now,
+        agent: "kb-app".to_string(),
+        kind: kebab_core::ProvenanceKind::Warning,
+        note: Some(note.clone()),
+    });
+    warning_notes.push(note);
+}
+
+/// Pull the BCP-47 language hint from the canonical document. P6-1
+/// stamps `Lang("und")` by default; an empty or "und" tag yields
+/// `None` here, so no language hint reaches the OCR / caption
+/// adapters for undetermined-language documents.
+fn lang_hint_from_doc(doc: &CanonicalDocument) -> Option<Lang> {
+    let s = doc.lang.0.as_str();
+    if s.is_empty() || s == "und" {
+        None
+    } else {
+        Some(doc.lang.clone())
+    }
+}
+
 /// Convenience: end byte of the frontmatter region (or 0 when absent).
 fn fm_span_end(span: Option<kebab_parse_md::FrontmatterSpan>) -> usize {
    span.map(|s| s.end).unwrap_or(0)
--- a/crates/kebab-app/tests/image_pipeline.rs
+++ b/crates/kebab-app/tests/image_pipeline.rs
@@ -0,0 +1,421 @@
+//! P6-4 image ingest wiring — end-to-end integration.
+//!
+//! Each test spins up a `TempDir` workspace + writes one PNG fixture +
+//! routes OCR / caption HTTP calls through a `wiremock` server that
+//! impersonates Ollama's `/api/generate` endpoint. The kb-app code
+//! under test is sync; the wiremock server is async, so test bodies
+//! drive blocking work via `tokio::task::spawn_blocking`.
+
+mod common;
+
+use std::path::Path;
+
+use common::TestEnv;
+use kebab_config::Config;
+use serde_json::json;
+use tokio::task::spawn_blocking;
+use wiremock::matchers::{method, path};
+use wiremock::{Mock, MockServer, ResponseTemplate};
+
+// ── Fixture helpers ──────────────────────────────────────────────────────
+
+/// Tiny solid-red PNG written into the test workspace at `<root>/<name>`.
+/// 100×50 — small enough to skip downscale by default but non-trivially
+/// inspectable in stored DB rows.
+fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
+    use image::{ImageBuffer, Rgb};
+    let img: ImageBuffer<Rgb<u8>, _> =
+        ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
+    let path = root.join(name);
+    img.save(&path).expect("write PNG fixture");
+    path
+}
+
+fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config {
+    let mut cfg = env.config.clone();
+    // Ensure image assets are scanned.
+    cfg.workspace
+        .include
+        .push("**/*.png".to_string());
+    cfg.image.ocr.enabled = true;
+    cfg.image.ocr.endpoint = Some(mock_endpoint.to_string());
+    cfg.image.ocr.model = "vision-mock:1b".to_string();
+    cfg.image.ocr.max_pixels = 512;
+    cfg.image.caption.enabled = false; // tested separately below
+    cfg.models.llm.endpoint = mock_endpoint.to_string();
+    cfg.models.llm.model = "vision-mock:1b".to_string();
+    cfg
+}
+
+// ── 1. Happy path: OCR-only ingest ───────────────────────────────────────
+
+/// One PNG asset + OCR enabled (caption off) → ingest produces 1 doc + 1
+/// chunk; the stored block carries the OCR text (chunk text not asserted).
+#[tokio::test]
+async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "model": "vision-mock:1b",
+            "response": "Hello World 2026",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    let png = write_red_png(&env.workspace_root, "diagram.png");
+    eprintln!("PNG written to {}", png.display());
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+    let cfg_clone = cfg.clone();
+    let env_workspace = env.workspace_root.clone();
+    let env_scope = env.scope();
+
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, env_scope, false)
+            .expect("image ingest must succeed")
+    })
+    .await
+    .expect("blocking task panicked");
+
+    // Counters: scanned should include the PNG; new ≥ 1 (markdown
+    // fixtures from the workspace tree may also count).
+    assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
+    assert_eq!(report.errors, 0, "no errors on lenient OCR path");
+
+    // Locate the image doc in the report items.
+    let items = report.items.expect("items present (summary_only=false)");
+    let img_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .expect("image doc item must be present");
+    assert_eq!(
+        img_item.kind,
+        kebab_core::IngestItemKind::New,
+        "image asset must be classified New on first ingest"
+    );
+    assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
+
+    // Inspect the stored ImageRef block via kb-app's inspect_doc facade.
+    let doc_id = img_item.doc_id.clone().expect("image doc id");
+    let doc = kebab_app::inspect_doc_with_config(cfg.clone(), &doc_id)
+        .expect("inspect_doc returns the image document");
+    let block = match doc.blocks.first() {
+        Some(kebab_core::Block::ImageRef(b)) => b,
+        other => panic!("expected ImageRef, got {other:?}"),
+    };
+    assert!(block.ocr.is_some(), "block.ocr populated by apply_ocr");
+    assert_eq!(
+        block.ocr.as_ref().unwrap().joined,
+        "Hello World 2026",
+        "OCR text from mock"
+    );
+    assert!(
+        block.caption.is_none(),
+        "caption disabled in cfg → block.caption stays None"
+    );
+
+    // Sanity: the doc was actually persisted into SQLite (kb-app's
+    // list_docs facade reads the same store the ingest path writes to).
+    let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
+        .expect("list_docs");
+    assert!(
+        summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
+        "image doc must appear in list_docs"
+    );
+
+    drop(env_workspace); // keep TempDir alive until here
+    drop(env);
+}
+
+// ── 2. OCR + caption together ────────────────────────────────────────────
+
+/// Both OCR and caption enabled. The mock returns the same JSON body
+/// for every `/api/generate` POST — this `Mock` matches on method and
+/// path only, so both calls are treated as equivalent. We then verify
+/// that both `block.ocr` and `block.caption` are populated (the
+/// `\n\n`-joined chunk text is not asserted here).
+#[tokio::test]
+async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "shared mock body",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let mut cfg = cfg_with_image_pipeline(&env, &server.uri());
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 384;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest must succeed with both OCR+caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_some(), "OCR populated");
+    assert!(block.caption.is_some(), "caption populated");
+    drop(env);
+}
+
+// ── 3. Lenient failure: OCR Ollama 503 → asset still indexed ─────────────
+
+/// OCR endpoint returns 503. Spec contract: image is still indexed,
+/// `block.ocr = None`, Provenance has a Warning event, `errors`
+/// counter NOT incremented.
+#[tokio::test]
+async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(503))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "broken.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on lenient OCR failure")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(
+        report.errors, 0,
+        "lenient OCR failure must NOT increment errors counter (spec)"
+    );
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("broken.png"))
+        .expect("asset still indexed despite OCR failure");
+    assert_eq!(img_item.kind, kebab_core::IngestItemKind::New);
+    assert_eq!(img_item.chunk_count, Some(1));
+    assert!(
+        !img_item.warnings.is_empty(),
+        "lenient OCR failure must surface a warning on the IngestItem"
+    );
+
+    let doc_id = img_item.doc_id.clone().unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, &doc_id).unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_none(), "block.ocr stays None on OCR failure");
+    let warning = doc
+        .provenance
+        .events
+        .iter()
+        .find(|e| e.kind == kebab_core::ProvenanceKind::Warning && e.agent == "kb-app")
+        .expect("Provenance Warning attributed to kb-app");
+    let note = warning.note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("OcrFailed"),
+        "warning note must describe OCR failure with OcrFailed prefix \
+         (markdown-style WarningKind format): {note}"
+    );
+}
+
+// ── 4. Both image.ocr.enabled and image.caption.enabled = false ──────────
+
+/// When both adapters are disabled, the image is still extracted +
+/// chunked. Chunk text falls back to the filename (not asserted here).
+/// EXIF + dimensions come from the extractor; only dimensions are checked.
+#[tokio::test]
+async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
+    // No mock server needed — neither HTTP path is touched.
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "raw.png");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string());
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest with no OCR/caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("raw.png"))
+        .unwrap();
+    assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_none() && block.caption.is_none());
+    // EXIF + dimensions still populated by the extractor.
+    let dims = doc
+        .metadata
+        .user
+        .get("dimensions")
+        .and_then(|v: &serde_json::Value| v.as_object())
+        .expect("dimensions object present");
+    assert_eq!(
+        dims.get("w").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(100)
+    );
+    assert_eq!(
+        dims.get("h").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(50)
+    );
+}
+
+// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ──────────
+
+/// `kebab-source-fs` classifies a `.png` extension as
+/// `MediaType::Image(Png)` regardless of content. When the bytes don't
+/// decode as any image format, `ImageExtractor::extract` returns Err
+/// and the asset must be classified as `IngestItemKind::Error` with
+/// the `errors` counter incremented **exactly once** (regression for
+/// the double-count bug surfaced during P6-4 manual smoke).
+#[tokio::test]
+async fn garbage_png_increments_errors_counter_exactly_once() {
+    // No mock server needed — extract fails before any HTTP call.
+    let env = TestEnv::lexical_only();
+    // Single non-image asset with .png extension.
+    std::fs::write(
+        env.workspace_root.join("garbage.png"),
+        b"this is not an image at all",
+    )
+    .expect("write garbage fixture");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string());
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on per-asset failure")
+    })
+    .await
+    .expect("task");
+
+    // Exactly-once: scanned counts the asset, errors counts it once,
+    // and (scanned == new + updated + skipped + errors) holds.
+    assert_eq!(
+        report.errors, 1,
+        "garbage PNG must increment errors exactly once, not twice (double-count regression)"
+    );
+    assert_eq!(
+        report.scanned,
+        report.new + report.updated + report.skipped + report.errors,
+        "counter sum must equal scanned — invariant of the IngestReport contract"
+    );
+
+    // The single Error item carries the propagated extract error.
+    let items = report.items.expect("items present");
+    let err_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("garbage.png"))
+        .expect("garbage item present");
+    assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error);
+    assert!(err_item.error.is_some(), "Error item carries error string");
+}
+
+// ── 6. Determinism: re-ingest produces identical doc_id / chunk_id ───────
+
+/// Idempotency contract — running the same ingest twice should mark
+/// the asset Updated on the second run with an identical `doc_id`.
+#[tokio::test]
+async fn re_ingest_image_produces_updated_with_same_doc_id() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "stable",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+
+    let scope = env.scope();
+    let cfg1 = cfg.clone();
+    let cfg2 = cfg.clone();
+    let scope1 = scope.clone();
+    let scope2 = scope.clone();
+
+    let r1 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
+    })
+    .await
+    .unwrap();
+    let r2 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
+    })
+    .await
+    .unwrap();
+
+    let id1 = r1
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap()
+        .doc_id
+        .clone()
+        .unwrap();
+    let img2 = r2
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    assert_eq!(img2.kind, kebab_core::IngestItemKind::Updated);
+    assert_eq!(img2.doc_id.as_ref().unwrap(), &id1);
+}
--- a/crates/kebab-chunk/src/md_heading_v1.rs
+++ b/crates/kebab-chunk/src/md_heading_v1.rs
@@ -381,17 +381,41 @@ fn render_block_text(b: &Block) -> String {
            }
            s
        }
-        // ImageRef text portion = alt (per task spec). Fall back to
-        // model caption text if alt is empty.
+        // ImageRef text portion follows the P6-4 (β) plain-concat
+        // contract — `[alt, ocr.joined, caption.text]` joined by
+        // `\n\n`, dropping empty parts. Filename fallback for empty
+        // alt keeps lexical search hits on filenames working even when
+        // P6-1's filename auto-fill is bypassed.
        Block::ImageRef(i) => {
-            if !i.alt.is_empty() {
+            let alt = if !i.alt.is_empty() {
                i.alt.clone()
            } else {
-                i.caption
-                    .as_ref()
-                    .map(|c| c.text.clone())
-                    .unwrap_or_default()
-            }
+                // P6-1 falls back to filename so this branch is
+                // defensive — keep it lest a future test fixture or
+                // synthetic block path skip the auto-fill.
+                i.src
+                    .rsplit('/')
+                    .next()
+                    .filter(|s| !s.is_empty())
+                    .unwrap_or("[image]")
+                    .to_string()
+            };
+            let ocr = i
+                .ocr
+                .as_ref()
+                .map(|o| o.joined.as_str())
+                .unwrap_or("");
+            let cap = i
+                .caption
+                .as_ref()
+                .map(|c| c.text.as_str())
+                .unwrap_or("");
+            [alt.as_str(), ocr, cap]
+                .iter()
+                .filter(|s| !s.is_empty())
+                .copied()
+                .collect::<Vec<_>>()
+                .join("\n\n")
        }
        // AudioRef has no caption preview yet (transcript joins land
        // in P8). Empty string per task spec.
@@ -700,6 +724,63 @@ mod tests {
        }
    }

+    /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text
+    /// joined by `\n\n`, dropping empty parts. Verifies four shapes (alt-only,
+    /// alt+ocr, alt+caption, alt+ocr+caption) plus the empty-alt `src` fallback.
+    #[test]
+    fn image_ref_p6_4_plain_concat_drops_empty_parts() {
+        use kebab_core::{ModelCaption, OcrText};
+
+        let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| {
+            Block::ImageRef(ImageRefBlock {
+                common: common_for("imageref", &[], 0, span(1, 1)),
+                asset_id: None,
+                src: "img.png".into(), // basename expected by the empty-alt case below
+                alt: alt.into(),
+                ocr: ocr.map(|t| OcrText {
+                    joined: t.into(),
+                    regions: vec![],
+                    engine: "test".into(),
+                    engine_version: "v1".into(),
+                }),
+                caption: cap.map(|t| ModelCaption {
+                    text: t.into(),
+                    model: "m".into(),
+                    model_version: "v".into(),
+                }),
+            })
+        };
+
+        // alt-only — no separators between empty parts.
+        assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png");
+
+        // alt + ocr — joined by exactly one `\n\n`.
+        assert_eq!(
+            render_block_text(&mk("photo.png", Some("Hello"), None)),
+            "photo.png\n\nHello"
+        );
+
+        // alt + caption.
+        assert_eq!(
+            render_block_text(&mk("photo.png", None, Some("a red square"))),
+            "photo.png\n\na red square"
+        );
+
+        // alt + ocr + caption — three parts joined by `\n\n` each.
+        assert_eq!(
+            render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))),
+            "photo.png\n\nHello\n\na red square"
+        );
+
+        // empty alt — falls back to filename derived from `src`.
+        let blk = mk("", Some("text from image"), None);
+        assert_eq!(
+            render_block_text(&blk),
+            "img.png\n\ntext from image",
+            "empty alt must fall back to the basename of `src`"
+        );
+    }
+
    /// ImageRef → own chunk, token_estimate=0.
    #[test]
    fn image_ref_emits_own_chunk_zero_tokens() {
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -118,16 +118,41 @@ max_context_tokens = 6000
 KEBAB() { ./target/debug/kebab --config /tmp/kebab-smoke/config.toml "$@"; }

 KB doctor                                          # 1. health check
-KB ingest                                          # 2. 워크스페이스 색인
-KB list docs                                       # 3. 색인 결과 목록
+KB ingest                                          # 2. 워크스페이스 색인 (markdown + image)
+KB list docs                                       # 3. 색인 결과 목록 (markdown + image 모두 표시)
 KB search --mode lexical "코루틴" --k 3            # 4. lexical 검색
 KB search --mode vector "memory safety" --k 3      # 5. vector 검색
 KB search --mode hybrid "Cargo workspace" --k 3    # 6. hybrid 검색
-KB inspect chunk <chunk_id>                        # 7. raw chunk 보기
-KB ask "이 KB 안에서 ..." --mode hybrid --k 5     # 8. RAG 답변 (Ollama 필요)
-KB --json ask "..." --mode hybrid                  # 9. 기계 친화 출력 검증
+KB search --mode lexical "Hello World" --k 3       # 7. image OCR 텍스트 검색 (P6-4)
+KB inspect chunk <chunk_id>                        # 8. raw chunk 보기
+KB ask "이 KB 안에서 ..." --mode hybrid --k 5     # 9. RAG 답변 (Ollama 필요)
+KB --json ask "..." --mode hybrid                  # 10. 기계 친화 출력 검증
 ```

+## P6-4 이미지 ingestion 옵션
+
+`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략):
+
+```toml
+[workspace]
+include = ["**/*.md", "**/*.png", "**/*.jpg"]
+
+[image.ocr]
+enabled = true                        # vision LM 으로 이미지 안 텍스트 전사
+engine = "ollama-vision"
+model = "gemma4:e4b"                  # 사용자 환경의 비전 모델
+endpoint = "http://192.168.0.47:11434"  # 비우면 models.llm.endpoint fallback
+languages = ["eng", "kor"]
+max_pixels = 1600                     # long-edge cap
+
+[image.caption]
+enabled = true                        # vision LM 으로 한 문장 객관 설명 생성
+max_pixels = 768
+prompt_template_version = "caption-v1"
+```
+
+이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인 사용 권장 (P7 머지 후).
+
 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작.

 ## 검증 체크리스트
@@ -138,6 +163,8 @@ KB --json ask "..." --mode hybrid                  # 9. 기계 친화 출력 검
 - `kebab search --mode hybrid` 의 `fusion_score` 가 `[0, 1]` 범위 (top-1 종종 1.0 — 두 retriever 모두 rank 1 일 때).
 - `kebab ask` JSON 응답에 `model.id` 가 config 의 모델 (`gemma4:26b` 등) 과 일치, `embedding.id = multilingual-e5-small`, `citations[].marker` 가 `[1]` / `[2]` 형식 (square-bracketed bare index).
 - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`.
+- (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc <image_doc_id>` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical "<OCR text>"` 가 그 image chunk 를 반환하면 wiring 정상.
+- (P6-4) OCR / caption 이 일부만 실패한 경우에는 `errors` 카운터가 증가하지 않음 — `kebab inspect doc <id>` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인 가능.

 ## 정리

@@ -154,5 +181,6 @@ rm -rf /tmp/kebab-smoke              # 통째로 정리
 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초.
 - `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작).
 - 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init.
+- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후).

 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조.
--- a/tasks/p6/p6-4-image-ingest-wiring.md
+++ b/tasks/p6/p6-4-image-ingest-wiring.md
@@ -3,7 +3,7 @@ phase: P6
 component: kebab-app (image ingest dispatch + chunking)
 task_id: p6-4
 title: "Wire ImageExtractor + OCR + caption into kebab-app::ingest end-to-end"
-status: planned
+status: completed
 depends_on: [p6-1, p6-2, p6-3, p1-6, p3-5]
 unblocks: []
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md