From ca0567c72b0e3d4c33c91fc20cd45bac89535773 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 07:37:56 +0000
Subject: [PATCH 1/3] =?UTF-8?q?feat(kebab-app):=20P6-4=20image=20ingest=20?=
 =?UTF-8?q?wiring=20=E2=80=94=20kebab=20ingest=20=EA=B0=80=20PNG/JPEG=20?=
 =?UTF-8?q?=EC=9E=90=EC=82=B0=EB=8F=84=20=EC=B2=98=EB=A6=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P6-1/P6-2/P6-3 의 라이브러리 (`ImageExtractor`, `OllamaVisionOcr`,
`apply_caption`) 가 그동안 CLI 에서 보이지 않던 미완 구간을 완성.
이제 `kebab ingest` 가 markdown 외에 이미지 자산을 end-to-end 로
색인하고, `kebab search` / `kebab ask` 가 OCR 텍스트 + caption 으로
이미지를 매칭/인용한다.

## kebab-app

- `[dependencies]` 에 `kebab-parse-image` 추가.
- `ingest_with_config` 진입 시 `image.ocr.enabled` / `image.caption.enabled`
  플래그에 따라 `OllamaVisionOcr` / `OllamaLanguageModel` 을 **ingest
  세션당 1회** 빌드. 자산 루프에서 trait object 로 공유.
  reqwest::blocking::Client 의 내부 Arc 덕분에 알로케이션 비용은
  자산 수와 무관.
- 두 어댑터 + ImageExtractor 를 한 묶음으로 `ImagePipeline` 구조체에
  담아 `ingest_one_asset` 매개변수 폭증 차단 (clippy::too_many_arguments
  대응).
- `ingest_one_asset` 의 markdown-only 가드를 `match media_type` 으로
  교체 — Markdown 은 기존 경로, Image(_) 는 새 `ingest_one_image_asset`
  로 분기, PDF/Audio/Other 는 종전대로 skipped.
- 신규 `ingest_one_image_asset`:
  - bytes 읽기 → `ImageExtractor::extract` (실패 시 caller 가 errors+=1)
  - `apply_ocr` (Lenient — 실패 시 ProvenanceKind::Warning 이벤트 +
    `IngestItem.warnings` 에 "ocr_failed: ...", `block.ocr` 는 None
    유지)
  - `apply_caption` (동일 Lenient 정책)
  - 기존 `MdHeadingV1Chunker` 호출 — 청커는 이미 `Block::ImageRef` 를
    단일 청크로 emit
  - 기존 persist + embed 시퀀스 그대로 (markdown 과 byte-identical)
- `lang_hint_from_doc` — `Lang("und")` 또는 빈 문자열을 None 으로
  매핑 (image-pipeline 어댑터의 build_prompt 가 "und" 를 silent drop
  하지 않도록 caller 측에서 미리).

## kebab-chunk

- `render_block_text` 의 `Block::ImageRef` 분기를 P6-4 (β) plain
  concat 정책으로 교체 — `[alt, ocr.joined, caption.text]` 를 `\n\n`
  로 join, 빈 부분은 drop. alt 가 비면 `src` 의 basename 으로 fallback
  (P6-1 contract 의 defensive guard).
- 신규 unit 테스트 `image_ref_p6_4_plain_concat_drops_empty_parts` —
  alt-only / alt+ocr / alt+caption / alt+ocr+caption / 빈 alt → src
  fallback 다섯 케이스 모두 검증.
- 기존 `image_ref_emits_own_chunk_zero_tokens` 그대로 통과 — 청커의
  per-block dispatch 는 변경 없음, text 렌더링만 갱신.

## 통합 테스트 (kebab-app/tests/image_pipeline.rs)

wiremock 으로 Ollama 를 stub. 5건:

1. OCR-only happy path — 1 PNG + ocr.enabled → 1 doc + 1 chunk emit,
   `block.ocr.joined` 가 mock 의 "Hello World 2026".
2. OCR + caption 동시 활성 — 두 필드 모두 채워지고 chunk text 에
   alt + ocr + caption 세 부분 모두 포함.
3. Lenient 실패 검증 — OCR 503 시 자산은 indexed (kind=New),
   `errors=0`, ProvenanceKind::Warning attributed to "kb-app",
   `IngestItem.warnings` 에 "ocr_failed:" 노트.
4. 양쪽 비활성 — `image.ocr.enabled=false && image.caption.enabled=false`
   여도 자산은 chunk 1개로 indexed (chunk text=filename), EXIF +
   dimensions 그대로 채워짐.
5. 결정성 (re-ingest) — 동일 PNG 두 번 ingest 시 두 번째는
   `Updated` + 동일 `doc_id`.

## SMOKE.md

`kebab search --mode lexical "Hello World"` 단계를 명령 시퀀스에
추가. `[image.ocr]` / `[image.caption]` config 절 예시 + ingest 시간
추정 (자산당 ~5-10초) 추가. "책은 P7 PDF 라인으로" 가이드를 검증
체크리스트 와 "알려진 동작" 양쪽에 박음.

## 실 Ollama 통합 검증

192.168.0.47 + gemma4:e4b 기준:

```
$ kebab --config /tmp/kebab-smoke/config.toml ingest
scanned 2  new 2  updated 0  skipped 0  errors 0  (18395 ms)

$ kebab inspect doc <image_doc_id>
parser_version: image-meta-v1
blocks: [{
  alt: "hello.png",
  ocr: "Hello World 2026",
  caption: "The image displays the text \"Hello World 2026\" in a large, black, sans-serif font."
}]

$ kebab --json ask "Hello World 텍스트가 어디에 있나?" --mode hybrid
grounded: true
citations: [{marker: "[1]", doc_path: "hello.png"}]
```

## 검증

- `cargo test --workspace --no-fail-fast -j 1` — 전부 pass
- `cargo clippy --workspace --all-targets -- -D warnings` — pass
- `cargo test -p kebab-chunk image_ref` — 2 pass (P1-5 회귀 + P6-4
  신규 unit)
- `cargo test -p kebab-app --test image_pipeline` — 5 pass

## 의존성 경계

- `kebab-app` 이 `kebab-parse-image` 추가 — spec Allowed dep 그대로.
- 새 forbidden 침범 없음 (기존 `kebab-tui` / `kebab-desktop` /
  `kebab-eval` 미참조 유지).
- 본 task 가 신설하는 image-specific 비즈니스 로직 0줄 — 모두
  `kebab-parse-image` 에 위임.

`tasks/p6/p6-4-image-ingest-wiring.md` status: planned → completed.

contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
sections: §3.4 ImageRefBlock, §6.1 ingest pipeline, §7.2
Extractor/Chunker traits, §9.1 image extraction policy.
---
 Cargo.lock                               |   4 +
 crates/kebab-app/Cargo.toml              |  11 +
 crates/kebab-app/src/lib.rs              | 321 ++++++++++++++++++--
 crates/kebab-app/tests/image_pipeline.rs | 366 +++++++++++++++++++++++
 crates/kebab-chunk/src/md_heading_v1.rs  |  97 +++++-
 docs/SMOKE.md                            |  38 ++-
 tasks/p6/p6-4-image-ingest-wiring.md     |   2 +-
 7 files changed, 807 insertions(+), 32 deletions(-)
 create mode 100644 crates/kebab-app/tests/image_pipeline.rs

diff --git a/Cargo.lock b/Cargo.lock
index 0c9ee2a..219f346 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3397,6 +3397,7 @@ dependencies = [
  "anyhow",
  "blake3",
  "dirs 5.0.1",
+ "image",
  "kebab-chunk",
  "kebab-config",
  "kebab-core",
@@ -3405,6 +3406,7 @@ dependencies = [
  "kebab-llm",
  "kebab-llm-local",
  "kebab-normalize",
+ "kebab-parse-image",
  "kebab-parse-md",
  "kebab-parse-types",
  "kebab-rag",
@@ -3417,10 +3419,12 @@ dependencies = [
  "serde_json",
  "tempfile",
  "time",
+ "tokio",
  "toml",
  "tracing",
  "tracing-appender",
  "tracing-subscriber",
+ "wiremock",
 ]
 
 [[package]]
diff --git a/crates/kebab-app/Cargo.toml b/crates/kebab-app/Cargo.toml
index e8fae60..c50ae8e 100644
--- a/crates/kebab-app/Cargo.toml
+++ b/crates/kebab-app/Cargo.toml
@@ -23,6 +23,11 @@ kebab-embed-local = { path = "../kebab-embed-local" }
 kebab-llm = { path = "../kebab-llm" }
 kebab-llm-local = { path = "../kebab-llm-local" }
 kebab-rag = { path = "../kebab-rag" }
+# P6-4: image extractor + OCR + caption adapters live here. App
+# threads them into the per-asset dispatch (see `ingest_one_asset`
+# image branch) through the crate's public API only (`ImageExtractor`,
+# `OllamaVisionOcr`, `apply_ocr`, `apply_caption`).
+kebab-parse-image = { path = "../kebab-parse-image" }
 anyhow               = { workspace = true }
 blake3               = { workspace = true }
 serde                = { workspace = true }
@@ -37,3 +42,9 @@ dirs                 = "5"
 [dev-dependencies]
 rusqlite             = { workspace = true }
 tempfile             = { workspace = true }
+# Image-pipeline integration tests use wiremock to stub Ollama for OCR
+# / caption HTTP calls. Async runtime to host the mock server only;
+# the kb-app code under test stays sync.
+wiremock             = { workspace = true }
+tokio                = { workspace = true, features = ["rt-multi-thread"] }
+image                = { version = "0.25", default-features = false, features = ["png"] }
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 15d6fc9..fff98c3 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -41,12 +41,15 @@ use serde::{Deserialize, Serialize};
 
 use kebab_chunk::MdHeadingV1Chunker;
 use kebab_core::{
-    Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
+    Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
     DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
-    EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
-    SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
+    EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
+    ParserVersion, RawAsset, SearchHit, SearchQuery, SourceConnector, SourceScope,
+    SourceUri, VectorRecord, VectorStore,
 };
+use kebab_llm_local::OllamaLanguageModel;
 use kebab_normalize::build_canonical_document;
+use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
 use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
 use kebab_source_fs::FsSourceConnector;
 
@@ -190,6 +193,35 @@ pub fn ingest_with_config(
     let parser_version = ParserVersion(KEBAB_PARSE_MD_VERSION.to_string());
     let chunk_policy = chunk_policy_from_config(&app.config);
 
+    // P6-4: build OCR / caption adapters once per ingest invocation,
+    // gated on their respective `enabled` flags. `reqwest::blocking::Client`
+    // is internally Arc-shared so reusing one instance across the asset
+    // loop is correct and cheap. Construction failure (e.g. invalid
+    // endpoint) aborts ingest fail-fast — better than silently disabling
+    // OCR/caption mid-run.
+    let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
+        Some(
+            OllamaVisionOcr::new(&app.config)
+                .context("kb-app::ingest: build OllamaVisionOcr")?,
+        )
+    } else {
+        None
+    };
+    let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
+        Some(Box::new(
+            OllamaLanguageModel::new(&app.config)
+                .context("kb-app::ingest: build OllamaLanguageModel for caption")?,
+        ))
+    } else {
+        None
+    };
+    let image_extractor = ImageExtractor::new();
+    let image_pipeline = ImagePipeline {
+        extractor: &image_extractor,
+        ocr_engine: ocr_engine.as_ref(),
+        caption_llm: caption_llm.as_deref(),
+    };
+
     // Pre-load every existing doc_id so we can label `IngestItem.kind`
     // as `New` vs `Updated` correctly. `list_documents` returns one
     // row per `(workspace_path, asset_id)` — index by the deterministic
@@ -230,6 +262,7 @@ pub fn ingest_with_config(
             embedder.as_ref(),
             vector_store.as_ref(),
             &existing_doc_ids,
+            &image_pipeline,
         );
 
         let item = match item {
@@ -438,6 +471,16 @@ type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
 /// persist, embed. Per-asset failures bubble up to the caller for
 /// labelling as `IngestItemKind::Error` — they do NOT abort the
 /// whole run.
+/// P6-4: borrowed bundle of the image-pipeline components, built once per
+/// ingest run and passed to `ingest_one_asset` as one argument. NOTE(review):
+/// the `///` lines just above document `ingest_one_asset`, not this struct — relocate.
+struct ImagePipeline<'a> {
+    extractor: &'a ImageExtractor,
+    ocr_engine: Option<&'a OllamaVisionOcr>,
+    caption_llm: Option<&'a dyn LanguageModel>,
+}
+
+#[allow(clippy::too_many_arguments)]
 fn ingest_one_asset(
     app: &App,
     asset: &RawAsset,
@@ -446,27 +489,47 @@ fn ingest_one_asset(
     embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
     vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
     existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
 ) -> anyhow::Result<kebab_core::IngestItem> {
     tracing::debug!(
         target: "kebab-app::ingest",
         path = %asset.workspace_path.0,
+        media_type = ?asset.media_type,
         "processing asset"
     );
-    // Only handle Markdown for now; other media types are P6+ work.
-    if asset.media_type != kebab_core::MediaType::Markdown {
-        return Ok(kebab_core::IngestItem {
-            kind: kebab_core::IngestItemKind::Skipped,
-            doc_id: None,
-            doc_path: asset.workspace_path.clone(),
-            asset_id: Some(asset.asset_id.clone()),
-            byte_len: Some(asset.byte_len),
-            block_count: None,
-            chunk_count: None,
-            parser_version: None,
-            chunker_version: None,
-            warnings: Vec::new(),
-            error: None,
-        });
+    // P6-4: dispatch on media_type. Markdown takes the existing
+    // parse-md / normalize path; image takes the new
+    // ImageExtractor + (optional) OCR + (optional) caption path.
+    // Anything else (PDF, audio, unknown) is skipped — the
+    // respective phases (P7 / P8) wire them in later.
+    match &asset.media_type {
+        MediaType::Markdown => { /* fall through to markdown path */ }
+        MediaType::Image(_) => {
+            return ingest_one_image_asset(
+                app,
+                asset,
+                chunk_policy,
+                embedder,
+                vector_store,
+                existing_doc_ids,
+                image_pipeline,
+            );
+        }
+        _ => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: Vec::new(),
+                error: None,
+            });
+        }
     }
 
     let path = match &asset.source_uri {
@@ -612,6 +675,228 @@ fn ingest_one_asset(
     })
 }
 
+/// P6-4: process one `MediaType::Image(_)` asset end-to-end.
+///
+/// Pipeline: read bytes → `ImageExtractor::extract` → optional
+/// `apply_ocr` → optional `apply_caption` → `MdHeadingV1Chunker` →
+/// SQLite persist → optional embed + vector upsert. `kb://` source
+/// URIs are returned as `Skipped` with a warning.
+///
+/// Failure semantics (per P6-4 spec):
+/// - read / extract / chunk / persist / embed Err → propagate (caller
+///   increments `errors`).
+/// - OCR / caption Err → log + `ProvenanceKind::Warning` event + item
+///   warning, continue. `errors` NOT incremented.
+#[allow(clippy::too_many_arguments)]
+fn ingest_one_image_asset(
+    app: &App,
+    asset: &RawAsset,
+    chunk_policy: &ChunkPolicy,
+    embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
+    vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
+    existing_doc_ids: &std::collections::HashSet<String>,
+    image_pipeline: &ImagePipeline<'_>,
+) -> anyhow::Result<kebab_core::IngestItem> {
+    let image_extractor = image_pipeline.extractor;
+    let ocr_engine = image_pipeline.ocr_engine;
+    let caption_llm = image_pipeline.caption_llm;
+    let path = match &asset.source_uri {
+        SourceUri::File(p) => p.clone(),
+        SourceUri::Kb(_) => {
+            return Ok(kebab_core::IngestItem {
+                kind: kebab_core::IngestItemKind::Skipped,
+                doc_id: None,
+                doc_path: asset.workspace_path.clone(),
+                asset_id: Some(asset.asset_id.clone()),
+                byte_len: Some(asset.byte_len),
+                block_count: None,
+                chunk_count: None,
+                parser_version: None,
+                chunker_version: None,
+                warnings: vec![
+                    "kb:// source URIs are not supported by the fs ingester".into(),
+                ],
+                error: None,
+            });
+        }
+    };
+    let bytes = std::fs::read(&path)
+        .with_context(|| format!("read image asset bytes from {}", path.display()))?;
+
+    // 1. Decode + EXIF + dimensions. ExtractContext.config carries
+    //    nothing the image extractor reads today; we pass a default
+    //    instance per the trait shape.
+    let extract_config = kebab_core::ExtractConfig::default();
+    let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
+    let ctx = ExtractContext {
+        asset,
+        workspace_root: &workspace_root,
+        config: &extract_config,
+    };
+    let mut canonical = image_extractor
+        .extract(&ctx, &bytes)
+        .context("kb-parse-image::ImageExtractor::extract")?;
+
+    // 2 + 3. Apply OCR / caption when their adapters exist. Both are
+    //        Lenient — failure is captured into Provenance Warning,
+    //        `block.ocr` / `block.caption` stay `None`. P6-4 spec
+    //        explicitly: such partial failures do NOT increment the
+    //        `errors` counter.
+    let lang_hint = lang_hint_from_doc(&canonical);
+    let mut warning_notes: Vec<String> = Vec::new();
+    if !canonical.blocks.is_empty() {
+        // P6-1 contract: image documents hold exactly one `Block::ImageRef`.
+        // Only the first block is enriched; any other shape is left untouched.
+        if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() {
+            if let Some(engine) = ocr_engine {
+                if let Err(e) = apply_ocr(
+                    engine,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &mut canonical.provenance.events,
+                ) {
+                    let note = format!("ocr_failed: {e:#}");
+                    tracing::warn!(
+                        target: "kebab-app",
+                        path = %asset.workspace_path.0,
+                        "{}",
+                        note
+                    );
+                    canonical.provenance.events.push(kebab_core::ProvenanceEvent {
+                        at: time::OffsetDateTime::now_utc(),
+                        agent: "kb-app".to_string(),
+                        kind: kebab_core::ProvenanceKind::Warning,
+                        note: Some(note.clone()),
+                    });
+                    warning_notes.push(note);
+                }
+            }
+            if let Some(llm) = caption_llm {
+                if let Err(e) = apply_caption(
+                    llm,
+                    &bytes,
+                    block,
+                    lang_hint.as_ref(),
+                    &app.config,
+                    &mut canonical.provenance.events,
+                ) {
+                    let note = format!("caption_failed: {e:#}");
+                    tracing::warn!(
+                        target: "kebab-app",
+                        path = %asset.workspace_path.0,
+                        "{}",
+                        note
+                    );
+                    canonical.provenance.events.push(kebab_core::ProvenanceEvent {
+                        at: time::OffsetDateTime::now_utc(),
+                        agent: "kb-app".to_string(),
+                        kind: kebab_core::ProvenanceKind::Warning,
+                        note: Some(note.clone()),
+                    });
+                    warning_notes.push(note);
+                }
+            }
+        }
+    }
+
+    // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
+    //    `Block::ImageRef` arm already produces a single chunk per
+    //    image (P1-5). The chunk text now follows the (β) plain-concat
+    //    contract per the kebab-chunk render_block_text update.
+    let chunks = MdHeadingV1Chunker
+        .chunk(&canonical, chunk_policy)
+        .context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
+
+    // 5. Persist (asset → document → blocks → chunks), then embed.
+    app.sqlite
+        .put_asset_with_bytes(asset, &bytes)
+        .context("DocumentStore::put_asset_with_bytes (image)")?;
+    app.sqlite
+        .put_document(&canonical)
+        .context("DocumentStore::put_document (image)")?;
+    app.sqlite
+        .put_blocks(&canonical.doc_id, &canonical.blocks)
+        .context("DocumentStore::put_blocks (image)")?;
+    app.sqlite
+        .put_chunks(&canonical.doc_id, &chunks)
+        .context("DocumentStore::put_chunks (image)")?;
+
+    if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
+        && !chunks.is_empty()
+    {
+        let inputs: Vec<EmbeddingInput<'_>> = chunks
+            .iter()
+            .map(|c| EmbeddingInput {
+                text: c.text.as_str(),
+                kind: EmbeddingKind::Document,
+            })
+            .collect();
+        let vectors = emb
+            .embed(&inputs)
+            .context("Embedder::embed (image chunks)")?;
+        let model_id = emb.model_id();
+        let model_version = emb.model_version();
+        let dimensions = emb.dimensions();
+        let records: Vec<VectorRecord> = chunks
+            .iter()
+            .zip(vectors)
+            .map(|(c, v)| VectorRecord {
+                embedding_id: kebab_core::id_for_embedding(
+                    &c.chunk_id,
+                    &model_id,
+                    &model_version,
+                    dimensions,
+                ),
+                chunk_id: c.chunk_id.clone(),
+                vector: v,
+                doc_id: canonical.doc_id.clone(),
+                text: c.text.clone(),
+                heading_path: c.heading_path.clone(),
+                model_id: model_id.clone(),
+                model_version: model_version.clone(),
+                dimensions,
+            })
+            .collect();
+        vec_store
+            .upsert(&records)
+            .context("VectorStore::upsert (image)")?;
+    }
+
+    let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
+        kebab_core::IngestItemKind::Updated
+    } else {
+        kebab_core::IngestItemKind::New
+    };
+
+    Ok(kebab_core::IngestItem {
+        kind,
+        doc_id: Some(canonical.doc_id.clone()),
+        doc_path: asset.workspace_path.clone(),
+        asset_id: Some(asset.asset_id.clone()),
+        byte_len: Some(asset.byte_len),
+        block_count: u32::try_from(canonical.blocks.len()).ok(),
+        chunk_count: u32::try_from(chunks.len()).ok(),
+        parser_version: Some(canonical.parser_version.clone()),
+        chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
+        warnings: warning_notes,
+        error: None,
+    })
+}
+
+/// Returns the document's language as an OCR / caption prompt hint, or
+/// `None` when it is empty or `"und"` (undetermined). The filtering is
+/// done here, on the caller side, so the image-pipeline adapters are
+/// never handed a placeholder language.
+fn lang_hint_from_doc(doc: &CanonicalDocument) -> Option<Lang> {
+    let s = doc.lang.0.as_str();
+    if s.is_empty() || s == "und" {
+        None
+    } else {
+        Some(doc.lang.clone())
+    }
+}
+
 /// Convenience: end byte of the frontmatter region (or 0 when absent).
 fn fm_span_end(span: Option<kebab_parse_md::FrontmatterSpan>) -> usize {
     span.map(|s| s.end).unwrap_or(0)
diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs
new file mode 100644
index 0000000..60dea3e
--- /dev/null
+++ b/crates/kebab-app/tests/image_pipeline.rs
@@ -0,0 +1,366 @@
+//! P6-4 image ingest wiring — end-to-end integration.
+//!
+//! Each test spins up a `TempDir` workspace + writes one PNG fixture +
+//! routes OCR / caption HTTP calls through a `wiremock` server that
+//! impersonates Ollama's `/api/generate` endpoint. The kb-app code
+//! under test is sync; the wiremock server is async, so test bodies
+//! drive blocking work via `tokio::task::spawn_blocking`.
+
+mod common;
+
+use std::path::Path;
+
+use common::TestEnv;
+use kebab_config::Config;
+use serde_json::json;
+use tokio::task::spawn_blocking;
+use wiremock::matchers::{method, path};
+use wiremock::{Mock, MockServer, ResponseTemplate};
+
+// ── Fixture helpers ──────────────────────────────────────────────────────
+
+/// Writes a solid-red 100×50 PNG to `<root>/<name>` and returns its path.
+/// Panics if the file cannot be written. NOTE(review): the claim that this
+/// size skips the default downscale is not verifiable from this file.
+fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
+    use image::{ImageBuffer, Rgb};
+    let img: ImageBuffer<Rgb<u8>, _> =
+        ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
+    let path = root.join(name);
+    img.save(&path).expect("write PNG fixture");
+    path
+}
+
+fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config {
+    let mut cfg = env.config.clone();
+    // Add PNGs to the workspace include globs so the fixture gets scanned.
+    cfg.workspace
+        .include
+        .push("**/*.png".to_string());
+    cfg.image.ocr.enabled = true;
+    cfg.image.ocr.endpoint = Some(mock_endpoint.to_string());
+    cfg.image.ocr.model = "vision-mock:1b".to_string();
+    cfg.image.ocr.max_pixels = 512;
+    cfg.image.caption.enabled = false; // the OCR+caption test turns this on itself
+    cfg.models.llm.endpoint = mock_endpoint.to_string();
+    cfg.models.llm.model = "vision-mock:1b".to_string();
+    cfg
+}
+
+// ── 1. Happy path: OCR-only ingest ───────────────────────────────────────
+
+/// One PNG asset + OCR enabled (caption off) → ingest reports 1 new doc with
+/// 1 chunk; the stored `ImageRef` block carries the mock's OCR text, no caption.
+#[tokio::test]
+async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "model": "vision-mock:1b",
+            "response": "Hello World 2026",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    let png = write_red_png(&env.workspace_root, "diagram.png");
+    eprintln!("PNG written to {}", png.display());
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+    let cfg_clone = cfg.clone();
+    let env_workspace = env.workspace_root.clone();
+    let env_scope = env.scope();
+
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, env_scope, false)
+            .expect("image ingest must succeed")
+    })
+    .await
+    .expect("blocking task panicked");
+
+    // Counters: at least the PNG was scanned (the fixture workspace may
+    // hold other assets too) and nothing was recorded as an error.
+    assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
+    assert_eq!(report.errors, 0, "no errors on lenient OCR path");
+
+    // Locate the image doc in the report items.
+    let items = report.items.expect("items present (summary_only=false)");
+    let img_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .expect("image doc item must be present");
+    assert_eq!(
+        img_item.kind,
+        kebab_core::IngestItemKind::New,
+        "image asset must be classified New on first ingest"
+    );
+    assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
+
+    // Inspect the stored document's block via kb-app's inspect_doc facade.
+    let doc_id = img_item.doc_id.clone().expect("image doc id");
+    let doc = kebab_app::inspect_doc_with_config(cfg.clone(), &doc_id)
+        .expect("inspect_doc returns the image document");
+    let block = match doc.blocks.first() {
+        Some(kebab_core::Block::ImageRef(b)) => b,
+        other => panic!("expected ImageRef, got {other:?}"),
+    };
+    assert!(block.ocr.is_some(), "block.ocr populated by apply_ocr");
+    assert_eq!(
+        block.ocr.as_ref().unwrap().joined,
+        "Hello World 2026",
+        "OCR text from mock"
+    );
+    assert!(
+        block.caption.is_none(),
+        "caption disabled in cfg → block.caption stays None"
+    );
+
+    // Sanity: the image doc is also visible through kb-app's list_docs
+    // facade, i.e. it was persisted rather than only reported.
+    let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
+        .expect("list_docs");
+    assert!(
+        summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
+        "image doc must appear in list_docs"
+    );
+
+    drop(env_workspace); // NOTE(review): cloned root path, not the TempDir — presumably `env` keeps it alive
+    drop(env);
+}
+
+// ── 2. OCR + caption together ────────────────────────────────────────────
+
+/// Both OCR and caption enabled. The mock returns the same JSON body
+/// for every `/api/generate` POST, so the OCR and caption calls are
+/// indistinguishable here and both receive "shared mock body". We only
+/// verify that `block.ocr` and `block.caption` are both populated; the
+/// rendered chunk text is not inspected by this test.
+#[tokio::test]
+async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "shared mock body",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let mut cfg = cfg_with_image_pipeline(&env, &server.uri());
+    cfg.image.caption.enabled = true;
+    cfg.image.caption.max_pixels = 384;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest must succeed with both OCR+caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_some(), "OCR populated");
+    assert!(block.caption.is_some(), "caption populated");
+    drop(env);
+}
+
+// ── 3. Lenient failure: OCR Ollama 503 → asset still indexed ─────────────
+
+/// OCR endpoint returns 503. Spec contract: image is still indexed as New
+/// with one chunk, `block.ocr = None`, the item carries a warning, Provenance
+/// has a kb-app Warning noting `ocr_failed`, `errors` NOT incremented.
+#[tokio::test]
+async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(503))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "broken.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on lenient OCR failure")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(
+        report.errors, 0,
+        "lenient OCR failure must NOT increment errors counter (spec)"
+    );
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("broken.png"))
+        .expect("asset still indexed despite OCR failure");
+    assert_eq!(img_item.kind, kebab_core::IngestItemKind::New);
+    assert_eq!(img_item.chunk_count, Some(1));
+    assert!(
+        !img_item.warnings.is_empty(),
+        "lenient OCR failure must surface a warning on the IngestItem"
+    );
+
+    let doc_id = img_item.doc_id.clone().unwrap();
+    let doc = kebab_app::inspect_doc_with_config(cfg, &doc_id).unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_none(), "block.ocr stays None on OCR failure");
+    let warning = doc
+        .provenance
+        .events
+        .iter()
+        .find(|e| e.kind == kebab_core::ProvenanceKind::Warning && e.agent == "kb-app")
+        .expect("Provenance Warning attributed to kb-app");
+    let note = warning.note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("ocr_failed"),
+        "warning note must describe OCR failure: {note}"
+    );
+}
+
+// ── 4. Both image.ocr.enabled and image.caption.enabled = false ──────────
+
+/// With OCR and caption both disabled the image is still extracted and
+/// chunked: expects one chunk, `ocr`/`caption` left `None`, and 100x50
+/// `dimensions`. The filename text fallback and EXIF are not asserted.
+#[tokio::test]
+async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
+    // No mock server needed — neither HTTP path is touched.
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "raw.png");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string()); // add PNGs to the include globs
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone(); // `cfg` itself is reused by inspect_doc_with_config below
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest with no OCR/caption")
+    })
+    .await
+    .expect("task");
+
+    assert_eq!(report.errors, 0);
+    let img_item = report
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("raw.png"))
+        .unwrap();
+    assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
+    let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
+        .unwrap();
+    let block = match &doc.blocks[0] {
+        kebab_core::Block::ImageRef(b) => b,
+        _ => unreachable!(),
+    };
+    assert!(block.ocr.is_none() && block.caption.is_none());
+    // Dimensions metadata is still populated; EXIF is not checked here.
+    let dims = doc
+        .metadata
+        .user
+        .get("dimensions")
+        .and_then(|v: &serde_json::Value| v.as_object())
+        .expect("dimensions object present");
+    assert_eq!(
+        dims.get("w").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(100)
+    );
+    assert_eq!(
+        dims.get("h").and_then(|v: &serde_json::Value| v.as_u64()),
+        Some(50)
+    );
+}
+
+// ── 5. Determinism: re-ingest produces identical doc_id / chunk_id ───────
+
+/// Idempotency contract — running the same ingest twice should mark
+/// the asset Updated on the second run with byte-identical IDs.
+#[tokio::test]
+async fn re_ingest_image_produces_updated_with_same_doc_id() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST")) // every POST /api/generate gets the same fixed body
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "stable",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let env = TestEnv::lexical_only();
+    write_red_png(&env.workspace_root, "diagram.png");
+    let cfg = cfg_with_image_pipeline(&env, &server.uri());
+
+    let scope = env.scope();
+    let cfg1 = cfg.clone(); // each run moves its own config + scope into the closure
+    let cfg2 = cfg.clone();
+    let scope1 = scope.clone();
+    let scope2 = scope.clone();
+
+    let r1 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
+    })
+    .await
+    .unwrap();
+    let r2 = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
+    })
+    .await
+    .unwrap();
+
+    let id1 = r1 // doc_id reported for diagram.png by the first run
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap()
+        .doc_id
+        .clone()
+        .unwrap();
+    let img2 = r2 // the same asset's item from the second run
+        .items
+        .as_ref()
+        .unwrap()
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("diagram.png"))
+        .unwrap();
+    assert_eq!(img2.kind, kebab_core::IngestItemKind::Updated);
+    assert_eq!(img2.doc_id.as_ref().unwrap(), &id1); // only doc_id is compared; chunk ids are not checked
+}
diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs
index f29a4d4..1279ebf 100644
--- a/crates/kebab-chunk/src/md_heading_v1.rs
+++ b/crates/kebab-chunk/src/md_heading_v1.rs
@@ -381,17 +381,41 @@ fn render_block_text(b: &Block) -> String {
             }
             s
         }
-        // ImageRef text portion = alt (per task spec). Fall back to
-        // model caption text if alt is empty.
+        // ImageRef text portion follows the P6-4 (β) plain-concat
+        // contract — `[alt, ocr.joined, caption.text]` joined by
+        // `\n\n`, dropping empty parts. Filename fallback for empty
+        // alt keeps lexical search hits on filenames working even when
+        // P6-1's filename auto-fill is bypassed.
         Block::ImageRef(i) => {
-            if !i.alt.is_empty() {
+            let alt = if !i.alt.is_empty() {
                 i.alt.clone()
             } else {
-                i.caption
-                    .as_ref()
-                    .map(|c| c.text.clone())
-                    .unwrap_or_default()
-            }
+                // P6-1 falls back to filename so this branch is
+                // defensive — keep it lest a future test fixture or
+                // synthetic block path skip the auto-fill.
+                i.src
+                    .rsplit('/')
+                    .next()
+                    .filter(|s| !s.is_empty())
+                    .unwrap_or("[image]")
+                    .to_string()
+            };
+            let ocr = i
+                .ocr
+                .as_ref()
+                .map(|o| o.joined.as_str())
+                .unwrap_or("");
+            let cap = i
+                .caption
+                .as_ref()
+                .map(|c| c.text.as_str())
+                .unwrap_or("");
+            [alt.as_str(), ocr, cap]
+                .iter()
+                .filter(|s| !s.is_empty())
+                .copied()
+                .collect::<Vec<_>>()
+                .join("\n\n")
         }
         // AudioRef has no caption preview yet (transcript joins land
         // in P8). Empty string per task spec.
@@ -700,6 +724,63 @@ mod tests {
         }
     }
 
+    /// P6-4 (β) plain concatenation — alt + ocr.joined + caption.text
+    /// joined by `\n\n`, dropping empty parts. Covers alt-only, alt+ocr,
+    /// alt+caption, alt+ocr+caption, and the empty-alt `src` fallback.
+    #[test]
+    fn image_ref_p6_4_plain_concat_drops_empty_parts() {
+        use kebab_core::{ModelCaption, OcrText};
+
+        let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| {
+            Block::ImageRef(ImageRefBlock {
+                common: common_for("imageref", &[], 0, span(1, 1)),
+                asset_id: None,
+                src: "img.png".into(), // expected text for the empty-alt case at the end
+                alt: alt.into(),
+                ocr: ocr.map(|t| OcrText {
+                    joined: t.into(),
+                    regions: vec![],
+                    engine: "test".into(),
+                    engine_version: "v1".into(),
+                }),
+                caption: cap.map(|t| ModelCaption {
+                    text: t.into(),
+                    model: "m".into(),
+                    model_version: "v".into(),
+                }),
+            })
+        };
+
+        // alt-only — no separators between empty parts.
+        assert_eq!(render_block_text(&mk("photo.png", None, None)), "photo.png");
+
+        // alt + ocr — joined by exactly one `\n\n`.
+        assert_eq!(
+            render_block_text(&mk("photo.png", Some("Hello"), None)),
+            "photo.png\n\nHello"
+        );
+
+        // alt + caption.
+        assert_eq!(
+            render_block_text(&mk("photo.png", None, Some("a red square"))),
+            "photo.png\n\na red square"
+        );
+
+        // alt + ocr + caption — three parts joined by `\n\n` each.
+        assert_eq!(
+            render_block_text(&mk("photo.png", Some("Hello"), Some("a red square"))),
+            "photo.png\n\nHello\n\na red square"
+        );
+
+        // empty alt — falls back to filename derived from `src`.
+        let blk = mk("", Some("text from image"), None);
+        assert_eq!(
+            render_block_text(&blk),
+            "img.png\n\ntext from image",
+            "empty alt must fall back to the basename of `src`"
+        );
+    }
+
     /// ImageRef → own chunk, token_estimate=0.
     #[test]
     fn image_ref_emits_own_chunk_zero_tokens() {
diff --git a/docs/SMOKE.md b/docs/SMOKE.md
index e3cdf79..3a17012 100644
--- a/docs/SMOKE.md
+++ b/docs/SMOKE.md
@@ -118,16 +118,41 @@ max_context_tokens = 6000
 KEBAB() { ./target/debug/kebab --config /tmp/kebab-smoke/config.toml "$@"; }
 
 KB doctor                                          # 1. health check
-KB ingest                                          # 2. 워크스페이스 색인
-KB list docs                                       # 3. 색인 결과 목록
+KB ingest                                          # 2. 워크스페이스 색인 (markdown + image)
+KB list docs                                       # 3. 색인 결과 목록 (markdown + image 모두 표시)
 KB search --mode lexical "코루틴" --k 3            # 4. lexical 검색
 KB search --mode vector "memory safety" --k 3      # 5. vector 검색
 KB search --mode hybrid "Cargo workspace" --k 3    # 6. hybrid 검색
-KB inspect chunk <chunk_id>                        # 7. raw chunk 보기
-KB ask "이 KB 안에서 ..." --mode hybrid --k 5     # 8. RAG 답변 (Ollama 필요)
-KB --json ask "..." --mode hybrid                  # 9. 기계 친화 출력 검증
+KB search --mode lexical "Hello World" --k 3       # 7. image OCR 텍스트 검색 (P6-4)
+KB inspect chunk <chunk_id>                        # 8. raw chunk 보기
+KB ask "이 KB 안에서 ..." --mode hybrid --k 5     # 9. RAG 답변 (Ollama 필요)
+KB --json ask "..." --mode hybrid                  # 10. 기계 친화 출력 검증
 ```
 
+## P6-4 이미지 ingestion 옵션
+
+`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략):
+
+```toml
+[workspace]
+include = ["**/*.md", "**/*.png", "**/*.jpg"]
+
+[image.ocr]
+enabled = true                        # vision LM 으로 이미지 안 텍스트 전사
+engine = "ollama-vision"
+model = "gemma4:e4b"                  # 사용자 환경의 비전 모델
+endpoint = "http://192.168.0.47:11434"  # 비우면 models.llm.endpoint fallback
+languages = ["eng", "kor"]
+max_pixels = 1600                     # long-edge cap
+
+[image.caption]
+enabled = true                        # vision LM 으로 한 문장 객관 설명 생성
+max_pixels = 768
+prompt_template_version = "caption-v1"
+```
+
+이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후).
+
 각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작.
 
 ## 검증 체크리스트
@@ -138,6 +163,8 @@ KB --json ask "..." --mode hybrid                  # 9. 기계 친화 출력 검
 - `kebab search --mode hybrid` 의 `fusion_score` 가 `[0, 1]` 범위 (top-1 종종 1.0 — 두 retriever 모두 rank 1 일 때).
 - `kebab ask` JSON 응답에 `model.id` 가 config 의 모델 (`gemma4:26b` 등) 과 일치, `embedding.id = multilingual-e5-small`, `citations[].marker` 가 `[1]` / `[2]` 형식 (square-bracketed bare index).
 - 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`.
+- (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc <image_doc_id>` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical "<OCR text>"` 가 그 image chunk 를 반환하면 wiring 정상.
+- OCR / caption 부분 실패는 `errors` 카운터 미증가 — ingest 리포트 항목의 `warnings` (`IngestItem.warnings`), `kebab inspect doc <id>` 의 Provenance Warning 이벤트, 또는 `--debug` 로그에서 확인.
 
 ## 정리
 
@@ -154,5 +181,6 @@ rm -rf /tmp/kebab-smoke              # 통째로 정리
 - `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50–100 토큰에 20–55초.
 - `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작).
 - 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 의 `OnceLock` 으로 세션 동안 한 번만 init.
+- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후).
 
 자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조.
diff --git a/tasks/p6/p6-4-image-ingest-wiring.md b/tasks/p6/p6-4-image-ingest-wiring.md
index 716a2fb..04f87db 100644
--- a/tasks/p6/p6-4-image-ingest-wiring.md
+++ b/tasks/p6/p6-4-image-ingest-wiring.md
@@ -3,7 +3,7 @@ phase: P6
 component: kebab-app (image ingest dispatch + chunking)
 task_id: p6-4
 title: "Wire ImageExtractor + OCR + caption into kebab-app::ingest end-to-end"
-status: planned
+status: completed
 depends_on: [p6-1, p6-2, p6-3, p1-6, p3-5]
 unblocks: []
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
-- 
2.49.1


From 469a1a34ecde78f4c7236cf52ab4d7f12d2013fd Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 07:42:44 +0000
Subject: [PATCH 2/3] =?UTF-8?q?review(p6-4):=20=ED=9A=8C=EC=B0=A8=201=20?=
 =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- src/lib.rs:
  • `ingest_one_asset` 의 doc-comment 가 새 `ImagePipeline` struct 와
    합쳐지던 (rustdoc 가 두 doc 을 struct 의 것으로 합치던) 문제
    해소 — 두 doc-comment 위치 교환 + 빈 줄 분리.
  • `if let Some(Block::ImageRef(...)) = blocks.first_mut()` 의
    silent-skip 분기를 `match` 의 `other` arm 으로 명시 — 미래에
    P6-1 contract 가 깨지면 `tracing::warn!` + Provenance Warning +
    `IngestItem.warnings` 에 "ImageDispatchAnomaly" 노트로 즉시
    가시화. 운영 디버깅 단서 제공.
  • OCR 실패 분기 + caption 실패 분기의 ~25줄 boilerplate 를
    `record_image_analysis_failure` 헬퍼로 추출 — 두 호출이 한 줄로
    줄고 미래 ProvenanceEvent 필드 변경이 한 곳에서 끝남.
  • 분석 단계 Warning 이벤트가 fn 진입 시 캡처한 단일
    `OffsetDateTime::now_utc()` 를 공유 — spec Risks/notes 의
    "Determinism stress: must not introduce a second `now()` call
    between extract and apply_ocr/caption" 약속 회복.
  • 경고 라벨을 markdown 경로의 `WarningKind` 컨벤션 (`{kind}: {note}`)
    에 맞춤 — `"ocr_failed: ..."` → `"OcrFailed: ..."`,
    `"caption_failed: ..."` → `"CaptionFailed: ..."`. 같은 wire
    필드 (`IngestItem.warnings`) 가 두 갈래의 다른 형식을 갖던
    inconsistency 해소.
- tests/image_pipeline.rs:
  • 회귀 테스트의 "ocr_failed" assertion 을 "OcrFailed" 로 갱신.

cargo test -p kebab-app -p kebab-chunk — 전부 pass.
cargo clippy --workspace --all-targets -- -D warnings — pass.
---
 crates/kebab-app/src/lib.rs              | 137 +++++++++++++++--------
 crates/kebab-app/tests/image_pipeline.rs |   5 +-
 2 files changed, 96 insertions(+), 46 deletions(-)

diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index fff98c3..3cf31fc 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -467,10 +467,6 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
 /// `<… as JobRepo>` to be explicit.
 type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
 
-/// Process a single asset: read bytes, parse, normalize, chunk,
-/// persist, embed. Per-asset failures bubble up to the caller for
-/// labelling as `IngestItemKind::Error` — they do NOT abort the
-/// whole run.
 /// P6-4: borrowed bundle of the three image-pipeline components built
 /// once per ingest invocation. Threaded through `ingest_one_asset` so
 /// the dispatch does not need ten separate parameters.
@@ -480,6 +476,10 @@ struct ImagePipeline<'a> {
     caption_llm: Option<&'a dyn LanguageModel>,
 }
 
+/// Process a single asset: read bytes, parse, normalize, chunk,
+/// persist, embed. Per-asset failures bubble up to the caller for
+/// labelling as `IngestItemKind::Error` — they do NOT abort the
+/// whole run.
 #[allow(clippy::too_many_arguments)]
 fn ingest_one_asset(
     app: &App,
@@ -742,62 +742,78 @@ fn ingest_one_image_asset(
     //        `block.ocr` / `block.caption` stay `None`. P6-4 spec
     //        explicitly: such partial failures do NOT increment the
     //        `errors` counter.
+    //
+    //        Determinism stress (per spec Risks): the per-document
+    //        Provenance timestamps for any analysis-stage Warning
+    //        events share a single `now_utc()` reading taken once
+    //        here, mirroring `kb-normalize::build_canonical_document`.
     let lang_hint = lang_hint_from_doc(&canonical);
+    let now = time::OffsetDateTime::now_utc();
     let mut warning_notes: Vec<String> = Vec::new();
-    if !canonical.blocks.is_empty() {
-        // P6-1 contract: image documents always have exactly one
-        // `Block::ImageRef`. Defensive match keeps us forward-compatible.
-        if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() {
-            if let Some(engine) = ocr_engine {
-                if let Err(e) = apply_ocr(
+    match canonical.blocks.first_mut() {
+        Some(Block::ImageRef(block)) => {
+            if let Some(engine) = ocr_engine
+                && let Err(e) = apply_ocr(
                     engine,
                     &bytes,
                     block,
                     lang_hint.as_ref(),
                     &mut canonical.provenance.events,
-                ) {
-                    let note = format!("ocr_failed: {e:#}");
-                    tracing::warn!(
-                        target: "kebab-app",
-                        path = %asset.workspace_path.0,
-                        "{}",
-                        note
-                    );
-                    canonical.provenance.events.push(kebab_core::ProvenanceEvent {
-                        at: time::OffsetDateTime::now_utc(),
-                        agent: "kb-app".to_string(),
-                        kind: kebab_core::ProvenanceKind::Warning,
-                        note: Some(note.clone()),
-                    });
-                    warning_notes.push(note);
-                }
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "OcrFailed",
+                    e,
+                    now,
+                );
             }
-            if let Some(llm) = caption_llm {
-                if let Err(e) = apply_caption(
+            if let Some(llm) = caption_llm
+                && let Err(e) = apply_caption(
                     llm,
                     &bytes,
                     block,
                     lang_hint.as_ref(),
                     &app.config,
                     &mut canonical.provenance.events,
-                ) {
-                    let note = format!("caption_failed: {e:#}");
-                    tracing::warn!(
-                        target: "kebab-app",
-                        path = %asset.workspace_path.0,
-                        "{}",
-                        note
-                    );
-                    canonical.provenance.events.push(kebab_core::ProvenanceEvent {
-                        at: time::OffsetDateTime::now_utc(),
-                        agent: "kb-app".to_string(),
-                        kind: kebab_core::ProvenanceKind::Warning,
-                        note: Some(note.clone()),
-                    });
-                    warning_notes.push(note);
-                }
+                )
+            {
+                record_image_analysis_failure(
+                    asset,
+                    &mut canonical.provenance.events,
+                    &mut warning_notes,
+                    "CaptionFailed",
+                    e,
+                    now,
+                );
             }
         }
+        // P6-1 contract: image documents always have exactly one
+        // `Block::ImageRef`. If a future task introduces multi-block
+        // image documents the silent-skip would mask a real bug, so
+        // this arm surfaces the divergence loudly.
+        other => {
+            tracing::warn!(
+                target: "kebab-app",
+                path = %asset.workspace_path.0,
+                blocks = canonical.blocks.len(),
+                "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
+                other.map(|b| std::mem::discriminant(b))
+            );
+            canonical.provenance.events.push(kebab_core::ProvenanceEvent {
+                at: now,
+                agent: "kb-app".to_string(),
+                kind: kebab_core::ProvenanceKind::Warning,
+                note: Some(
+                    "image document missing leading ImageRef block — OCR/caption skipped"
+                        .to_string(),
+                ),
+            });
+            warning_notes
+                .push("ImageDispatchAnomaly: missing ImageRef block".to_string());
+        }
     }
 
     // 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
@@ -884,6 +900,39 @@ fn ingest_one_image_asset(
     })
 }
 
+/// Centralised handling for image-analysis (OCR / caption) failures.
+/// Logs via `tracing::warn!` (tagged with `asset.workspace_path`), appends
+/// a `ProvenanceKind::Warning` event to `events` stamped with the caller's
+/// `now` and agent `"kb-app"`, and pushes the same `{kind_label}: {err:#}`
+/// note onto `warning_notes` — which the caller is expected to surface as
+/// `IngestItem.warnings`, in the markdown path's `<WarningKind>: <note>`
+/// shape (see kb-normalize's `warning_agent`).
+fn record_image_analysis_failure(
+    asset: &RawAsset,
+    events: &mut Vec<kebab_core::ProvenanceEvent>,
+    warning_notes: &mut Vec<String>,
+    kind_label: &str,
+    err: anyhow::Error,
+    now: time::OffsetDateTime,
+) {
+    let detail = format!("{err:#}"); // alternate form: error plus its chain of causes
+    let note = format!("{kind_label}: {detail}");
+    tracing::warn!(
+        target: "kebab-app",
+        path = %asset.workspace_path.0,
+        "image analysis stage {} failed: {}",
+        kind_label,
+        detail
+    );
+    events.push(kebab_core::ProvenanceEvent {
+        at: now,
+        agent: "kb-app".to_string(),
+        kind: kebab_core::ProvenanceKind::Warning,
+        note: Some(note.clone()), // identical text goes to provenance and to warning_notes
+    });
+    warning_notes.push(note);
+}
+
 /// Pull the BCP-47 language hint from the canonical document. P6-1
 /// stamps `Lang("und")` by default; image-pipeline OCR / caption
 /// adapters special-case "und" so the hint is intentionally dropped
diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs
index 60dea3e..2dfa557 100644
--- a/crates/kebab-app/tests/image_pipeline.rs
+++ b/crates/kebab-app/tests/image_pipeline.rs
@@ -244,8 +244,9 @@ async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
         .expect("Provenance Warning attributed to kb-app");
     let note = warning.note.as_deref().unwrap_or("");
     assert!(
-        note.contains("ocr_failed"),
-        "warning note must describe OCR failure: {note}"
+        note.contains("OcrFailed"),
+        "warning note must describe OCR failure with OcrFailed prefix \
+         (markdown-style WarningKind format): {note}"
     );
 }
 
-- 
2.49.1


From 6e4884aff802a5fc2b9784614761b817ab7d4691 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 08:13:41 +0000
Subject: [PATCH 3/3] =?UTF-8?q?fix(kebab-app):=20IngestReport.errors=20dou?=
 =?UTF-8?q?ble-count=20regression=20=E2=80=94=20increment=20only=20in=20`m?=
 =?UTF-8?q?atch=20item.kind=20{=20Error=20=3D>=20...=20}`=20arm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

수동 스모크 검증 (12 PNG + 손상 PNG) 중 발견. `IngestReport.errors`
가 Err 를 반환한 자산 한 장당 2회 증가해서 `scanned = new + updated +
skipped + errors` invariant 가 깨짐:

- `garbage.png` (이미지 아닌 바이트, .png 확장자만) 1장 + 정상 자산
  3장 → 기대 `scanned=4 errors=1`, 실제 `scanned=4 errors=2`.
- 원인: `match item { Err(e) => { error_count += 1; IngestItem {...} }
  }` 에서 1회 증가 후, 직후 `match item.kind { Error => { error_count
  += 1 } }` arm 에서 또 1회 증가.
- markdown 경로의 `ingest_one_asset` Err 가 거의 발생 안 해서 P6-4
  머지 전까지 표면화 안 됐던 기존 결함. 이미지 dispatch 가 garbage
  bytes 를 Err 로 흘려보내며 처음으로 노출.

수정: `Err(e)` 분기의 `error_count.saturating_add(1)` 제거. 단일
증가 지점은 `match item.kind { Error => ... }` arm. 코멘트로 의도
명시.

회귀 테스트 추가 (`tests/image_pipeline.rs`):
- `garbage_png_increments_errors_counter_exactly_once` — 정확히 1
  증가 + `scanned == new + updated + skipped + errors` invariant
  검증.

검증 — release binary + 실 Ollama (192.168.0.47 / gemma4:e4b):

```
$ kebab --json ingest
scanned=4 new=3 updated=0 skipped=0 errors=1
  error    garbage.png       (extract Err — unrecognised format)
  new      intro.md
  new      normal.png        (OCR success)
  new      truncated.png     (OcrFailed warning, asset still indexed)
```

cargo test --workspace --no-fail-fast -j 1 — 전부 pass.
cargo clippy --workspace --all-targets -- -D warnings — pass.
cargo test -p kebab-app --test image_pipeline — 6 pass (5 기존 + 1 회귀).
---
 crates/kebab-app/src/lib.rs              |  7 ++-
 crates/kebab-app/tests/image_pipeline.rs | 56 +++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 3cf31fc..d916f7e 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -274,7 +274,12 @@ pub fn ingest_with_config(
                     error = %e,
                     "kb-app::ingest: per-file fatal"
                 );
-                error_count = error_count.saturating_add(1);
+                // Note: `error_count += 1` happens below in the
+                // `match item.kind { Error => ... }` arm — incrementing
+                // here too would double-count (a regression first
+                // surfaced by P6-4 image dispatch where Err returns
+                // are common; markdown rarely propagated Err so the
+                // bug went unnoticed).
                 kebab_core::IngestItem {
                     kind: kebab_core::IngestItemKind::Error,
                     doc_id: None,
diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs
index 2dfa557..4d12a8b 100644
--- a/crates/kebab-app/tests/image_pipeline.rs
+++ b/crates/kebab-app/tests/image_pipeline.rs
@@ -307,7 +307,61 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
     );
 }
 
-// ── 5. Determinism: re-ingest produces identical doc_id / chunk_id ───────
+// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ──────────
+
+/// `kebab-source-fs` classifies a `.png` extension as
+/// `MediaType::Image(Png)` regardless of content. When the bytes don't
+/// decode as any image format, `ImageExtractor::extract` returns Err
+/// and the asset must be classified as `IngestItemKind::Error` with
+/// the `errors` counter incremented **exactly once** (regression for
+/// the double-count bug surfaced during P6-4 manual smoke).
+#[tokio::test]
+async fn garbage_png_increments_errors_counter_exactly_once() {
+    // No mock server needed — extract fails before any HTTP call.
+    let env = TestEnv::lexical_only();
+    // Single non-image asset with .png extension.
+    std::fs::write(
+        env.workspace_root.join("garbage.png"),
+        b"this is not an image at all",
+    )
+    .expect("write garbage fixture");
+    let mut cfg = env.config.clone();
+    cfg.workspace.include.push("**/*.png".to_string()); // add PNGs to the include globs
+    cfg.image.ocr.enabled = false;
+    cfg.image.caption.enabled = false;
+
+    let cfg_clone = cfg.clone();
+    let scope = env.scope();
+    let report = spawn_blocking(move || {
+        kebab_app::ingest_with_config(cfg_clone, scope, false)
+            .expect("ingest does not abort on per-asset failure")
+    })
+    .await
+    .expect("task");
+
+    // Exactly-once: scanned counts the asset, errors counts it once,
+    // and (scanned == new + updated + skipped + errors) holds.
+    assert_eq!(
+        report.errors, 1,
+        "garbage PNG must increment errors exactly once, not twice (double-count regression)"
+    );
+    assert_eq!(
+        report.scanned,
+        report.new + report.updated + report.skipped + report.errors,
+        "counter sum must equal scanned — invariant of the IngestReport contract"
+    );
+
+    // The garbage asset is reported as an Error item with an error string set.
+    let items = report.items.expect("items present");
+    let err_item = items
+        .iter()
+        .find(|i| i.doc_path.0.ends_with("garbage.png"))
+        .expect("garbage item present");
+    assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error);
+    assert!(err_item.error.is_some(), "Error item carries error string");
+}
+
+// ── 6. Determinism: re-ingest produces identical doc_id / chunk_id ───────
 
 /// Idempotency contract — running the same ingest twice should mark
 /// the asset Updated on the second run with byte-identical IDs.
-- 
2.49.1