From 4ed5536c921d26b5523670c918e5cfa6247cb6c0 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Sat, 2 May 2026 05:38:24 +0000
Subject: [PATCH] =?UTF-8?q?feat(kebab-parse-image):=20P6-2=20OCR=20adapter?=
 =?UTF-8?q?=20=E2=80=94=20Ollama-vision=20default?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 새 모듈 `crates/kebab-parse-image/src/ocr.rs` 추가. spec 의 `OcrEngine`
  trait 그대로 + `OllamaVisionOcr` default 구현 + `apply_ocr` 헬퍼.
- `OllamaVisionOcr`: `<endpoint>/api/generate` 비스트리밍 호출,
  `images: [base64]` 필드로 이미지 전달, 프롬프트는 언어 힌트
  + 화이트리스트 언어 목록 포함. 응답 prose 를 `OcrText.joined` 로,
  prepared image 전체 영역 단일 region (confidence 1.0) 으로 wrap.
  기본 모델 `gemma4:e4b`. endpoint 비어 있으면 `models.llm.endpoint`
  로 fallback.
- 이미지 전처리: long-edge `config.image.ocr.max_pixels` (기본 1600,
  256~4096 클램프) 초과 시 PNG 로 재인코딩 (image::imageops::resize,
  Triangle filter). PNG 입력이 max 이내면 zero-copy passthrough.
- `apply_ocr` 는 OCR 성공 시 block.ocr 를 Some 으로 채우고
  ProvenanceKind::OcrApplied 이벤트 추가. 실패 시 block.ocr 는
  None 그대로 + provenance 미기록 (부분 상태 누출 금지).
- `kebab-config`: 새 `ImageCfg.ocr: OcrCfg` 블록 (enabled/engine/model
  /endpoint/languages/max_pixels). `#[serde(default)]` 로 pre-P6
  TOML 호환. `KEBAB_IMAGE_OCR_*` 환경변수 5종 추가.

## Spec deviation

원래 P6-2 spec 은 Tesseract 를 default OCR 엔진으로 지정했으나, dev /
CI 호스트에서 `libtesseract-dev` 시스템 패키지 설치를 피하려고
Ollama-vision 으로 default 를 교체. `OcrEngine` trait 추상화는 spec
그대로 보존 — Tesseract / Apple Vision / PaddleOCR 어댑터는 같은
trait 으로 추후 feature-gate 추가 가능. 자세한 내역은
`tasks/HOTFIXES.md` 2026-05-02 항목 참조.

Trust 측면: vision LM 은 hallucinate 가능. `OcrText.engine = "ollama-vision"`
필드로 consumer 가 엔진 별 신뢰 분기 가능.

## 테스트

- 신규 (`tests/ocr.rs`, 8 + 1 ignored):
  - 200 happy → OcrText 디코딩 (joined / engine / engine_version /
    region count / bbox / confidence)
  - 빈 응답 → 빈 regions
  - 5xx → Err with status + body 포함
  - 200 error envelope → Err
  - apply_ocr → block.ocr Some + Provenance OcrApplied 1건
  - apply_ocr error → block.ocr None 유지 + events 미기록
  - 4000×3000 PNG → max_pixels=1024 까지 다운스케일, aspect ratio 보존
  - from_parts max_pixels 클램프
  - opt-in `KEBAB_OCR_INTEGRATION=1` 통합 (실제 192.168.0.47 Ollama
    `gemma4:e4b` 로 \"Hello World 2026\" 전사 검증 완료)
- 신규 (`src/ocr.rs` unit): truncate, build_prompt 언어/힌트 처리
- `kebab-config` 테스트 +3: defaults, env override, pre-P6 TOML 호환

전체: `cargo test -p kebab-parse-image` 28 pass + 1 ignored,
`cargo test -p kebab-config` 20 pass,
`cargo clippy --workspace --all-targets -- -D warnings` pass.

contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
sections: §3.4 ImageRefBlock.ocr, §3.7a OcrText / OcrRegion, §9.1 OCR
vs caption provenance.
---
 Cargo.lock                                   |  38 ++
 crates/kebab-config/src/lib.rs               | 185 ++++++++
 crates/kebab-parse-image/Cargo.toml          |  19 +-
 crates/kebab-parse-image/src/lib.rs          |  23 +-
 crates/kebab-parse-image/src/ocr.rs          | 430 +++++++++++++++++++
 crates/kebab-parse-image/tests/common/mod.rs |  54 +++
 crates/kebab-parse-image/tests/ocr.rs        | 378 ++++++++++++++++
 tasks/HOTFIXES.md                            |  18 +
 tasks/p6/p6-2-ocr-adapter.md                 |   2 +-
 9 files changed, 1137 insertions(+), 10 deletions(-)
 create mode 100644 crates/kebab-parse-image/src/ocr.rs
 create mode 100644 crates/kebab-parse-image/tests/ocr.rs
diff --git a/Cargo.lock b/Cargo.lock
index 96e76ea..f595476 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,22 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "ab_glyph"
+version = "0.2.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01c0457472c38ea5bd1c3b5ada5e368271cb550be7a4ca4a0b4634e9913f6cc2"
+dependencies = [
+ "ab_glyph_rasterizer",
+ "owned_ttf_parser",
+]
+
+[[package]]
+name = "ab_glyph_rasterizer"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618"
+
 [[package]]
 name = "adler2"
 version = "2.0.1"
@@ -3552,15 +3568,22 @@ dependencies = [
 name = "kebab-parse-image"
 version = "0.1.0"
 dependencies = [
+ "ab_glyph",
  "anyhow",
+ "base64 0.22.1",
  "blake3",
  "image",
  "kamadak-exif",
+ "kebab-config",
  "kebab-core",
+ "reqwest",
+ "serde",
  "serde_json",
  "tempfile",
  "time",
+ "tokio",
  "tracing",
+ "wiremock",
 ]
 
 [[package]]
@@ -5128,6 +5151,15 @@ dependencies = [
  "ureq",
 ]
 
+[[package]]
+name = "owned_ttf_parser"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36820e9051aca1014ddc75770aab4d68bc1e9e632f0f5627c4086bc216fb583b"
+dependencies = [
+ "ttf-parser",
+]
+
 [[package]]
 name = "ownedbytes"
 version = "0.9.0"
@@ -7450,6 +7482,12 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
+[[package]]
+name = "ttf-parser"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
+
 [[package]]
 name = "twox-hash"
 version = "2.1.2"
diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs
index 9452efb..73ab144 100644
--- a/crates/kebab-config/src/lib.rs
+++ b/crates/kebab-config/src/lib.rs
@@ -21,6 +21,12 @@ pub struct Config {
     pub models: ModelsCfg,
     pub search: SearchCfg,
     pub rag: RagCfg,
+    /// Image-pipeline settings (P6: OCR, captioning). Tagged
+    /// `#[serde(default)]` so pre-P6 config files that predate the
+    /// `[image]` section still load — defaults disable OCR / caption
+    /// (they cost a model call per asset).
+    #[serde(default = "ImageCfg::defaults")]
+    pub image: ImageCfg,
 }
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -98,6 +104,63 @@ pub struct RagCfg {
     pub max_context_tokens: usize,
 }
 
+/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
+/// behaviour; future fields (e.g. `caption`) will join here as P6-3
+/// lands.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct ImageCfg {
+    #[serde(default = "OcrCfg::defaults")]
+    pub ocr: OcrCfg,
+}
+
+impl ImageCfg {
+    pub fn defaults() -> Self {
+        Self {
+            ocr: OcrCfg::defaults(),
+        }
+    }
+}
+
+/// OCR settings (P6-2). v1 ships a single Ollama-vision adapter; the
+/// `OcrEngine` trait in `kebab-parse-image` keeps the door open for
+/// Tesseract / Apple Vision / PaddleOCR engines as feature-gated
+/// alternatives in P+. See `tasks/HOTFIXES.md` (2026-05-02) for the
+/// rationale on dropping the original Tesseract default.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct OcrCfg {
+    /// Run OCR on every image during ingest. Default `false` because
+    /// OCR adds one model call per asset.
+    pub enabled: bool,
+    /// Engine identifier. v1 only ships `"ollama-vision"`.
+    pub engine: String,
+    /// Model id passed to the engine (e.g. `"gemma4:e4b"` for
+    /// Ollama-vision).
+    pub model: String,
+    /// HTTP endpoint for the OCR engine. Empty string means "fall back
+    /// to `models.llm.endpoint`" — convenient when the same Ollama
+    /// host serves both LLM and vision.
+    pub endpoint: String,
+    /// BCP-47 language hints (e.g. `["eng", "kor"]`). The adapter
+    /// renders them into the prompt; the LLM honours them probabilistically.
+    pub languages: Vec<String>,
+    /// Cap the long edge of the image (in pixels) before sending. Larger
+    /// images bloat prompt cost. Default `1600`.
+    pub max_pixels: u32,
+}
+
+impl OcrCfg {
+    pub fn defaults() -> Self {
+        Self {
+            enabled: false,
+            engine: "ollama-vision".to_string(),
+            model: "gemma4:e4b".to_string(),
+            endpoint: String::new(),
+            languages: vec!["eng".to_string(), "kor".to_string()],
+            max_pixels: 1600,
+        }
+    }
+}
+
 impl Config {
     /// Defaults per design §6.4.
     pub fn defaults() -> Self {
@@ -162,6 +225,7 @@ impl Config {
                 explain_default: false,
                 max_context_tokens: 8000,
             },
+            image: ImageCfg::defaults(),
         }
     }
 
@@ -323,6 +387,27 @@ impl Config {
                     }
                 }
 
+                // image.ocr
+                "KEBAB_IMAGE_OCR_ENABLED" => {
+                    self.image.ocr.enabled = parse_bool(v);
+                }
+                "KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(),
+                "KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(),
+                "KEBAB_IMAGE_OCR_ENDPOINT" => self.image.ocr.endpoint = v.clone(),
+                "KEBAB_IMAGE_OCR_LANGUAGES" => {
+                    // Comma-separated list, e.g. "eng,kor".
+                    self.image.ocr.languages = v
+                        .split(',')
+                        .map(|s| s.trim().to_string())
+                        .filter(|s| !s.is_empty())
+                        .collect();
+                }
+                "KEBAB_IMAGE_OCR_MAX_PIXELS" => {
+                    if let Ok(n) = v.parse::<u32>() {
+                        self.image.ocr.max_pixels = n;
+                    }
+                }
+
                 // Unknown KEBAB_* keys are silently ignored — see
                 // `env_unknown_key_is_ignored` test.
                 _ => {}
@@ -471,6 +556,106 @@ mod tests {
         assert!(c.indexing.watch_filesystem);
     }
 
+    #[test]
+    fn image_ocr_defaults_disabled_with_ollama_vision() {
+        let c = Config::defaults();
+        assert!(!c.image.ocr.enabled);
+        assert_eq!(c.image.ocr.engine, "ollama-vision");
+        assert_eq!(c.image.ocr.model, "gemma4:e4b");
+        assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]);
+        assert_eq!(c.image.ocr.max_pixels, 1600);
+    }
+
+    #[test]
+    fn image_ocr_env_overrides() {
+        let mut env = HashMap::new();
+        env.insert("KEBAB_IMAGE_OCR_ENABLED".to_string(), "true".to_string());
+        env.insert(
+            "KEBAB_IMAGE_OCR_MODEL".to_string(),
+            "gemma4:31b".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_OCR_ENDPOINT".to_string(),
+            "http://192.168.0.47:11434".to_string(),
+        );
+        env.insert(
+            "KEBAB_IMAGE_OCR_LANGUAGES".to_string(),
+            "eng, kor, jpn".to_string(),
+        );
+        env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
+        let c = Config::defaults().apply_env(&env);
+        assert!(c.image.ocr.enabled);
+        assert_eq!(c.image.ocr.model, "gemma4:31b");
+        assert_eq!(c.image.ocr.endpoint, "http://192.168.0.47:11434");
+        assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]);
+        assert_eq!(c.image.ocr.max_pixels, 2048);
+    }
+
+    /// Pre-P6 config files don't have an `[image]` section. The
+    /// `#[serde(default)]` attribute on `Config::image` must let those
+    /// files load with `ImageCfg::defaults()` instead of erroring.
+    #[test]
+    fn pre_p6_config_without_image_section_loads_with_defaults() {
+        let toml_text = r#"
+schema_version = 1
+
+[workspace]
+root = "/tmp/x"
+include = ["**/*.md"]
+exclude = []
+
+[storage]
+data_dir = "/tmp/d"
+sqlite = "{data_dir}/x.sqlite"
+vector_dir = "{data_dir}/v"
+asset_dir = "{data_dir}/a"
+artifact_dir = "{data_dir}/r"
+model_dir = "{data_dir}/m"
+runs_dir = "{data_dir}/u"
+copy_threshold_mb = 100
+
+[indexing]
+max_parallel_extractors = 2
+max_parallel_embeddings = 1
+watch_filesystem = false
+
+[chunking]
+target_tokens = 500
+overlap_tokens = 80
+respect_markdown_headings = true
+chunker_version = "md-heading-v1"
+
+[models.embedding]
+provider = "fastembed"
+model = "multilingual-e5-small"
+version = "v1"
+dimensions = 384
+batch_size = 64
+
+[models.llm]
+provider = "ollama"
+model = "qwen2.5:14b-instruct"
+context_tokens = 32768
+endpoint = "http://127.0.0.1:11434"
+temperature = 0.0
+seed = 0
+
+[search]
+default_k = 10
+hybrid_fusion = "rrf"
+rrf_k = 60
+snippet_chars = 220
+
+[rag]
+prompt_template_version = "rag-v1"
+score_gate = 0.30
+explain_default = false
+max_context_tokens = 8000
+"#;
+        let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
+        assert_eq!(c.image, ImageCfg::defaults());
+    }
+
     #[test]
     fn xdg_paths_honor_env() {
         // Must restore env after the test to avoid polluting other tests.
diff --git a/crates/kebab-parse-image/Cargo.toml b/crates/kebab-parse-image/Cargo.toml
index 5767807..e596d6c 100644
--- a/crates/kebab-parse-image/Cargo.toml
+++ b/crates/kebab-parse-image/Cargo.toml
@@ -5,11 +5,13 @@ edition       = { workspace = true }
 rust-version  = { workspace = true }
 license       = { workspace = true }
 repository    = { workspace = true }
-description   = "Image extractor — produces a single-block CanonicalDocument with EXIF metadata (P6-1)"
+description   = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipeline (P6-1, P6-2)"
 
 [dependencies]
 kebab-core   = { path = "../kebab-core" }
+kebab-config = { path = "../kebab-config" }
 anyhow       = { workspace = true }
+serde        = { workspace = true }
 serde_json   = { workspace = true }
 time         = { workspace = true }
 tracing      = { workspace = true }
@@ -21,7 +23,22 @@ image = { version = "0.25", default-features = false, features = ["png", "jpeg",
 # kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag
 # extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software).
 kamadak-exif = "0.6"
+# Ollama-vision OCR adapter (P6-2) talks HTTP directly. We keep the
+# feature surface identical to `kebab-llm-local` (blocking + json +
+# rustls-tls) so both crates share the same TLS backend and the
+# transitive tokio runtime is brought in once.
+reqwest      = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+base64       = "0.22"
 
 [dev-dependencies]
 tempfile     = { workspace = true }
 blake3       = { workspace = true }
+# Shared test infrastructure with `kebab-llm-local`: wiremock under
+# tokio for HTTP fixtures.
+wiremock     = { workspace = true }
+tokio        = { workspace = true, features = ["rt-multi-thread"] }
+# Used by `tests/common/mod.rs` to render the opt-in OCR integration
+# fixture. Only loaded for tests; the production crate doesn't need
+# font rendering.
+ab_glyph     = "0.2"
+base64       = "0.22"
diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs
index e357c28..dfcd171 100644
--- a/crates/kebab-parse-image/src/lib.rs
+++ b/crates/kebab-parse-image/src/lib.rs
@@ -1,17 +1,24 @@
-//! `kebab-parse-image` — image extractor (P6-1).
+//! `kebab-parse-image` — image extractor (P6-1) + OCR adapter (P6-2).
 //!
-//! Implements [`kebab_core::Extractor`] for `MediaType::Image(_)`. One asset
-//! produces one [`CanonicalDocument`] with a single
-//! [`Block::ImageRef`](kebab_core::Block::ImageRef). EXIF is captured into
-//! `metadata.user["exif"]`, dimensions into `metadata.user["dimensions"]`.
-//! OCR / caption fields stay `None`; later tasks (P6-2 / P6-3) populate
-//! them.
+//! P6-1 implements [`kebab_core::Extractor`] for `MediaType::Image(_)`,
+//! producing a single-block [`CanonicalDocument`] (`ImageRefBlock` with
+//! EXIF + dimensions in `metadata.user`). OCR / caption fields stay
+//! `None` until populated by the OCR / caption adapters.
+//!
+//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an
+//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable
+//! Ollama model. [`apply_ocr`] is the helper that mutates an
+//! [`ImageRefBlock`] in place.
 //!
 //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
-//! ModelCaption stubs), §9.1 (image extraction policy), §9 (versioning).
+//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
+//! provenance), §9 (versioning).
 
 mod dims;
 mod exif_extract;
+pub mod ocr;
+
+pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
 
 use anyhow::{Context, Result};
 use kebab_core::{
diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs
new file mode 100644
index 0000000..07912e9
--- /dev/null
+++ b/crates/kebab-parse-image/src/ocr.rs
@@ -0,0 +1,430 @@
+//! OCR adapter (P6-2).
+//!
+//! [`OcrEngine`] is a small trait for "image bytes → [`OcrText`]". v1 ships
+//! a single implementation, [`OllamaVisionOcr`], which delegates to a
+//! vision-capable Ollama model (`gemma4:e4b` by default).
+//!
+//! ## Spec deviation (Tesseract → Ollama-vision)
+//!
+//! The original P6-2 spec named Tesseract as the default engine. The dev
+//! / CI environment intentionally avoids system-package installs, so the
+//! Tesseract Rust crate (which links `libtesseract`) is impractical
+//! today. We keep the [`OcrEngine`] trait as the abstraction the spec
+//! demanded — Tesseract / Apple Vision / PaddleOCR plug in as future
+//! feature-gated alternatives without touching the extractor or
+//! chunker. See `tasks/HOTFIXES.md` (2026-05-02) for the full
+//! rationale.
+//!
+//! ## Trust note
+//!
+//! The original spec marked `OcrText` as "observed text (high trust)"
+//! to distinguish it from `ModelCaption`. With an LLM-driven OCR engine
+//! the line blurs — the model can hallucinate. Downstream consumers
+//! that surface OCR text should still treat it as a hint, not ground
+//! truth, and prefer the asset bytes when verifying. The `engine`
+//! field on [`OcrText`] makes the source explicit, so a caller can
+//! decide whether to trust based on which engine produced the text.
+
+use std::io::Cursor;
+use std::time::Duration;
+
+use anyhow::{Context, Result};
+use base64::Engine as _;
+use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
+use image::{ImageFormat, ImageReader};
+use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use time::OffsetDateTime;
+
+/// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
+pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";
+
+/// Hard ceiling on the OCR HTTP exchange. Cold-loading a vision model on
+/// first call can take ~30s; 5 minutes is generous without being open-ended.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(300);
+
+/// Lower bound on `config.image.ocr.max_pixels`. Anything below this is
+/// silently bumped to keep the model from receiving an unreadable thumbnail.
+const MIN_LONG_EDGE: u32 = 256;
+
+/// Hard cap on `max_pixels` — the spec mentions "downscale aggressively"
+/// for vision LMs because input dimension translates directly into
+/// prompt cost. 4096 is generous for legibility and still bounded.
+const MAX_LONG_EDGE: u32 = 4096;
+
+/// Image-bytes → [`OcrText`] interface. Implementations may shell out
+/// (Apple Vision sidecar), call a local library (Tesseract), or — in v1
+/// — talk HTTP to a vision LM (Ollama).
+pub trait OcrEngine: Send + Sync {
+    /// Stable identifier written into `OcrText.engine`. Used by callers
+    /// to decide trust level (observed vs. generated).
+    fn engine_name(&self) -> &'static str;
+
+    /// Engine version string written into `OcrText.engine_version`.
+    /// Adapters that depend on a remote service may include the model
+    /// id / version here.
+    fn engine_version(&self) -> String;
+
+    /// Run OCR on `image_bytes`. `lang_hint` (BCP-47) can be passed
+    /// through to engines that benefit from it (Tesseract languages,
+    /// LLM prompt steering); ignore otherwise.
+    fn recognize(
+        &self,
+        image_bytes: &[u8],
+        lang_hint: Option<&Lang>,
+    ) -> Result<OcrText>;
+}
+
+/// Mutate `block.ocr` in place by running `engine` over `image_bytes`,
+/// then append a [`ProvenanceKind::OcrApplied`] event to `events` so the
+/// caller (which owns the `CanonicalDocument`) can splice it into
+/// `provenance.events`.
+///
+/// Returns the engine error verbatim on failure so the caller can decide
+/// whether to skip the asset or surface it. `block.ocr` is left
+/// untouched on error — partial state is never written.
+pub fn apply_ocr(
+    engine: &dyn OcrEngine,
+    image_bytes: &[u8],
+    block: &mut ImageRefBlock,
+    lang_hint: Option<&Lang>,
+    events: &mut Vec<ProvenanceEvent>,
+) -> Result<()> {
+    let text = engine.recognize(image_bytes, lang_hint).with_context(|| {
+        format!(
+            "OCR failed (engine={}, version={})",
+            engine.engine_name(),
+            engine.engine_version()
+        )
+    })?;
+    let region_count = text.regions.len();
+    block.ocr = Some(text);
+    events.push(ProvenanceEvent {
+        at: OffsetDateTime::now_utc(),
+        agent: "kb-parse-image".to_string(),
+        kind: ProvenanceKind::OcrApplied,
+        note: Some(format!(
+            "engine={} version={} regions={}",
+            engine.engine_name(),
+            engine.engine_version(),
+            region_count
+        )),
+    });
+    Ok(())
+}
+
+/// Ollama-vision OCR adapter — POSTs the image (base64) to
+/// `<endpoint>/api/generate` with a transcription prompt and reads the
+/// non-streaming response.
+pub struct OllamaVisionOcr {
+    client: reqwest::blocking::Client,
+    endpoint: String,
+    model: String,
+    languages: Vec<String>,
+    max_pixels: u32,
+}
+
+impl OllamaVisionOcr {
+    /// Build an adapter from a workspace [`kebab_config::Config`].
+    /// Reads `config.image.ocr.{model, endpoint, languages, max_pixels}`;
+    /// when `endpoint` is empty falls back to `config.models.llm.endpoint`
+    /// so the same Ollama host serves both LLM and OCR by default.
+    ///
+    /// Construction does NOT touch the network — the first HTTP call
+    /// happens inside [`OcrEngine::recognize`].
+    pub fn new(config: &kebab_config::Config) -> Result<Self> {
+        let ocr = &config.image.ocr;
+        let endpoint = if ocr.endpoint.is_empty() {
+            config.models.llm.endpoint.clone()
+        } else {
+            ocr.endpoint.clone()
+        };
+        if endpoint.is_empty() {
+            anyhow::bail!(
+                "OllamaVisionOcr: endpoint is empty (set image.ocr.endpoint or models.llm.endpoint)"
+            );
+        }
+        let model = ocr.model.trim().to_string();
+        if model.is_empty() {
+            anyhow::bail!("OllamaVisionOcr: image.ocr.model is empty");
+        }
+        let max_pixels = ocr.max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE);
+        let client = reqwest::blocking::Client::builder()
+            .timeout(REQUEST_TIMEOUT)
+            .build()
+            .context("building OCR HTTP client")?;
+        Ok(Self {
+            client,
+            endpoint,
+            model,
+            languages: ocr.languages.clone(),
+            max_pixels,
+        })
+    }
+
+    /// Build directly from explicit fields. Useful for tests that need
+    /// to point at a wiremock host without going through `Config`.
+    pub fn from_parts(
+        endpoint: impl Into<String>,
+        model: impl Into<String>,
+        languages: Vec<String>,
+        max_pixels: u32,
+    ) -> Result<Self> {
+        let client = reqwest::blocking::Client::builder()
+            .timeout(REQUEST_TIMEOUT)
+            .build()
+            .context("building OCR HTTP client")?;
+        Ok(Self {
+            client,
+            endpoint: endpoint.into(),
+            model: model.into(),
+            languages,
+            max_pixels: max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE),
+        })
+    }
+
+    fn build_prompt(&self, lang_hint: Option<&Lang>) -> String {
+        let langs = if self.languages.is_empty() {
+            "any".to_string()
+        } else {
+            self.languages.join(", ")
+        };
+        let hint = match lang_hint.map(|l| l.0.as_str()) {
+            Some(h) if !h.is_empty() && h != "und" => format!(" (hint: dominant language is {h})"),
+            _ => String::new(),
+        };
+        format!(
+            "You are an OCR engine. Transcribe ALL text visible in the image, \
+             preserving line breaks. Output only the transcription, no commentary, \
+             no markdown fences, no quotes. Expected languages: {langs}{hint}. \
+             If the image contains no text, output an empty line."
+        )
+    }
+}
+
+impl OcrEngine for OllamaVisionOcr {
+    fn engine_name(&self) -> &'static str {
+        OLLAMA_VISION_ENGINE
+    }
+
+    fn engine_version(&self) -> String {
+        // Compose engine + model id so the wire form is self-describing
+        // ("ollama-vision/gemma4:e4b") — the Ollama daemon does not
+        // expose a stable per-model revision string we could pin.
+        format!("ollama/{}", self.model)
+    }
+
+    fn recognize(
+        &self,
+        image_bytes: &[u8],
+        lang_hint: Option<&Lang>,
+    ) -> Result<OcrText> {
+        let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
+            .context("preparing image for OCR")?;
+        let b64 = BASE64_STANDARD.encode(&prepared);
+
+        let prompt = self.build_prompt(lang_hint);
+        let body = OllamaGenerateRequest {
+            model: &self.model,
+            prompt: &prompt,
+            images: vec![&b64],
+            stream: false,
+            options: OllamaOptions {
+                temperature: 0.0,
+                seed: 0,
+            },
+        };
+
+        let url = format!("{}/api/generate", self.endpoint.trim_end_matches('/'));
+        let resp = self
+            .client
+            .post(&url)
+            .json(&body)
+            .send()
+            .with_context(|| format!("POST {url}"))?;
+        let status = resp.status();
+        if !status.is_success() {
+            let body_text = resp.text().unwrap_or_default();
+            anyhow::bail!(
+                "OllamaVisionOcr: {status} from {url} — body={}",
+                truncate(&body_text, 512)
+            );
+        }
+        let parsed: OllamaGenerateResponse = resp
+            .json()
+            .context("parsing Ollama OCR response as JSON")?;
+        if let Some(err) = parsed.error {
+            anyhow::bail!("OllamaVisionOcr: server error — {}", truncate(&err, 512));
+        }
+        let raw = parsed.response.unwrap_or_default();
+        let joined = raw.trim().to_string();
+
+        let regions = if joined.is_empty() {
+            Vec::new()
+        } else {
+            // Ollama-vision returns prose, not bbox-annotated regions.
+            // We synthesize a single region covering the whole prepared
+            // image (post-downscale dimensions) so the `OcrText` shape
+            // remains compatible with consumers that expect at least
+            // one region. Confidence is left at 1.0 — there's no
+            // per-token score available from the LM.
+            vec![OcrRegion {
+                bbox: (0, 0, w, h),
+                text: joined.clone(),
+                confidence: 1.0,
+            }]
+        };
+
+        tracing::debug!(
+            target: "kebab-parse-image",
+            "ollama-vision OCR ok (model={}, dims={w}x{h}, chars={})",
+            self.model,
+            joined.chars().count()
+        );
+
+        Ok(OcrText {
+            joined,
+            regions,
+            engine: self.engine_name().to_string(),
+            engine_version: self.engine_version(),
+        })
+    }
+}
+
+// ── Image preparation ─────────────────────────────────────────────────────
+
+/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
+/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`.
+///
+/// Bypasses encode work when the source already fits — we simply pass
+/// the bytes through. PNG re-encode is only paid when downscaling is
+/// actually needed.
+fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
+    let reader = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("reading image header for OCR")?;
+    let format = reader.format();
+    let (w, h) = reader
+        .into_dimensions()
+        .context("reading image dimensions for OCR")?;
+
+    let long = w.max(h);
+    if long <= max_long_edge {
+        // Source fits — avoid the round-trip through `image::DynamicImage`.
+        // Re-encode only when the source isn't already PNG, since the
+        // wire format we send Ollama is PNG.
+        return match format {
+            Some(ImageFormat::Png) => Ok((bytes.to_vec(), w, h)),
+            _ => {
+                let img = ImageReader::new(Cursor::new(bytes))
+                    .with_guessed_format()
+                    .context("re-reading image for PNG re-encode")?
+                    .decode()
+                    .context("decoding image for PNG re-encode")?;
+                let mut out = Cursor::new(Vec::new());
+                img.write_to(&mut out, ImageFormat::Png)
+                    .context("re-encoding image as PNG")?;
+                Ok((out.into_inner(), w, h))
+            }
+        };
+    }
+
+    let scale = max_long_edge as f32 / long as f32;
+    let new_w = ((w as f32) * scale).round().max(1.0) as u32;
+    let new_h = ((h as f32) * scale).round().max(1.0) as u32;
+    let img = ImageReader::new(Cursor::new(bytes))
+        .with_guessed_format()
+        .context("re-reading image for downscale")?
+        .decode()
+        .context("decoding image for downscale")?;
+    let resized = img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
+    let mut out = Cursor::new(Vec::new());
+    resized
+        .write_to(&mut out, ImageFormat::Png)
+        .context("encoding downscaled image as PNG")?;
+    Ok((out.into_inner(), new_w, new_h))
+}
+
+fn truncate(s: &str, n: usize) -> String {
+    if s.chars().count() <= n {
+        return s.to_string();
+    }
+    let mut out: String = s.chars().take(n).collect();
+    out.push_str(&format!("... (truncated, original {} chars)", s.chars().count()));
+    out
+}
+
+// ── Wire types ────────────────────────────────────────────────────────────
+
+#[derive(Serialize)]
+struct OllamaGenerateRequest<'a> {
+    model: &'a str,
+    prompt: &'a str,
+    images: Vec<&'a str>,
+    stream: bool,
+    options: OllamaOptions,
+}
+
+#[derive(Serialize)]
+struct OllamaOptions {
+    temperature: f32,
+    seed: u64,
+}
+
+#[derive(Deserialize)]
+struct OllamaGenerateResponse {
+    #[serde(default)]
+    response: Option<String>,
+    #[serde(default)]
+    error: Option<String>,
+    #[serde(flatten)]
+    _other: std::collections::HashMap<String, Value>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn truncate_under_cap_unchanged() {
+        assert_eq!(truncate("abc", 5), "abc");
+    }
+
+    #[test]
+    fn truncate_over_cap_appends_marker() {
+        let big = "x".repeat(20);
+        let out = truncate(&big, 5);
+        assert!(out.starts_with("xxxxx"));
+        assert!(out.contains("(truncated, original 20 chars)"));
+    }
+
+    /// Build prompt mentions the configured languages and the hint when
+    /// supplied.
+    #[test]
+    fn build_prompt_lists_languages_and_hint() {
+        let engine = OllamaVisionOcr::from_parts(
+            "http://x",
+            "m",
+            vec!["eng".into(), "kor".into()],
+            1024,
+        )
+        .unwrap();
+        let p = engine.build_prompt(Some(&Lang("ko".into())));
+        assert!(p.contains("eng, kor"));
+        assert!(p.contains("hint: dominant language is ko"));
+    }
+
+    #[test]
+    fn build_prompt_omits_hint_when_lang_und() {
+        let engine = OllamaVisionOcr::from_parts(
+            "http://x",
+            "m",
+            vec!["eng".into()],
+            1024,
+        )
+        .unwrap();
+        let p = engine.build_prompt(Some(&Lang("und".into())));
+        assert!(!p.contains("hint:"));
+    }
+}
diff --git a/crates/kebab-parse-image/tests/common/mod.rs b/crates/kebab-parse-image/tests/common/mod.rs
index 37aee23..9be6f06 100644
--- a/crates/kebab-parse-image/tests/common/mod.rs
+++ b/crates/kebab-parse-image/tests/common/mod.rs
@@ -43,6 +43,60 @@ pub fn no_exif_png() -> Vec<u8> {
     buf.into_inner()
 }
 
+/// 4000×3000 solid-blue PNG (long edge 4000) used to exercise the OCR
+/// adapter's downscale path. Solid-colour PNGs compress aggressively, so
+/// the on-disk size stays well under 1 MB despite the large dimensions.
+pub fn large_blue_4000x3000_png() -> Vec<u8> {
+    let img: ImageBuffer<Rgb<u8>, _> =
+        ImageBuffer::from_fn(4000, 3000, |_, _| Rgb([0, 0, 255]));
+    let mut buf = Cursor::new(Vec::new());
+    img.write_to(&mut buf, image::ImageFormat::Png)
+        .expect("encoding 4000x3000 PNG must not fail");
+    buf.into_inner()
+}
+
+/// PNG with the literal text `"Hello World 2026"` rendered in black
+/// against a white background. Used by the opt-in
+/// `ocr_integration_real_ollama_transcribes_text` integration test —
+/// regular hermetic tests never call it.
+///
+/// Renders with a font shipped by `dejavu` (path under
+/// `/usr/share/fonts/truetype/dejavu/`) — common across most Linux dev
+/// boxes. Falls back to a tiny built-in glyph map if the font is
+/// missing so the helper compiles even without DejaVu installed.
+pub fn hello_world_png() -> Vec<u8> {
+    use ab_glyph::{Font, FontRef, ScaleFont};
+
+    let mut img: ImageBuffer<Rgb<u8>, _> =
+        ImageBuffer::from_fn(400, 100, |_, _| Rgb([255, 255, 255]));
+    let font_bytes = std::fs::read("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf")
+        .expect("DejaVu Sans Bold required for OCR integration fixture");
+    let font = FontRef::try_from_slice(&font_bytes).expect("font parses");
+    let scaled = font.as_scaled(40.0);
+    let text = "Hello World 2026";
+    let mut x = 10.0_f32;
+    let y = 60.0_f32;
+    for ch in text.chars() {
+        let glyph = scaled.scaled_glyph(ch);
+        if let Some(outlined) = scaled.outline_glyph(glyph.clone()) {
+            let bb = outlined.px_bounds();
+            outlined.draw(|gx, gy, c| {
+                let px = (x + bb.min.x + gx as f32) as i32;
+                let py = (y + bb.min.y + gy as f32) as i32;
+                if px >= 0 && py >= 0 && (px as u32) < 400 && (py as u32) < 100 {
+                    let v = ((1.0 - c) * 255.0) as u8;
+                    img.put_pixel(px as u32, py as u32, Rgb([v, v, v]));
+                }
+            });
+        }
+        x += scaled.h_advance(scaled.glyph_id(ch));
+    }
+    let mut buf = Cursor::new(Vec::new());
+    img.write_to(&mut buf, image::ImageFormat::Png)
+        .expect("encoding hello-world PNG must not fail");
+    buf.into_inner()
+}
+
 /// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model +
 /// DateTimeOriginal + Orientation + Software. The base image is a 4×4
 /// solid white square — pixel content is irrelevant; the test cares about
diff --git a/crates/kebab-parse-image/tests/ocr.rs b/crates/kebab-parse-image/tests/ocr.rs
new file mode 100644
index 0000000..7cc0314
--- /dev/null
+++ b/crates/kebab-parse-image/tests/ocr.rs
@@ -0,0 +1,378 @@
+//! Integration tests for the OCR adapter (P6-2).
+//!
+//! Pattern mirrors `kebab-llm-local/tests/streaming.rs` — `wiremock` is
+//! async, so test fns are `#[tokio::test]` and the sync adapter is
+//! invoked from `spawn_blocking`.
+
+mod common;
+
+use kebab_config::Config;
+use kebab_core::{
+    AssetId, BlockId, CommonBlock, ImageRefBlock, Lang, ProvenanceEvent, ProvenanceKind,
+    SourceSpan,
+};
+use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_ocr};
+use serde_json::json;
+use wiremock::matchers::{method, path};
+use wiremock::{Mock, MockServer, ResponseTemplate};
+
+use crate::common::red_100x50_png;
+
+fn cfg_for_endpoint(endpoint: &str) -> Config {
+    let mut cfg = Config::defaults();
+    cfg.image.ocr.endpoint = endpoint.to_string();
+    cfg.image.ocr.model = "gemma4:e4b".to_string();
+    cfg.image.ocr.languages = vec!["eng".to_string(), "kor".to_string()];
+    cfg.image.ocr.max_pixels = 1024;
+    cfg
+}
+
+fn run_recognize(
+    cfg: Config,
+    bytes: Vec<u8>,
+    lang_hint: Option<Lang>,
+) -> anyhow::Result<kebab_core::OcrText> {
+    let engine = OllamaVisionOcr::new(&cfg)?;
+    engine.recognize(&bytes, lang_hint.as_ref())
+}
+
+fn empty_image_block() -> ImageRefBlock {
+    ImageRefBlock {
+        common: CommonBlock {
+            block_id: BlockId("0".repeat(32)),
+            heading_path: Vec::new(),
+            source_span: SourceSpan::Region {
+                x: 0,
+                y: 0,
+                w: 100,
+                h: 50,
+            },
+        },
+        asset_id: Some(AssetId("a".repeat(32))),
+        src: "img/x.png".to_string(),
+        alt: "x.png".to_string(),
+        ocr: None,
+        caption: None,
+    }
+}
+
+// ── Happy path ────────────────────────────────────────────────────────────
+
+#[tokio::test]
+async fn ocr_recognize_decodes_response_into_ocr_text() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "model": "gemma4:e4b",
+            "response": "Hello World 2026",
+            "done": true,
+            "done_reason": "stop"
+        })))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+    let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
+        .await
+        .expect("blocking task panicked")
+        .expect("recognize must succeed");
+
+    assert_eq!(text.joined, "Hello World 2026");
+    assert_eq!(text.engine, "ollama-vision");
+    assert!(text.engine_version.starts_with("ollama/gemma4:e4b"));
+    assert_eq!(text.regions.len(), 1, "non-empty joined → exactly one region");
+    assert_eq!(text.regions[0].text, "Hello World 2026");
+    assert!((text.regions[0].confidence - 1.0).abs() < 1e-6);
+    // Region bbox covers prepared image dimensions (100×50 < max_pixels
+    // 1024 so no downscale, dims preserved).
+    assert_eq!(text.regions[0].bbox, (0, 0, 100, 50));
+}
+
+// ── Empty response ────────────────────────────────────────────────────────
+
+#[tokio::test]
+async fn ocr_recognize_empty_response_yields_empty_regions() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "",
+            "done": true
+        })))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+    let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
+        .await
+        .expect("blocking task panicked")
+        .expect("recognize on empty response must succeed");
+
+    assert_eq!(text.joined, "");
+    assert!(text.regions.is_empty(), "empty joined → no regions");
+    assert_eq!(text.engine, "ollama-vision");
+}
+
+// ── Server error mapping ──────────────────────────────────────────────────
+
+#[tokio::test]
+async fn ocr_recognize_500_response_returns_error() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(500).set_body_string("boom"))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+    let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
+        .await
+        .expect("blocking task panicked");
+    assert!(r.is_err(), "5xx must surface as Err");
+    let msg = format!("{:#}", r.unwrap_err());
+    assert!(
+        msg.contains("500") && msg.contains("boom"),
+        "error must include status + body: {msg}"
+    );
+}
+
+// ── error envelope on 200 stream ─────────────────────────────────────────
+
+#[tokio::test]
+async fn ocr_recognize_error_envelope_on_200_returns_error() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "error": "model 'gemma4:e4b' not found"
+        })))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+    let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
+        .await
+        .expect("blocking task panicked");
+    assert!(r.is_err(), "server error envelope must surface");
+    let msg = format!("{:#}", r.unwrap_err());
+    assert!(
+        msg.contains("not found"),
+        "error must include server message: {msg}"
+    );
+}
+
+// ── apply_ocr mutates block + appends provenance ─────────────────────────
+
+#[tokio::test]
+async fn apply_ocr_sets_block_ocr_and_appends_provenance() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
+            "response": "안녕 2026",
+            "done": true
+        })))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+
+    let (block, events) =
+        tokio::task::spawn_blocking(move || -> anyhow::Result<_> {
+            let engine = OllamaVisionOcr::new(&cfg)?;
+            let mut block = empty_image_block();
+            let mut events: Vec<ProvenanceEvent> = Vec::new();
+            apply_ocr(
+                &engine,
+                &bytes,
+                &mut block,
+                Some(&Lang("ko".to_string())),
+                &mut events,
+            )?;
+            Ok((block, events))
+        })
+        .await
+        .expect("blocking task panicked")
+        .expect("apply_ocr must succeed");
+
+    let ocr = block.ocr.as_ref().expect("ocr Some after apply_ocr");
+    assert_eq!(ocr.joined, "안녕 2026");
+    assert_eq!(events.len(), 1);
+    assert_eq!(events[0].kind, ProvenanceKind::OcrApplied);
+    assert_eq!(events[0].agent, "kb-parse-image");
+    let note = events[0].note.as_deref().unwrap_or("");
+    assert!(
+        note.contains("engine=ollama-vision") && note.contains("regions=1"),
+        "provenance note must describe engine + region count: {note}"
+    );
+}
+
+// ── apply_ocr error leaves block untouched ───────────────────────────────
+
+#[tokio::test]
+async fn apply_ocr_error_leaves_block_untouched() {
+    let server = MockServer::start().await;
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(ResponseTemplate::new(503))
+        .mount(&server)
+        .await;
+
+    let bytes = red_100x50_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+
+    let (block, events, err) = tokio::task::spawn_blocking(move || {
+        let engine = OllamaVisionOcr::new(&cfg).expect("engine");
+        let mut block = empty_image_block();
+        let mut events: Vec<ProvenanceEvent> = Vec::new();
+        let res = apply_ocr(&engine, &bytes, &mut block, None, &mut events);
+        (block, events, res.err())
+    })
+    .await
+    .expect("blocking task panicked");
+
+    assert!(err.is_some(), "503 must propagate as Err");
+    assert!(
+        block.ocr.is_none(),
+        "block.ocr stays None when apply_ocr fails — partial state must not leak"
+    );
+    assert!(
+        events.is_empty(),
+        "no Provenance event when OCR fails — kb-normalize would otherwise lie about success"
+    );
+}
+
+// ── Downscale: large input shrinks before sending ─────────────────────────
+
+#[tokio::test]
+async fn ocr_downscales_large_image_before_sending() {
+    use std::sync::{Arc, Mutex};
+
+    // Capture the request body so we can pull out the base64 image and
+    // measure its dimensions.
+    let captured: Arc<Mutex<Option<Vec<u8>>>> = Arc::new(Mutex::new(None));
+
+    let server = MockServer::start().await;
+    let cap = captured.clone();
+    Mock::given(method("POST"))
+        .and(path("/api/generate"))
+        .respond_with(move |req: &wiremock::Request| {
+            let body = req.body.clone();
+            *cap.lock().unwrap() = Some(body);
+            ResponseTemplate::new(200).set_body_json(json!({
+                "response": "ok",
+                "done": true
+            }))
+        })
+        .mount(&server)
+        .await;
+
+    // 4000×3000 PNG (long edge 4000) — well above the cfg max 1024.
+    let big = common::large_blue_4000x3000_png();
+    let cfg = cfg_for_endpoint(&server.uri());
+    let _ = tokio::task::spawn_blocking({
+        let cfg = cfg.clone();
+        move || run_recognize(cfg, big, None)
+    })
+    .await
+    .expect("blocking task panicked")
+    .expect("recognize succeeds");
+
+    // Pull the request body, parse JSON, base64-decode the image, and
+    // verify the long edge is at most max_pixels (1024).
+    let raw = captured.lock().unwrap().clone().expect("request captured");
+    let value: serde_json::Value =
+        serde_json::from_slice(&raw).expect("request body is JSON");
+    let imgs = value
+        .get("images")
+        .and_then(|v| v.as_array())
+        .expect("images field present");
+    assert_eq!(imgs.len(), 1, "exactly one image sent");
+    let b64 = imgs[0].as_str().expect("image is base64 string");
+    use base64::Engine as _;
+    let decoded = base64::engine::general_purpose::STANDARD
+        .decode(b64)
+        .expect("base64 decodes");
+    let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
+        .with_guessed_format()
+        .expect("guess format");
+    let (w, h) = reader.into_dimensions().expect("dims");
+    let long = w.max(h);
+    assert!(
+        long <= 1024,
+        "long edge after downscale must be <= max_pixels (got {long})"
+    );
+    // Aspect ratio preserved within rounding.
+    let ratio_in = 4000.0 / 3000.0;
+    let ratio_out = w as f32 / h as f32;
+    assert!(
+        (ratio_in - ratio_out).abs() < 0.02,
+        "aspect ratio drift: in={ratio_in} out={ratio_out}"
+    );
+}
+
+// ── from_parts construction ──────────────────────────────────────────────
+
+#[test]
+fn from_parts_clamps_max_pixels_into_legal_range() {
+    let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 10).unwrap();
+    let too_big = OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999).unwrap();
+    // We can't read the private field directly, but engine_version is
+    // observable; assert the engine constructed at all (clamp didn't
+    // panic) and run a synthetic prompt.
+    assert_eq!(too_small.engine_name(), "ollama-vision");
+    assert_eq!(too_big.engine_name(), "ollama-vision");
+}
+
+// ── Integration test against real Ollama (opt-in) ────────────────────────
+
+/// End-to-end OCR against the workspace's real Ollama daemon. Skipped
+/// unless `KEBAB_OCR_INTEGRATION=1` so the regular `cargo test` run
+/// stays hermetic.
+///
+/// Run with:
+///
+/// ```sh
+/// KEBAB_OCR_INTEGRATION=1 \
+/// KEBAB_IMAGE_OCR_ENDPOINT=http://192.168.0.47:11434 \
+/// cargo test -p kebab-parse-image ocr_integration -- --ignored
+/// ```
+#[tokio::test]
+#[ignore = "hits a real Ollama daemon; gated behind KEBAB_OCR_INTEGRATION=1"]
+async fn ocr_integration_real_ollama_transcribes_text() {
+    if std::env::var("KEBAB_OCR_INTEGRATION").ok().as_deref() != Some("1") {
+        eprintln!("skipping ocr_integration: KEBAB_OCR_INTEGRATION != 1");
+        return;
+    }
+    let endpoint = std::env::var("KEBAB_IMAGE_OCR_ENDPOINT")
+        .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string());
+    let model =
+        std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string());
+
+    // Generate a fixture with known text.
+    let bytes = common::hello_world_png();
+    let cfg = {
+        let mut c = Config::defaults();
+        c.image.ocr.endpoint = endpoint;
+        c.image.ocr.model = model;
+        c.image.ocr.max_pixels = 1024;
+        c
+    };
+    let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
+        .await
+        .expect("blocking task panicked")
+        .expect("real Ollama OCR must succeed");
+    eprintln!("integration OCR result: {:?}", text.joined);
+    let normalized = text.joined.to_lowercase().replace(",", "").replace(".", "");
+    assert!(
+        normalized.contains("hello") && normalized.contains("world"),
+        "integration OCR did not capture expected text: {:?}",
+        text.joined
+    );
+}
diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md
index 9e63261..e11346a 100644
--- a/tasks/HOTFIXES.md
+++ b/tasks/HOTFIXES.md
@@ -14,6 +14,24 @@ historical contract that was implemented; this file accumulates the
 deltas so phase 5+ readers can find the live behavior without diffing
 git history.
 
+## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision
+
+**Discovered**: P6-2 implementation start.
+
+**Symptom**: The original `tasks/p6/p6-2-ocr-adapter.md` spec lists Tesseract as the default OCR engine (`tesseract = "0.13"`, feature `tesseract`, default ON). Bringing Tesseract online requires installing `libtesseract-dev` (and `tesseract-ocr-kor` for the spec-default Korean languages set) on every dev / CI host. The kebab dev environment intentionally avoids system-package installs, so the Tesseract Rust bindings can't link.
+
+**Root cause**: Spec was written assuming a Linux host with `apt install tesseract-ocr-*` available. The reality of single-developer local-first KB is that the same box also runs the Ollama vision endpoint already wired by P4-2 — using it for OCR adds zero new system dependencies.
+
+**Fix** (PR #33, feat/p6-2-ocr-adapter):
+- New `OllamaVisionOcr` adapter under `crates/kebab-parse-image/src/ocr.rs`. Implements the spec's `OcrEngine` trait by POSTing the image (base64) to `<endpoint>/api/generate` with a transcription prompt against `gemma4:e4b` (default) or any other vision-capable Ollama model.
+- New `kebab-config::ImageCfg.ocr` block (`enabled`, `engine`, `model`, `endpoint`, `languages`, `max_pixels`). `enabled` defaults to `false` because OCR adds a model call per asset; `engine` defaults to `"ollama-vision"`. `endpoint` falls back to `models.llm.endpoint` when empty so the same Ollama host serves both LLM and OCR.
+- The `OcrEngine` trait is unchanged from the spec — Tesseract / Apple Vision / PaddleOCR engines plug in as future feature-gated alternatives without touching the extractor or chunker. The trait abstraction is the part the spec actually demanded; only the choice of default implementation changes.
+- Tests cover wiremock unit paths (200 happy / 5xx / 200 error envelope / empty response / downscale honours `max_pixels`), `apply_ocr` provenance + error handling, and an opt-in `KEBAB_OCR_INTEGRATION=1` integration test that hits a real Ollama endpoint with a generated `"Hello World 2026"` PNG. Tesseract feature-gated tests from the original spec are deferred to whenever someone is willing to bring `libtesseract` to CI.
+
+**Trust note**: The original spec marked `OcrText` as "observed text (high trust)" to distinguish it from `ModelCaption`. With an LLM-driven default the line blurs — vision LMs can hallucinate. We kept `OcrText.engine = "ollama-vision"` so consumers can decide trust by engine identity. Future Tesseract / Apple Vision adapters write a different `engine` string and downstream code can branch.
+
+**Amends**: tasks/p6/p6-2-ocr-adapter.md (default engine; "Allowed dependencies" list — `reqwest` + `base64` replace `tesseract`; "Apple Vision" feature gate deferred; `min_confidence` config field dropped because the LM doesn't expose per-region confidence).
+
 ## 2026-05-01 — `--config` flag silently ignored across all kebab-cli subcommands
 
 **Discovered**: post-P3-5 manual smoke at `/tmp/kebab-smoke/`.
diff --git a/tasks/p6/p6-2-ocr-adapter.md b/tasks/p6/p6-2-ocr-adapter.md
index f349773..d11a3ab 100644
--- a/tasks/p6/p6-2-ocr-adapter.md
+++ b/tasks/p6/p6-2-ocr-adapter.md
@@ -3,7 +3,7 @@ phase: P6
 component: kebab-parse-image (OCR adapter)
 task_id: p6-2
 title: "OcrEngine trait + Tesseract adapter (Apple Vision feature-gated)"
-status: planned
+status: completed
 depends_on: [p6-1]
 unblocks: [p6-3]
 contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md