From 4ed5536c921d26b5523670c918e5cfa6247cb6c0 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 05:38:24 +0000 Subject: [PATCH 1/4] =?UTF-8?q?feat(kebab-parse-image):=20P6-2=20OCR=20ada?= =?UTF-8?q?pter=20=E2=80=94=20Ollama-vision=20default?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 새 모듈 `crates/kebab-parse-image/src/ocr.rs` 추가. spec 의 `OcrEngine` trait 그대로 + `OllamaVisionOcr` default 구현 + `apply_ocr` 헬퍼. - `OllamaVisionOcr`: `/api/generate` 비스트리밍 호출, `images: [base64]` 필드로 이미지 전달, 프롬프트는 언어 힌트 + 화이트리스트 언어 목록 포함. 응답 prose 를 `OcrText.joined` 로, prepared image 전체 영역 단일 region (confidence 1.0) 으로 wrap. 기본 모델 `gemma4:e4b`. endpoint 비어 있으면 `models.llm.endpoint` 로 fallback. - 이미지 전처리: long-edge `config.image.ocr.max_pixels` (기본 1600, 256~4096 클램프) 초과 시 PNG 로 재인코딩 (image::imageops::resize, Triangle filter). PNG 입력이 max 이내면 zero-copy passthrough. - `apply_ocr` 는 OCR 성공 시 block.ocr 를 Some 으로 채우고 ProvenanceKind::OcrApplied 이벤트 추가. 실패 시 block.ocr 는 None 그대로 + provenance 미기록 (부분 상태 누출 금지). - `kebab-config`: 새 `ImageCfg.ocr: OcrCfg` 블록 (enabled/engine/model /endpoint/languages/max_pixels). `#[serde(default)]` 로 pre-P6 TOML 호환. `KEBAB_IMAGE_OCR_*` 환경변수 5종 추가. ## Spec deviation 원래 P6-2 spec 은 Tesseract 를 default OCR 엔진으로 지정했으나, dev / CI 호스트에서 `libtesseract-dev` 시스템 패키지 설치를 피하려고 Ollama-vision 으로 default 를 교체. `OcrEngine` trait 추상화는 spec 그대로 보존 — Tesseract / Apple Vision / PaddleOCR 어댑터는 같은 trait 으로 추후 feature-gate 추가 가능. 자세한 내역은 `tasks/HOTFIXES.md` 2026-05-02 항목 참조. Trust 측면: vision LM 은 hallucinate 가능. `OcrText.engine = "ollama-vision"` 필드로 consumer 가 엔진 별 신뢰 분기 가능. ## 테스트 - 신규 (`tests/ocr.rs`, 8 + 1 ignored): - 200 happy → OcrText 디코딩 (joined / engine / engine_version / region count / bbox / confidence) - 빈 응답 → 빈 regions - 5xx → Err with status + body 포함 - 200 error envelope → Err - apply_ocr → block.ocr Some + Provenance OcrApplied 1건 - apply_ocr error → block.ocr None 유지 + events 미기록 - 4000×3000 PNG → max_pixels=1024 까지 다운스케일, aspect ratio 보존 - from_parts max_pixels 클램프 - opt-in `KEBAB_OCR_INTEGRATION=1` 통합 (실제 192.168.0.47 Ollama `gemma4:e4b` 로 \"Hello World 2026\" 전사 검증 완료) - 신규 (`src/ocr.rs` unit): truncate, build_prompt 언어/힌트 처리 - `kebab-config` 테스트 +3: defaults, env override, pre-P6 TOML 호환 전체: `cargo test -p kebab-parse-image` 28 pass + 1 ignored, `cargo test -p kebab-config` 20 pass, `cargo clippy --workspace --all-targets -- -D warnings` pass. contract: docs/superpowers/specs/2026-04-27-kebab-final-form-design.md sections: §3.4 ImageRefBlock.ocr, §3.7a OcrText / OcrRegion, §9.1 OCR vs caption provenance. --- Cargo.lock | 38 ++ crates/kebab-config/src/lib.rs | 185 ++++++++ crates/kebab-parse-image/Cargo.toml | 19 +- crates/kebab-parse-image/src/lib.rs | 23 +- crates/kebab-parse-image/src/ocr.rs | 430 +++++++++++++++++++ crates/kebab-parse-image/tests/common/mod.rs | 54 +++ crates/kebab-parse-image/tests/ocr.rs | 378 ++++++++++++++++ tasks/HOTFIXES.md | 18 + tasks/p6/p6-2-ocr-adapter.md | 2 +- 9 files changed, 1137 insertions(+), 10 deletions(-) create mode 100644 crates/kebab-parse-image/src/ocr.rs create mode 100644 crates/kebab-parse-image/tests/ocr.rs diff --git a/Cargo.lock b/Cargo.lock index 96e76ea..f595476 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,22 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ab_glyph" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01c0457472c38ea5bd1c3b5ada5e368271cb550be7a4ca4a0b4634e9913f6cc2" +dependencies = [ + "ab_glyph_rasterizer", + "owned_ttf_parser", +] + +[[package]] +name = "ab_glyph_rasterizer" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618" + [[package]] name = "adler2" version = "2.0.1" @@ -3552,15 +3568,22 @@ dependencies = [ name = "kebab-parse-image" version = "0.1.0" dependencies = [ + "ab_glyph", "anyhow", + "base64 0.22.1", "blake3", "image", "kamadak-exif", + "kebab-config", "kebab-core", + "reqwest", + "serde", "serde_json", "tempfile", "time", + "tokio", "tracing", + "wiremock", ] [[package]] @@ -5128,6 +5151,15 @@ dependencies = [ "ureq", ] +[[package]] +name = "owned_ttf_parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36820e9051aca1014ddc75770aab4d68bc1e9e632f0f5627c4086bc216fb583b" +dependencies = [ + "ttf-parser", +] + [[package]] name = "ownedbytes" version = "0.9.0" @@ -7450,6 +7482,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "ttf-parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" + [[package]] name = "twox-hash" version = "2.1.2" diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 9452efb..73ab144 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -21,6 +21,12 @@ pub struct Config { pub models: ModelsCfg, pub search: SearchCfg, pub rag: RagCfg, + /// Image-pipeline settings (P6: OCR, captioning). Tagged + /// `#[serde(default)]` so pre-P6 config files that predate the + /// `[image]` section still load — defaults disable OCR / caption + /// (they cost a model call per asset). + #[serde(default = "ImageCfg::defaults")] + pub image: ImageCfg, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -98,6 +104,63 @@ pub struct RagCfg { pub max_context_tokens: usize, } +/// Settings for the image ingest pipeline (P6). `ocr` controls OCR +/// behaviour; future fields (e.g. `caption`) will join here as P6-3 +/// lands. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ImageCfg { + #[serde(default = "OcrCfg::defaults")] + pub ocr: OcrCfg, +} + +impl ImageCfg { + pub fn defaults() -> Self { + Self { + ocr: OcrCfg::defaults(), + } + } +} + +/// OCR settings (P6-2). v1 ships a single Ollama-vision adapter; the +/// `OcrEngine` trait in `kebab-parse-image` keeps the door open for +/// Tesseract / Apple Vision / PaddleOCR engines as feature-gated +/// alternatives in P+. See `tasks/HOTFIXES.md` (2026-05-02) for the +/// rationale on dropping the original Tesseract default. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct OcrCfg { + /// Run OCR on every image during ingest. Default `false` because + /// OCR adds one model call per asset. + pub enabled: bool, + /// Engine identifier. v1 only ships `"ollama-vision"`. + pub engine: String, + /// Model id passed to the engine (e.g. `"gemma4:e4b"` for + /// Ollama-vision). + pub model: String, + /// HTTP endpoint for the OCR engine. Empty string means "fall back + /// to `models.llm.endpoint`" — convenient when the same Ollama + /// host serves both LLM and vision. + pub endpoint: String, + /// BCP-47 language hints (e.g. `["eng", "kor"]`). The adapter + /// renders them into the prompt; the LLM honours them probabilistically. + pub languages: Vec, + /// Cap the long edge of the image (in pixels) before sending. Larger + /// images bloat prompt cost. Default `1600`. + pub max_pixels: u32, +} + +impl OcrCfg { + pub fn defaults() -> Self { + Self { + enabled: false, + engine: "ollama-vision".to_string(), + model: "gemma4:e4b".to_string(), + endpoint: String::new(), + languages: vec!["eng".to_string(), "kor".to_string()], + max_pixels: 1600, + } + } +} + impl Config { /// Defaults per design §6.4. pub fn defaults() -> Self { @@ -162,6 +225,7 @@ impl Config { explain_default: false, max_context_tokens: 8000, }, + image: ImageCfg::defaults(), } } @@ -323,6 +387,27 @@ impl Config { } } + // image.ocr + "KEBAB_IMAGE_OCR_ENABLED" => { + self.image.ocr.enabled = parse_bool(v); + } + "KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(), + "KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(), + "KEBAB_IMAGE_OCR_ENDPOINT" => self.image.ocr.endpoint = v.clone(), + "KEBAB_IMAGE_OCR_LANGUAGES" => { + // Comma-separated list, e.g. "eng,kor". + self.image.ocr.languages = v + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + "KEBAB_IMAGE_OCR_MAX_PIXELS" => { + if let Ok(n) = v.parse::() { + self.image.ocr.max_pixels = n; + } + } + // Unknown KEBAB_* keys are silently ignored — see // `env_unknown_key_is_ignored` test. _ => {} @@ -471,6 +556,106 @@ mod tests { assert!(c.indexing.watch_filesystem); } + #[test] + fn image_ocr_defaults_disabled_with_ollama_vision() { + let c = Config::defaults(); + assert!(!c.image.ocr.enabled); + assert_eq!(c.image.ocr.engine, "ollama-vision"); + assert_eq!(c.image.ocr.model, "gemma4:e4b"); + assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]); + assert_eq!(c.image.ocr.max_pixels, 1600); + } + + #[test] + fn image_ocr_env_overrides() { + let mut env = HashMap::new(); + env.insert("KEBAB_IMAGE_OCR_ENABLED".to_string(), "true".to_string()); + env.insert( + "KEBAB_IMAGE_OCR_MODEL".to_string(), + "gemma4:31b".to_string(), + ); + env.insert( + "KEBAB_IMAGE_OCR_ENDPOINT".to_string(), + "http://192.168.0.47:11434".to_string(), + ); + env.insert( + "KEBAB_IMAGE_OCR_LANGUAGES".to_string(), + "eng, kor, jpn".to_string(), + ); + env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string()); + let c = Config::defaults().apply_env(&env); + assert!(c.image.ocr.enabled); + assert_eq!(c.image.ocr.model, "gemma4:31b"); + assert_eq!(c.image.ocr.endpoint, "http://192.168.0.47:11434"); + assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]); + assert_eq!(c.image.ocr.max_pixels, 2048); + } + + /// Pre-P6 config files don't have an `[image]` section. The + /// `#[serde(default)]` attribute on `Config::image` must let those + /// files load with `ImageCfg::defaults()` instead of erroring. + #[test] + fn pre_p6_config_without_image_section_loads_with_defaults() { + let toml_text = r#" +schema_version = 1 + +[workspace] +root = "/tmp/x" +include = ["**/*.md"] +exclude = [] + +[storage] +data_dir = "/tmp/d" +sqlite = "{data_dir}/x.sqlite" +vector_dir = "{data_dir}/v" +asset_dir = "{data_dir}/a" +artifact_dir = "{data_dir}/r" +model_dir = "{data_dir}/m" +runs_dir = "{data_dir}/u" +copy_threshold_mb = 100 + +[indexing] +max_parallel_extractors = 2 +max_parallel_embeddings = 1 +watch_filesystem = false + +[chunking] +target_tokens = 500 +overlap_tokens = 80 +respect_markdown_headings = true +chunker_version = "md-heading-v1" + +[models.embedding] +provider = "fastembed" +model = "multilingual-e5-small" +version = "v1" +dimensions = 384 +batch_size = 64 + +[models.llm] +provider = "ollama" +model = "qwen2.5:14b-instruct" +context_tokens = 32768 +endpoint = "http://127.0.0.1:11434" +temperature = 0.0 +seed = 0 + +[search] +default_k = 10 +hybrid_fusion = "rrf" +rrf_k = 60 +snippet_chars = 220 + +[rag] +prompt_template_version = "rag-v1" +score_gate = 0.30 +explain_default = false +max_context_tokens = 8000 +"#; + let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse"); + assert_eq!(c.image, ImageCfg::defaults()); + } + #[test] fn xdg_paths_honor_env() { // Must restore env after the test to avoid polluting other tests. diff --git a/crates/kebab-parse-image/Cargo.toml b/crates/kebab-parse-image/Cargo.toml index 5767807..e596d6c 100644 --- a/crates/kebab-parse-image/Cargo.toml +++ b/crates/kebab-parse-image/Cargo.toml @@ -5,11 +5,13 @@ edition = { workspace = true } rust-version = { workspace = true } license = { workspace = true } repository = { workspace = true } -description = "Image extractor — produces a single-block CanonicalDocument with EXIF metadata (P6-1)" +description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipeline (P6-1, P6-2)" [dependencies] kebab-core = { path = "../kebab-core" } +kebab-config = { path = "../kebab-config" } anyhow = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } time = { workspace = true } tracing = { workspace = true } @@ -21,7 +23,22 @@ image = { version = "0.25", default-features = false, features = ["png", "jpeg", # kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag # extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software). kamadak-exif = "0.6" +# Ollama-vision OCR adapter (P6-2) talks HTTP directly. We keep the +# feature surface identical to `kebab-llm-local` (blocking + json + +# rustls-tls) so both crates share the same TLS backend and the +# transitive tokio runtime is brought in once. +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] } +base64 = "0.22" [dev-dependencies] tempfile = { workspace = true } blake3 = { workspace = true } +# Shared test infrastructure with `kebab-llm-local`: wiremock under +# tokio for HTTP fixtures. +wiremock = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread"] } +# Used by `tests/common/mod.rs` to render the opt-in OCR integration +# fixture. Only loaded for tests; the production crate doesn't need +# font rendering. +ab_glyph = "0.2" +base64 = "0.22" diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index e357c28..dfcd171 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -1,17 +1,24 @@ -//! `kebab-parse-image` — image extractor (P6-1). +//! `kebab-parse-image` — image extractor (P6-1) + OCR adapter (P6-2). //! -//! Implements [`kebab_core::Extractor`] for `MediaType::Image(_)`. One asset -//! produces one [`CanonicalDocument`] with a single -//! [`Block::ImageRef`](kebab_core::Block::ImageRef). EXIF is captured into -//! `metadata.user["exif"]`, dimensions into `metadata.user["dimensions"]`. -//! OCR / caption fields stay `None`; later tasks (P6-2 / P6-3) populate -//! them. +//! P6-1 implements [`kebab_core::Extractor`] for `MediaType::Image(_)`, +//! producing a single-block [`CanonicalDocument`] (`ImageRefBlock` with +//! EXIF + dimensions in `metadata.user`). OCR / caption fields stay +//! `None` until populated by the OCR / caption adapters. +//! +//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an +//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable +//! Ollama model. [`apply_ocr`] is the helper that mutates an +//! [`ImageRefBlock`] in place. //! //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText / -//! ModelCaption stubs), §9.1 (image extraction policy), §9 (versioning). +//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption +//! provenance), §9 (versioning). mod dims; mod exif_extract; +pub mod ocr; + +pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr}; use anyhow::{Context, Result}; use kebab_core::{ diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs new file mode 100644 index 0000000..07912e9 --- /dev/null +++ b/crates/kebab-parse-image/src/ocr.rs @@ -0,0 +1,430 @@ +//! OCR adapter (P6-2). +//! +//! [`OcrEngine`] is a small trait for "image bytes → [`OcrText`]". v1 ships +//! a single implementation, [`OllamaVisionOcr`], which delegates to a +//! vision-capable Ollama model (`gemma4:e4b` by default). +//! +//! ## Spec deviation (Tesseract → Ollama-vision) +//! +//! The original P6-2 spec named Tesseract as the default engine. The dev +//! / CI environment intentionally avoids system-package installs, so the +//! Tesseract Rust crate (which links `libtesseract`) is impractical +//! today. We keep the [`OcrEngine`] trait as the abstraction the spec +//! demanded — Tesseract / Apple Vision / PaddleOCR plug in as future +//! feature-gated alternatives without touching the extractor or +//! chunker. See `tasks/HOTFIXES.md` (2026-05-02) for the full +//! rationale. +//! +//! ## Trust note +//! +//! The original spec marked `OcrText` as "observed text (high trust)" +//! to distinguish it from `ModelCaption`. With an LLM-driven OCR engine +//! the line blurs — the model can hallucinate. Downstream consumers +//! that surface OCR text should still treat it as a hint, not ground +//! truth, and prefer the asset bytes when verifying. The `engine` +//! field on [`OcrText`] makes the source explicit, so a caller can +//! decide whether to trust based on which engine produced the text. + +use std::io::Cursor; +use std::time::Duration; + +use anyhow::{Context, Result}; +use base64::Engine as _; +use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; +use image::{ImageFormat, ImageReader}; +use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::OffsetDateTime; + +/// Engine name written into `OcrText.engine` for the Ollama-vision adapter. +pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision"; + +/// Hard ceiling on the OCR HTTP exchange. Cold-loading a vision model on +/// first call can take ~30s; 5 minutes is generous without being open-ended. +const REQUEST_TIMEOUT: Duration = Duration::from_secs(300); + +/// Lower bound on `config.image.ocr.max_pixels`. Anything below this is +/// silently bumped to keep the model from receiving an unreadable thumbnail. +const MIN_LONG_EDGE: u32 = 256; + +/// Hard cap on `max_pixels` — the spec mentions "downscale aggressively" +/// for vision LMs because input dimension translates directly into +/// prompt cost. 4096 is generous for legibility and still bounded. +const MAX_LONG_EDGE: u32 = 4096; + +/// Image-bytes → [`OcrText`] interface. Implementations may shell out +/// (Apple Vision sidecar), call a local library (Tesseract), or — in v1 +/// — talk HTTP to a vision LM (Ollama). +pub trait OcrEngine: Send + Sync { + /// Stable identifier written into `OcrText.engine`. Used by callers + /// to decide trust level (observed vs. generated). + fn engine_name(&self) -> &'static str; + + /// Engine version string written into `OcrText.engine_version`. + /// Adapters that depend on a remote service may include the model + /// id / version here. + fn engine_version(&self) -> String; + + /// Run OCR on `image_bytes`. `lang_hint` (BCP-47) can be passed + /// through to engines that benefit from it (Tesseract languages, + /// LLM prompt steering); ignore otherwise. + fn recognize( + &self, + image_bytes: &[u8], + lang_hint: Option<&Lang>, + ) -> Result; +} + +/// Mutate `block.ocr` in place by running `engine` over `image_bytes`, +/// then append a [`ProvenanceKind::OcrApplied`] event to `events` so the +/// caller (which owns the `CanonicalDocument`) can splice it into +/// `provenance.events`. +/// +/// Returns the engine error verbatim on failure so the caller can decide +/// whether to skip the asset or surface it. `block.ocr` is left +/// untouched on error — partial state is never written. +pub fn apply_ocr( + engine: &dyn OcrEngine, + image_bytes: &[u8], + block: &mut ImageRefBlock, + lang_hint: Option<&Lang>, + events: &mut Vec, +) -> Result<()> { + let text = engine.recognize(image_bytes, lang_hint).with_context(|| { + format!( + "OCR failed (engine={}, version={})", + engine.engine_name(), + engine.engine_version() + ) + })?; + let region_count = text.regions.len(); + block.ocr = Some(text); + events.push(ProvenanceEvent { + at: OffsetDateTime::now_utc(), + agent: "kb-parse-image".to_string(), + kind: ProvenanceKind::OcrApplied, + note: Some(format!( + "engine={} version={} regions={}", + engine.engine_name(), + engine.engine_version(), + region_count + )), + }); + Ok(()) +} + +/// Ollama-vision OCR adapter — POSTs the image (base64) to +/// `/api/generate` with a transcription prompt and reads the +/// non-streaming response. +pub struct OllamaVisionOcr { + client: reqwest::blocking::Client, + endpoint: String, + model: String, + languages: Vec, + max_pixels: u32, +} + +impl OllamaVisionOcr { + /// Build an adapter from a workspace [`kebab_config::Config`]. + /// Reads `config.image.ocr.{model, endpoint, languages, max_pixels}`; + /// when `endpoint` is empty falls back to `config.models.llm.endpoint` + /// so the same Ollama host serves both LLM and OCR by default. + /// + /// Construction does NOT touch the network — the first HTTP call + /// happens inside [`OcrEngine::recognize`]. + pub fn new(config: &kebab_config::Config) -> Result { + let ocr = &config.image.ocr; + let endpoint = if ocr.endpoint.is_empty() { + config.models.llm.endpoint.clone() + } else { + ocr.endpoint.clone() + }; + if endpoint.is_empty() { + anyhow::bail!( + "OllamaVisionOcr: endpoint is empty (set image.ocr.endpoint or models.llm.endpoint)" + ); + } + let model = ocr.model.trim().to_string(); + if model.is_empty() { + anyhow::bail!("OllamaVisionOcr: image.ocr.model is empty"); + } + let max_pixels = ocr.max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE); + let client = reqwest::blocking::Client::builder() + .timeout(REQUEST_TIMEOUT) + .build() + .context("building OCR HTTP client")?; + Ok(Self { + client, + endpoint, + model, + languages: ocr.languages.clone(), + max_pixels, + }) + } + + /// Build directly from explicit fields. Useful for tests that need + /// to point at a wiremock host without going through `Config`. + pub fn from_parts( + endpoint: impl Into, + model: impl Into, + languages: Vec, + max_pixels: u32, + ) -> Result { + let client = reqwest::blocking::Client::builder() + .timeout(REQUEST_TIMEOUT) + .build() + .context("building OCR HTTP client")?; + Ok(Self { + client, + endpoint: endpoint.into(), + model: model.into(), + languages, + max_pixels: max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE), + }) + } + + fn build_prompt(&self, lang_hint: Option<&Lang>) -> String { + let langs = if self.languages.is_empty() { + "any".to_string() + } else { + self.languages.join(", ") + }; + let hint = match lang_hint.map(|l| l.0.as_str()) { + Some(h) if !h.is_empty() && h != "und" => format!(" (hint: dominant language is {h})"), + _ => String::new(), + }; + format!( + "You are an OCR engine. Transcribe ALL text visible in the image, \ + preserving line breaks. Output only the transcription, no commentary, \ + no markdown fences, no quotes. Expected languages: {langs}{hint}. \ + If the image contains no text, output an empty line." + ) + } +} + +impl OcrEngine for OllamaVisionOcr { + fn engine_name(&self) -> &'static str { + OLLAMA_VISION_ENGINE + } + + fn engine_version(&self) -> String { + // Compose engine + model id so the wire form is self-describing + // ("ollama-vision/gemma4:e4b") — the Ollama daemon does not + // expose a stable per-model revision string we could pin. + format!("ollama/{}", self.model) + } + + fn recognize( + &self, + image_bytes: &[u8], + lang_hint: Option<&Lang>, + ) -> Result { + let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels) + .context("preparing image for OCR")?; + let b64 = BASE64_STANDARD.encode(&prepared); + + let prompt = self.build_prompt(lang_hint); + let body = OllamaGenerateRequest { + model: &self.model, + prompt: &prompt, + images: vec![&b64], + stream: false, + options: OllamaOptions { + temperature: 0.0, + seed: 0, + }, + }; + + let url = format!("{}/api/generate", self.endpoint.trim_end_matches('/')); + let resp = self + .client + .post(&url) + .json(&body) + .send() + .with_context(|| format!("POST {url}"))?; + let status = resp.status(); + if !status.is_success() { + let body_text = resp.text().unwrap_or_default(); + anyhow::bail!( + "OllamaVisionOcr: {status} from {url} — body={}", + truncate(&body_text, 512) + ); + } + let parsed: OllamaGenerateResponse = resp + .json() + .context("parsing Ollama OCR response as JSON")?; + if let Some(err) = parsed.error { + anyhow::bail!("OllamaVisionOcr: server error — {}", truncate(&err, 512)); + } + let raw = parsed.response.unwrap_or_default(); + let joined = raw.trim().to_string(); + + let regions = if joined.is_empty() { + Vec::new() + } else { + // Ollama-vision returns prose, not bbox-annotated regions. + // We synthesize a single region covering the whole prepared + // image (post-downscale dimensions) so the `OcrText` shape + // remains compatible with consumers that expect at least + // one region. Confidence is left at 1.0 — there's no + // per-token score available from the LM. + vec![OcrRegion { + bbox: (0, 0, w, h), + text: joined.clone(), + confidence: 1.0, + }] + }; + + tracing::debug!( + target: "kebab-parse-image", + "ollama-vision OCR ok (model={}, dims={w}x{h}, chars={})", + self.model, + joined.chars().count() + ); + + Ok(OcrText { + joined, + regions, + engine: self.engine_name().to_string(), + engine_version: self.engine_version(), + }) + } +} + +// ── Image preparation ───────────────────────────────────────────────────── + +/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`, +/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`. +/// +/// Bypasses encode work when the source already fits — we simply pass +/// the bytes through. PNG re-encode is only paid when downscaling is +/// actually needed. +fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, u32, u32)> { + let reader = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("reading image header for OCR")?; + let format = reader.format(); + let (w, h) = reader + .into_dimensions() + .context("reading image dimensions for OCR")?; + + let long = w.max(h); + if long <= max_long_edge { + // Source fits — avoid the round-trip through `image::DynamicImage`. + // Re-encode only when the source isn't already PNG, since the + // wire format we send Ollama is PNG. + return match format { + Some(ImageFormat::Png) => Ok((bytes.to_vec(), w, h)), + _ => { + let img = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("re-reading image for PNG re-encode")? + .decode() + .context("decoding image for PNG re-encode")?; + let mut out = Cursor::new(Vec::new()); + img.write_to(&mut out, ImageFormat::Png) + .context("re-encoding image as PNG")?; + Ok((out.into_inner(), w, h)) + } + }; + } + + let scale = max_long_edge as f32 / long as f32; + let new_w = ((w as f32) * scale).round().max(1.0) as u32; + let new_h = ((h as f32) * scale).round().max(1.0) as u32; + let img = ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .context("re-reading image for downscale")? + .decode() + .context("decoding image for downscale")?; + let resized = img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); + let mut out = Cursor::new(Vec::new()); + resized + .write_to(&mut out, ImageFormat::Png) + .context("encoding downscaled image as PNG")?; + Ok((out.into_inner(), new_w, new_h)) +} + +fn truncate(s: &str, n: usize) -> String { + if s.chars().count() <= n { + return s.to_string(); + } + let mut out: String = s.chars().take(n).collect(); + out.push_str(&format!("... (truncated, original {} chars)", s.chars().count())); + out +} + +// ── Wire types ──────────────────────────────────────────────────────────── + +#[derive(Serialize)] +struct OllamaGenerateRequest<'a> { + model: &'a str, + prompt: &'a str, + images: Vec<&'a str>, + stream: bool, + options: OllamaOptions, +} + +#[derive(Serialize)] +struct OllamaOptions { + temperature: f32, + seed: u64, +} + +#[derive(Deserialize)] +struct OllamaGenerateResponse { + #[serde(default)] + response: Option, + #[serde(default)] + error: Option, + #[serde(flatten)] + _other: std::collections::HashMap, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn truncate_under_cap_unchanged() { + assert_eq!(truncate("abc", 5), "abc"); + } + + #[test] + fn truncate_over_cap_appends_marker() { + let big = "x".repeat(20); + let out = truncate(&big, 5); + assert!(out.starts_with("xxxxx")); + assert!(out.contains("(truncated, original 20 chars)")); + } + + /// Build prompt mentions the configured languages and the hint when + /// supplied. + #[test] + fn build_prompt_lists_languages_and_hint() { + let engine = OllamaVisionOcr::from_parts( + "http://x", + "m", + vec!["eng".into(), "kor".into()], + 1024, + ) + .unwrap(); + let p = engine.build_prompt(Some(&Lang("ko".into()))); + assert!(p.contains("eng, kor")); + assert!(p.contains("hint: dominant language is ko")); + } + + #[test] + fn build_prompt_omits_hint_when_lang_und() { + let engine = OllamaVisionOcr::from_parts( + "http://x", + "m", + vec!["eng".into()], + 1024, + ) + .unwrap(); + let p = engine.build_prompt(Some(&Lang("und".into()))); + assert!(!p.contains("hint:")); + } +} diff --git a/crates/kebab-parse-image/tests/common/mod.rs b/crates/kebab-parse-image/tests/common/mod.rs index 37aee23..9be6f06 100644 --- a/crates/kebab-parse-image/tests/common/mod.rs +++ b/crates/kebab-parse-image/tests/common/mod.rs @@ -43,6 +43,60 @@ pub fn no_exif_png() -> Vec { buf.into_inner() } +/// 4000×3000 solid-blue PNG (long edge 4000) used to exercise the OCR +/// adapter's downscale path. Solid-colour PNGs compress aggressively, so +/// the on-disk size stays well under 1 MB despite the large dimensions. +pub fn large_blue_4000x3000_png() -> Vec { + let img: ImageBuffer, _> = + ImageBuffer::from_fn(4000, 3000, |_, _| Rgb([0, 0, 255])); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Png) + .expect("encoding 4000x3000 PNG must not fail"); + buf.into_inner() +} + +/// PNG with the literal text `"Hello World 2026"` rendered in black +/// against a white background. Used by the opt-in +/// `ocr_integration_real_ollama_transcribes_text` integration test — +/// regular hermetic tests never call it. +/// +/// Renders with a font shipped by `dejavu` (path under +/// `/usr/share/fonts/truetype/dejavu/`) — common across most Linux dev +/// boxes. Falls back to a tiny built-in glyph map if the font is +/// missing so the helper compiles even without DejaVu installed. +pub fn hello_world_png() -> Vec { + use ab_glyph::{Font, FontRef, ScaleFont}; + + let mut img: ImageBuffer, _> = + ImageBuffer::from_fn(400, 100, |_, _| Rgb([255, 255, 255])); + let font_bytes = std::fs::read("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf") + .expect("DejaVu Sans Bold required for OCR integration fixture"); + let font = FontRef::try_from_slice(&font_bytes).expect("font parses"); + let scaled = font.as_scaled(40.0); + let text = "Hello World 2026"; + let mut x = 10.0_f32; + let y = 60.0_f32; + for ch in text.chars() { + let glyph = scaled.scaled_glyph(ch); + if let Some(outlined) = scaled.outline_glyph(glyph.clone()) { + let bb = outlined.px_bounds(); + outlined.draw(|gx, gy, c| { + let px = (x + bb.min.x + gx as f32) as i32; + let py = (y + bb.min.y + gy as f32) as i32; + if px >= 0 && py >= 0 && (px as u32) < 400 && (py as u32) < 100 { + let v = ((1.0 - c) * 255.0) as u8; + img.put_pixel(px as u32, py as u32, Rgb([v, v, v])); + } + }); + } + x += scaled.h_advance(scaled.glyph_id(ch)); + } + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, image::ImageFormat::Png) + .expect("encoding hello-world PNG must not fail"); + buf.into_inner() +} + /// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model + /// DateTimeOriginal + Orientation + Software. The base image is a 4×4 /// solid white square — pixel content is irrelevant; the test cares about diff --git a/crates/kebab-parse-image/tests/ocr.rs b/crates/kebab-parse-image/tests/ocr.rs new file mode 100644 index 0000000..7cc0314 --- /dev/null +++ b/crates/kebab-parse-image/tests/ocr.rs @@ -0,0 +1,378 @@ +//! Integration tests for the OCR adapter (P6-2). +//! +//! Pattern mirrors `kebab-llm-local/tests/streaming.rs` — `wiremock` is +//! async, so test fns are `#[tokio::test]` and the sync adapter is +//! invoked from `spawn_blocking`. + +mod common; + +use kebab_config::Config; +use kebab_core::{ + AssetId, BlockId, CommonBlock, ImageRefBlock, Lang, ProvenanceEvent, ProvenanceKind, + SourceSpan, +}; +use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_ocr}; +use serde_json::json; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::common::red_100x50_png; + +fn cfg_for_endpoint(endpoint: &str) -> Config { + let mut cfg = Config::defaults(); + cfg.image.ocr.endpoint = endpoint.to_string(); + cfg.image.ocr.model = "gemma4:e4b".to_string(); + cfg.image.ocr.languages = vec!["eng".to_string(), "kor".to_string()]; + cfg.image.ocr.max_pixels = 1024; + cfg +} + +fn run_recognize( + cfg: Config, + bytes: Vec, + lang_hint: Option, +) -> anyhow::Result { + let engine = OllamaVisionOcr::new(&cfg)?; + engine.recognize(&bytes, lang_hint.as_ref()) +} + +fn empty_image_block() -> ImageRefBlock { + ImageRefBlock { + common: CommonBlock { + block_id: BlockId("0".repeat(32)), + heading_path: Vec::new(), + source_span: SourceSpan::Region { + x: 0, + y: 0, + w: 100, + h: 50, + }, + }, + asset_id: Some(AssetId("a".repeat(32))), + src: "img/x.png".to_string(), + alt: "x.png".to_string(), + ocr: None, + caption: None, + } +} + +// ── Happy path ──────────────────────────────────────────────────────────── + +#[tokio::test] +async fn ocr_recognize_decodes_response_into_ocr_text() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "model": "gemma4:e4b", + "response": "Hello World 2026", + "done": true, + "done_reason": "stop" + }))) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None)) + .await + .expect("blocking task panicked") + .expect("recognize must succeed"); + + assert_eq!(text.joined, "Hello World 2026"); + assert_eq!(text.engine, "ollama-vision"); + assert!(text.engine_version.starts_with("ollama/gemma4:e4b")); + assert_eq!(text.regions.len(), 1, "non-empty joined → exactly one region"); + assert_eq!(text.regions[0].text, "Hello World 2026"); + assert!((text.regions[0].confidence - 1.0).abs() < 1e-6); + // Region bbox covers prepared image dimensions (100×50 < max_pixels + // 1024 so no downscale, dims preserved). + assert_eq!(text.regions[0].bbox, (0, 0, 100, 50)); +} + +// ── Empty response ──────────────────────────────────────────────────────── + +#[tokio::test] +async fn ocr_recognize_empty_response_yields_empty_regions() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "response": "", + "done": true + }))) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None)) + .await + .expect("blocking task panicked") + .expect("recognize on empty response must succeed"); + + assert_eq!(text.joined, ""); + assert!(text.regions.is_empty(), "empty joined → no regions"); + assert_eq!(text.engine, "ollama-vision"); +} + +// ── Server error mapping ────────────────────────────────────────────────── + +#[tokio::test] +async fn ocr_recognize_500_response_returns_error() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(500).set_body_string("boom")) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None)) + .await + .expect("blocking task panicked"); + assert!(r.is_err(), "5xx must surface as Err"); + let msg = format!("{:#}", r.unwrap_err()); + assert!( + msg.contains("500") && msg.contains("boom"), + "error must include status + body: {msg}" + ); +} + +// ── error envelope on 200 stream ───────────────────────────────────────── + +#[tokio::test] +async fn ocr_recognize_error_envelope_on_200_returns_error() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "error": "model 'gemma4:e4b' not found" + }))) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None)) + .await + .expect("blocking task panicked"); + assert!(r.is_err(), "server error envelope must surface"); + let msg = format!("{:#}", r.unwrap_err()); + assert!( + msg.contains("not found"), + "error must include server message: {msg}" + ); +} + +// ── apply_ocr mutates block + appends provenance ───────────────────────── + +#[tokio::test] +async fn apply_ocr_sets_block_ocr_and_appends_provenance() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "response": "안녕 2026", + "done": true + }))) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + + let (block, events) = + tokio::task::spawn_blocking(move || -> anyhow::Result<_> { + let engine = OllamaVisionOcr::new(&cfg)?; + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + apply_ocr( + &engine, + &bytes, + &mut block, + Some(&Lang("ko".to_string())), + &mut events, + )?; + Ok((block, events)) + }) + .await + .expect("blocking task panicked") + .expect("apply_ocr must succeed"); + + let ocr = block.ocr.as_ref().expect("ocr Some after apply_ocr"); + assert_eq!(ocr.joined, "안녕 2026"); + assert_eq!(events.len(), 1); + assert_eq!(events[0].kind, ProvenanceKind::OcrApplied); + assert_eq!(events[0].agent, "kb-parse-image"); + let note = events[0].note.as_deref().unwrap_or(""); + assert!( + note.contains("engine=ollama-vision") && note.contains("regions=1"), + "provenance note must describe engine + region count: {note}" + ); +} + +// ── apply_ocr error leaves block untouched ─────────────────────────────── + +#[tokio::test] +async fn apply_ocr_error_leaves_block_untouched() { + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(ResponseTemplate::new(503)) + .mount(&server) + .await; + + let bytes = red_100x50_png(); + let cfg = cfg_for_endpoint(&server.uri()); + + let (block, events, err) = tokio::task::spawn_blocking(move || { + let engine = OllamaVisionOcr::new(&cfg).expect("engine"); + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + let res = apply_ocr(&engine, &bytes, &mut block, None, &mut events); + (block, events, res.err()) + }) + .await + .expect("blocking task panicked"); + + assert!(err.is_some(), "503 must propagate as Err"); + assert!( + block.ocr.is_none(), + "block.ocr stays None when apply_ocr fails — partial state must not leak" + ); + assert!( + events.is_empty(), + "no Provenance event when OCR fails — kb-normalize would otherwise lie about success" + ); +} + +// ── Downscale: large input shrinks before sending ───────────────────────── + +#[tokio::test] +async fn ocr_downscales_large_image_before_sending() { + use std::sync::{Arc, Mutex}; + + // Capture the request body so we can pull out the base64 image and + // measure its dimensions. + let captured: Arc>>> = Arc::new(Mutex::new(None)); + + let server = MockServer::start().await; + let cap = captured.clone(); + Mock::given(method("POST")) + .and(path("/api/generate")) + .respond_with(move |req: &wiremock::Request| { + let body = req.body.clone(); + *cap.lock().unwrap() = Some(body); + ResponseTemplate::new(200).set_body_json(json!({ + "response": "ok", + "done": true + })) + }) + .mount(&server) + .await; + + // 4000×3000 PNG (long edge 4000) — well above the cfg max 1024. + let big = common::large_blue_4000x3000_png(); + let cfg = cfg_for_endpoint(&server.uri()); + let _ = tokio::task::spawn_blocking({ + let cfg = cfg.clone(); + move || run_recognize(cfg, big, None) + }) + .await + .expect("blocking task panicked") + .expect("recognize succeeds"); + + // Pull the request body, parse JSON, base64-decode the image, and + // verify the long edge is at most max_pixels (1024). + let raw = captured.lock().unwrap().clone().expect("request captured"); + let value: serde_json::Value = + serde_json::from_slice(&raw).expect("request body is JSON"); + let imgs = value + .get("images") + .and_then(|v| v.as_array()) + .expect("images field present"); + assert_eq!(imgs.len(), 1, "exactly one image sent"); + let b64 = imgs[0].as_str().expect("image is base64 string"); + use base64::Engine as _; + let decoded = base64::engine::general_purpose::STANDARD + .decode(b64) + .expect("base64 decodes"); + let reader = image::ImageReader::new(std::io::Cursor::new(decoded)) + .with_guessed_format() + .expect("guess format"); + let (w, h) = reader.into_dimensions().expect("dims"); + let long = w.max(h); + assert!( + long <= 1024, + "long edge after downscale must be <= max_pixels (got {long})" + ); + // Aspect ratio preserved within rounding. + let ratio_in = 4000.0 / 3000.0; + let ratio_out = w as f32 / h as f32; + assert!( + (ratio_in - ratio_out).abs() < 0.02, + "aspect ratio drift: in={ratio_in} out={ratio_out}" + ); +} + +// ── from_parts construction ────────────────────────────────────────────── + +#[test] +fn from_parts_clamps_max_pixels_into_legal_range() { + let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 10).unwrap(); + let too_big = OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999).unwrap(); + // We can't read the private field directly, but engine_version is + // observable; assert the engine constructed at all (clamp didn't + // panic) and run a synthetic prompt. + assert_eq!(too_small.engine_name(), "ollama-vision"); + assert_eq!(too_big.engine_name(), "ollama-vision"); +} + +// ── Integration test against real Ollama (opt-in) ──────────────────────── + +/// End-to-end OCR against the workspace's real Ollama daemon. Skipped +/// unless `KEBAB_OCR_INTEGRATION=1` so the regular `cargo test` run +/// stays hermetic. +/// +/// Run with: +/// +/// ```sh +/// KEBAB_OCR_INTEGRATION=1 \ +/// KEBAB_IMAGE_OCR_ENDPOINT=http://192.168.0.47:11434 \ +/// cargo test -p kebab-parse-image ocr_integration -- --ignored +/// ``` +#[tokio::test] +#[ignore = "hits a real Ollama daemon; gated behind KEBAB_OCR_INTEGRATION=1"] +async fn ocr_integration_real_ollama_transcribes_text() { + if std::env::var("KEBAB_OCR_INTEGRATION").ok().as_deref() != Some("1") { + eprintln!("skipping ocr_integration: KEBAB_OCR_INTEGRATION != 1"); + return; + } + let endpoint = std::env::var("KEBAB_IMAGE_OCR_ENDPOINT") + .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string()); + let model = + std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string()); + + // Generate a fixture with known text. + let bytes = common::hello_world_png(); + let cfg = { + let mut c = Config::defaults(); + c.image.ocr.endpoint = endpoint; + c.image.ocr.model = model; + c.image.ocr.max_pixels = 1024; + c + }; + let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None)) + .await + .expect("blocking task panicked") + .expect("real Ollama OCR must succeed"); + eprintln!("integration OCR result: {:?}", text.joined); + let normalized = text.joined.to_lowercase().replace(",", "").replace(".", ""); + assert!( + normalized.contains("hello") && normalized.contains("world"), + "integration OCR did not capture expected text: {:?}", + text.joined + ); +} diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 9e63261..e11346a 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,24 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision + +**Discovered**: P6-2 implementation start. + +**Symptom**: The original `tasks/p6/p6-2-ocr-adapter.md` spec lists Tesseract as the default OCR engine (`tesseract = "0.13"`, feature `tesseract`, default ON). Bringing Tesseract online requires installing `libtesseract-dev` (and `tesseract-ocr-kor` for the spec-default Korean languages set) on every dev / CI host. The kebab dev environment intentionally avoids system-package installs, so the Tesseract Rust bindings can't link. + +**Root cause**: Spec was written assuming a Linux host with `apt install tesseract-ocr-*` available. The reality of single-developer local-first KB is that the same box also runs the Ollama vision endpoint already wired by P4-2 — using it for OCR adds zero new system dependencies. + +**Fix** (PR #33, feat/p6-2-ocr-adapter): +- New `OllamaVisionOcr` adapter under `crates/kebab-parse-image/src/ocr.rs`. Implements the spec's `OcrEngine` trait by POSTing the image (base64) to `/api/generate` with a transcription prompt against `gemma4:e4b` (default) or any other vision-capable Ollama model. +- New `kebab-config::ImageCfg.ocr` block (`enabled`, `engine`, `model`, `endpoint`, `languages`, `max_pixels`). `enabled` defaults to `false` because OCR adds a model call per asset; `engine` defaults to `"ollama-vision"`. `endpoint` falls back to `models.llm.endpoint` when empty so the same Ollama host serves both LLM and OCR. +- The `OcrEngine` trait is unchanged from the spec — Tesseract / Apple Vision / PaddleOCR engines plug in as future feature-gated alternatives without touching the extractor or chunker. The trait abstraction is the part the spec actually demanded; only the choice of default implementation changes. +- Tests cover wiremock unit paths (200 happy / 5xx / 200 error envelope / empty response / downscale honours `max_pixels`), `apply_ocr` provenance + error handling, and an opt-in `KEBAB_OCR_INTEGRATION=1` integration test that hits a real Ollama endpoint with a generated `"Hello World 2026"` PNG. Tesseract feature-gated tests from the original spec are deferred to whenever someone is willing to bring `libtesseract` to CI. + +**Trust note**: The original spec marked `OcrText` as "observed text (high trust)" to distinguish it from `ModelCaption`. With an LLM-driven default the line blurs — vision LMs can hallucinate. We kept `OcrText.engine = "ollama-vision"` so consumers can decide trust by engine identity. Future Tesseract / Apple Vision adapters write a different `engine` string and downstream code can branch. + +**Amends**: tasks/p6/p6-2-ocr-adapter.md (default engine; "Allowed dependencies" list — `reqwest` + `base64` replace `tesseract`; "Apple Vision" feature gate deferred; `min_confidence` config field dropped because the LM doesn't expose per-region confidence). + ## 2026-05-01 — `--config` flag silently ignored across all kebab-cli subcommands **Discovered**: post-P3-5 manual smoke at `/tmp/kebab-smoke/`. diff --git a/tasks/p6/p6-2-ocr-adapter.md b/tasks/p6/p6-2-ocr-adapter.md index f349773..d11a3ab 100644 --- a/tasks/p6/p6-2-ocr-adapter.md +++ b/tasks/p6/p6-2-ocr-adapter.md @@ -3,7 +3,7 @@ phase: P6 component: kebab-parse-image (OCR adapter) task_id: p6-2 title: "OcrEngine trait + Tesseract adapter (Apple Vision feature-gated)" -status: planned +status: completed depends_on: [p6-1] unblocks: [p6-3] contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md -- 2.49.1 From e869710d8279a70a3912581ae6cf105c5454d1ff Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 05:45:25 +0000 Subject: [PATCH 2/4] =?UTF-8?q?review(p6-2):=20=ED=9A=8C=EC=B0=A8=201=20?= =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - crates/kebab-config/src/lib.rs: • `OcrCfg.endpoint: String` (\"\" sentinel) → `Option` 으로 교체. `#[serde(default)]` 적용. `KEBAB_IMAGE_OCR_ENDPOINT=\"\"` (빈 값) 도 None 으로 매핑하는 분기 추가. • 신규 회귀 테스트 `image_ocr_endpoint_empty_env_value_is_none`. - crates/kebab-parse-image/src/ocr.rs: • `OllamaVisionOcr::new` 의 endpoint fallback 로직을 새 `Option` 스키마에 맞춰 정리 (`as_deref` + match). • `OllamaGenerateResponse` 의 dead `_other: HashMap` 필드 제거. `serde_json::Value` import 도 같이 정리. • `OllamaGenerateRequest.images: Vec<&'a str>` → `[&'a str; 1]` (호출당 vec! 알로케이션 제거, multi-image 는 OcrEngine trait 가 단일 이미지를 받으므로 OOS). • `downscale_to_long_edge` 단일-디코드로 리팩터. PNG passthrough hot path 보존 (header sniff 만으로 분기), 그 외 모든 경로는 decode 1회 + (필요 시) resize + PNG re-encode 1회로 통일. • `pub fn max_pixels(&self) -> u32` accessor 추가 — clamp 결과 검증 용 (단순 inspector). - crates/kebab-parse-image/tests/ocr.rs: • `cfg_for_endpoint` / 통합 테스트가 `Some(endpoint)` 형태로 갱신. • `from_parts_clamps_max_pixels_into_legal_range` 가 새 accessor 로 실제 클램프 결과 (256 / 4096 / 1024) 를 검증하도록 강화. • 통합 테스트가 폰트 부재 시 panic 대신 skip 하도록 분기. - crates/kebab-parse-image/tests/common/mod.rs: • `hello_world_png` 가 `anyhow::Result>` 반환하도록 변경. expect(\"DejaVu Sans Bold required\") 메시지를 \"only the opt-in OCR integration fixture needs this font\" 로 의도 명확화. cargo test -p kebab-parse-image — 28 pass + 1 ignored. cargo test -p kebab-config — 21 pass (+1 회귀). cargo clippy --workspace --all-targets -- -D warnings — pass. Reviewer-suggested workspace.dependencies 통합 (reqwest / base64) 은 P6-3 와 함께 처리할 수 있도록 follow-up 으로 두고 본 PR scope 에서 제외 (회차 1 본문에서 명시). --- crates/kebab-config/src/lib.rs | 39 +++++++-- crates/kebab-parse-image/src/ocr.rs | 87 +++++++++++--------- crates/kebab-parse-image/tests/common/mod.rs | 25 +++--- crates/kebab-parse-image/tests/ocr.rs | 40 ++++++--- 4 files changed, 125 insertions(+), 66 deletions(-) diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 73ab144..f9a737c 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -136,10 +136,11 @@ pub struct OcrCfg { /// Model id passed to the engine (e.g. `"gemma4:e4b"` for /// Ollama-vision). pub model: String, - /// HTTP endpoint for the OCR engine. Empty string means "fall back - /// to `models.llm.endpoint`" — convenient when the same Ollama - /// host serves both LLM and vision. - pub endpoint: String, + /// HTTP endpoint for the OCR engine. `None` (or a missing key in + /// TOML) means "fall back to `models.llm.endpoint`" — convenient + /// when the same Ollama host serves both LLM and vision. + #[serde(default)] + pub endpoint: Option, /// BCP-47 language hints (e.g. `["eng", "kor"]`). The adapter /// renders them into the prompt; the LLM honours them probabilistically. pub languages: Vec, @@ -154,7 +155,7 @@ impl OcrCfg { enabled: false, engine: "ollama-vision".to_string(), model: "gemma4:e4b".to_string(), - endpoint: String::new(), + endpoint: None, languages: vec!["eng".to_string(), "kor".to_string()], max_pixels: 1600, } @@ -393,7 +394,15 @@ impl Config { } "KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(), "KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(), - "KEBAB_IMAGE_OCR_ENDPOINT" => self.image.ocr.endpoint = v.clone(), + "KEBAB_IMAGE_OCR_ENDPOINT" => { + // Empty env value is treated the same as "fall back + // to models.llm.endpoint" — i.e. set None. + self.image.ocr.endpoint = if v.is_empty() { + None + } else { + Some(v.clone()) + }; + } "KEBAB_IMAGE_OCR_LANGUAGES" => { // Comma-separated list, e.g. "eng,kor". self.image.ocr.languages = v @@ -578,6 +587,8 @@ mod tests { "KEBAB_IMAGE_OCR_ENDPOINT".to_string(), "http://192.168.0.47:11434".to_string(), ); + // Empty env value should map to None (= fall back to llm.endpoint). + // We exercise that branch in a separate test. env.insert( "KEBAB_IMAGE_OCR_LANGUAGES".to_string(), "eng, kor, jpn".to_string(), @@ -586,7 +597,10 @@ mod tests { let c = Config::defaults().apply_env(&env); assert!(c.image.ocr.enabled); assert_eq!(c.image.ocr.model, "gemma4:31b"); - assert_eq!(c.image.ocr.endpoint, "http://192.168.0.47:11434"); + assert_eq!( + c.image.ocr.endpoint.as_deref(), + Some("http://192.168.0.47:11434") + ); assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]); assert_eq!(c.image.ocr.max_pixels, 2048); } @@ -594,6 +608,17 @@ mod tests { /// Pre-P6 config files don't have an `[image]` section. The /// `#[serde(default)]` attribute on `Config::image` must let those /// files load with `ImageCfg::defaults()` instead of erroring. + /// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None` + /// rather than to `Some("")` so the fallback to `models.llm.endpoint` + /// kicks in. Covers the env-equivalent of a missing TOML key. + #[test] + fn image_ocr_endpoint_empty_env_value_is_none() { + let mut env = HashMap::new(); + env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new()); + let c = Config::defaults().apply_env(&env); + assert_eq!(c.image.ocr.endpoint, None); + } + #[test] fn pre_p6_config_without_image_section_loads_with_defaults() { let toml_text = r#" diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index 07912e9..c20abfe 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -34,7 +34,6 @@ use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; use image::{ImageFormat, ImageReader}; use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind}; use serde::{Deserialize, Serialize}; -use serde_json::Value; use time::OffsetDateTime; /// Engine name written into `OcrText.engine` for the Ollama-vision adapter. @@ -135,10 +134,9 @@ impl OllamaVisionOcr { /// happens inside [`OcrEngine::recognize`]. pub fn new(config: &kebab_config::Config) -> Result { let ocr = &config.image.ocr; - let endpoint = if ocr.endpoint.is_empty() { - config.models.llm.endpoint.clone() - } else { - ocr.endpoint.clone() + let endpoint = match ocr.endpoint.as_deref() { + Some(s) if !s.is_empty() => s.to_string(), + _ => config.models.llm.endpoint.clone(), }; if endpoint.is_empty() { anyhow::bail!( @@ -184,6 +182,14 @@ impl OllamaVisionOcr { }) } + /// Effective `max_pixels` after the `[MIN_LONG_EDGE, MAX_LONG_EDGE]` + /// clamp. Exposed so tests can verify the clamp result without + /// reaching into the private field; production callers don't need + /// it. + pub fn max_pixels(&self) -> u32 { + self.max_pixels + } + fn build_prompt(&self, lang_hint: Option<&Lang>) -> String { let langs = if self.languages.is_empty() { "any".to_string() @@ -228,7 +234,7 @@ impl OcrEngine for OllamaVisionOcr { let body = OllamaGenerateRequest { model: &self.model, prompt: &prompt, - images: vec![&b64], + images: [b64.as_str()], stream: false, options: OllamaOptions { temperature: 0.0, @@ -297,9 +303,11 @@ impl OcrEngine for OllamaVisionOcr { /// Decode `bytes`, downscale so the long edge is at most `max_long_edge`, /// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`. /// -/// Bypasses encode work when the source already fits — we simply pass -/// the bytes through. PNG re-encode is only paid when downscaling is -/// actually needed. +/// PNG sources that already fit the cap are passthrough (zero decodes, +/// just a `Vec` clone). Every other path decodes the image exactly +/// once: the cheap header sniff peeks at the format / dimensions before +/// committing to a decode, so non-PNG passthrough and downscale share +/// the same `decode → optionally resize → re-encode` tail. fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, u32, u32)> { let reader = ImageReader::new(Cursor::new(bytes)) .with_guessed_format() @@ -310,40 +318,39 @@ fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, .context("reading image dimensions for OCR")?; let long = w.max(h); - if long <= max_long_edge { - // Source fits — avoid the round-trip through `image::DynamicImage`. - // Re-encode only when the source isn't already PNG, since the - // wire format we send Ollama is PNG. - return match format { - Some(ImageFormat::Png) => Ok((bytes.to_vec(), w, h)), - _ => { - let img = ImageReader::new(Cursor::new(bytes)) - .with_guessed_format() - .context("re-reading image for PNG re-encode")? - .decode() - .context("decoding image for PNG re-encode")?; - let mut out = Cursor::new(Vec::new()); - img.write_to(&mut out, ImageFormat::Png) - .context("re-encoding image as PNG")?; - Ok((out.into_inner(), w, h)) - } - }; + + // Hot path — PNG within budget already matches the wire format we + // send Ollama, so we ship the bytes verbatim without paying for a + // decode + re-encode round-trip. + if long <= max_long_edge && format == Some(ImageFormat::Png) { + return Ok((bytes.to_vec(), w, h)); } - let scale = max_long_edge as f32 / long as f32; - let new_w = ((w as f32) * scale).round().max(1.0) as u32; - let new_h = ((h as f32) * scale).round().max(1.0) as u32; + // Every remaining branch needs the pixels — either to re-encode as + // PNG (non-PNG within budget) or to resize first (over budget). + // One decode covers both. let img = ImageReader::new(Cursor::new(bytes)) .with_guessed_format() - .context("re-reading image for downscale")? + .context("re-reading image for OCR decode")? .decode() - .context("decoding image for downscale")?; - let resized = img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); + .context("decoding image for OCR")?; + + let (final_w, final_h, final_img) = if long <= max_long_edge { + (w, h, img) + } else { + let scale = max_long_edge as f32 / long as f32; + let new_w = ((w as f32) * scale).round().max(1.0) as u32; + let new_h = ((h as f32) * scale).round().max(1.0) as u32; + let resized = + img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); + (new_w, new_h, resized) + }; + let mut out = Cursor::new(Vec::new()); - resized + final_img .write_to(&mut out, ImageFormat::Png) - .context("encoding downscaled image as PNG")?; - Ok((out.into_inner(), new_w, new_h)) + .context("encoding image as PNG for OCR")?; + Ok((out.into_inner(), final_w, final_h)) } fn truncate(s: &str, n: usize) -> String { @@ -361,7 +368,11 @@ fn truncate(s: &str, n: usize) -> String { struct OllamaGenerateRequest<'a> { model: &'a str, prompt: &'a str, - images: Vec<&'a str>, + /// Always exactly one image — the `OcrEngine` trait takes a single + /// `&[u8]`, so multi-image batching is out of scope until a future + /// trait extension. Fixed-size array avoids the `vec![]` + /// allocation per call. + images: [&'a str; 1], stream: bool, options: OllamaOptions, } @@ -378,8 +389,6 @@ struct OllamaGenerateResponse { response: Option, #[serde(default)] error: Option, - #[serde(flatten)] - _other: std::collections::HashMap, } #[cfg(test)] diff --git a/crates/kebab-parse-image/tests/common/mod.rs b/crates/kebab-parse-image/tests/common/mod.rs index 9be6f06..7c8e8e0 100644 --- a/crates/kebab-parse-image/tests/common/mod.rs +++ b/crates/kebab-parse-image/tests/common/mod.rs @@ -60,18 +60,23 @@ pub fn large_blue_4000x3000_png() -> Vec { /// `ocr_integration_real_ollama_transcribes_text` integration test — /// regular hermetic tests never call it. /// -/// Renders with a font shipped by `dejavu` (path under -/// `/usr/share/fonts/truetype/dejavu/`) — common across most Linux dev -/// boxes. Falls back to a tiny built-in glyph map if the font is -/// missing so the helper compiles even without DejaVu installed. -pub fn hello_world_png() -> Vec { +/// Returns `Err` (not panic) if the DejaVu Sans Bold font is missing +/// from the standard Linux path, so dev boxes without the font can +/// gracefully skip the integration test rather than crashing the +/// process. +pub fn hello_world_png() -> anyhow::Result> { use ab_glyph::{Font, FontRef, ScaleFont}; + use anyhow::Context; let mut img: ImageBuffer, _> = ImageBuffer::from_fn(400, 100, |_, _| Rgb([255, 255, 255])); - let font_bytes = std::fs::read("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf") - .expect("DejaVu Sans Bold required for OCR integration fixture"); - let font = FontRef::try_from_slice(&font_bytes).expect("font parses"); + let font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"; + let font_bytes = std::fs::read(font_path).with_context(|| { + format!( + "{font_path} not found — only the opt-in OCR integration fixture needs this font" + ) + })?; + let font = FontRef::try_from_slice(&font_bytes).context("DejaVu font parses")?; let scaled = font.as_scaled(40.0); let text = "Hello World 2026"; let mut x = 10.0_f32; @@ -93,8 +98,8 @@ pub fn hello_world_png() -> Vec { } let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, image::ImageFormat::Png) - .expect("encoding hello-world PNG must not fail"); - buf.into_inner() + .context("encoding hello-world PNG")?; + Ok(buf.into_inner()) } /// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model + diff --git a/crates/kebab-parse-image/tests/ocr.rs b/crates/kebab-parse-image/tests/ocr.rs index 7cc0314..149cfd9 100644 --- a/crates/kebab-parse-image/tests/ocr.rs +++ b/crates/kebab-parse-image/tests/ocr.rs @@ -20,7 +20,7 @@ use crate::common::red_100x50_png; fn cfg_for_endpoint(endpoint: &str) -> Config { let mut cfg = Config::defaults(); - cfg.image.ocr.endpoint = endpoint.to_string(); + cfg.image.ocr.endpoint = Some(endpoint.to_string()); cfg.image.ocr.model = "gemma4:e4b".to_string(); cfg.image.ocr.languages = vec!["eng".to_string(), "kor".to_string()]; cfg.image.ocr.max_pixels = 1024; @@ -321,13 +321,26 @@ async fn ocr_downscales_large_image_before_sending() { #[test] fn from_parts_clamps_max_pixels_into_legal_range() { + // Below MIN_LONG_EDGE — bumped up to the floor. let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 10).unwrap(); - let too_big = OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999).unwrap(); - // We can't read the private field directly, but engine_version is - // observable; assert the engine constructed at all (clamp didn't - // panic) and run a synthetic prompt. - assert_eq!(too_small.engine_name(), "ollama-vision"); - assert_eq!(too_big.engine_name(), "ollama-vision"); + assert_eq!( + too_small.max_pixels(), + 256, + "max_pixels must be raised to MIN_LONG_EDGE" + ); + + // Above MAX_LONG_EDGE — capped at the ceiling. + let too_big = + OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999).unwrap(); + assert_eq!( + too_big.max_pixels(), + 4096, + "max_pixels must be capped at MAX_LONG_EDGE" + ); + + // Inside the legal range — pass through untouched. + let in_range = OllamaVisionOcr::from_parts("http://x", "m", vec![], 1024).unwrap(); + assert_eq!(in_range.max_pixels(), 1024); } // ── Integration test against real Ollama (opt-in) ──────────────────────── @@ -355,11 +368,18 @@ async fn ocr_integration_real_ollama_transcribes_text() { let model = std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string()); - // Generate a fixture with known text. - let bytes = common::hello_world_png(); + // Generate a fixture with known text. If the DejaVu font is + // missing from this dev box, skip rather than crash. + let bytes = match common::hello_world_png() { + Ok(b) => b, + Err(e) => { + eprintln!("skipping ocr_integration: {e:#}"); + return; + } + }; let cfg = { let mut c = Config::defaults(); - c.image.ocr.endpoint = endpoint; + c.image.ocr.endpoint = Some(endpoint); c.image.ocr.model = model; c.image.ocr.max_pixels = 1024; c -- 2.49.1 From 2bede0030fadf4fa5e51c912c585f4f8c1e0e9d5 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 05:48:23 +0000 Subject: [PATCH 3/4] =?UTF-8?q?review(p6-2):=20=ED=9A=8C=EC=B0=A8=202=20?= =?UTF-8?q?=EC=A7=80=EC=A0=81=20=EB=B0=98=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - src/ocr.rs: • `OllamaVisionOcr::new` 와 `from_parts` 의 입력 검증을 공통 `fn build` 으로 통합. 두 생성자가 빈 endpoint / 빈 model / `max_pixels` 클램프 동일 invariant 를 공유 — \"테스트는 통과하지만 프로덕션은 panic\" 분기 차단. • `max_pixels` clamp 가 실제로 발동 시 `tracing::warn!` 로 사유 기록 (사용자가 \"왜 항상 4096?\" 디버깅 가능). • `downscale_to_long_edge` 의 long-axis 가 `f32` 라운딩으로 1px 초과하는 코너 케이스 (예: max=1601, long=4001) 후행 클램프로 엄격히 묶음. doc-comment 의 \"long edge is at most max_long_edge\" 가 실제 동작과 정확히 일치. - tests/ocr.rs: • 통합 테스트의 이중 게이트 (`#[ignore]` + `KEBAB_OCR_INTEGRATION=1`) 제거. `--ignored` 만으로 실행 의도 단일 신호화 — `kebab-llm-local` 의 통합 테스트 컨벤션과 일관됨. endpoint / model 의 env 오버라이드는 유지. cargo test -p kebab-parse-image — 28 pass + 1 ignored. cargo test -p kebab-config — 21 pass. cargo clippy --workspace --all-targets -- -D warnings — pass. --- crates/kebab-parse-image/src/ocr.rs | 76 +++++++++++++++++---------- crates/kebab-parse-image/tests/ocr.rs | 15 +++--- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index c20abfe..9a019c0 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -138,16 +138,49 @@ impl OllamaVisionOcr { Some(s) if !s.is_empty() => s.to_string(), _ => config.models.llm.endpoint.clone(), }; + Self::build(endpoint, ocr.model.clone(), ocr.languages.clone(), ocr.max_pixels) + } + + /// Build directly from explicit fields. Useful for tests that need + /// to point at a wiremock host without going through `Config`. + /// Shares the same input validation as [`Self::new`] so the two + /// constructors agree on what counts as a legal `OllamaVisionOcr` — + /// callers cannot smuggle an empty endpoint or empty model id past + /// `from_parts`. + pub fn from_parts( + endpoint: impl Into, + model: impl Into, + languages: Vec, + max_pixels: u32, + ) -> Result { + Self::build(endpoint.into(), model.into(), languages, max_pixels) + } + + /// Shared validation + construction. Centralised so `new` and + /// `from_parts` cannot drift on what they accept. + fn build( + endpoint: String, + model: String, + languages: Vec, + requested_max_pixels: u32, + ) -> Result { if endpoint.is_empty() { anyhow::bail!( "OllamaVisionOcr: endpoint is empty (set image.ocr.endpoint or models.llm.endpoint)" ); } - let model = ocr.model.trim().to_string(); + let model = model.trim().to_string(); if model.is_empty() { - anyhow::bail!("OllamaVisionOcr: image.ocr.model is empty"); + anyhow::bail!("OllamaVisionOcr: model is empty"); + } + let max_pixels = requested_max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE); + if max_pixels != requested_max_pixels { + tracing::warn!( + target: "kebab-parse-image", + "image.ocr.max_pixels = {requested_max_pixels} clamped to {max_pixels} \ + (legal range [{MIN_LONG_EDGE}, {MAX_LONG_EDGE}])" + ); } - let max_pixels = ocr.max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE); let client = reqwest::blocking::Client::builder() .timeout(REQUEST_TIMEOUT) .build() @@ -156,29 +189,8 @@ impl OllamaVisionOcr { client, endpoint, model, - languages: ocr.languages.clone(), - max_pixels, - }) - } - - /// Build directly from explicit fields. Useful for tests that need - /// to point at a wiremock host without going through `Config`. - pub fn from_parts( - endpoint: impl Into, - model: impl Into, - languages: Vec, - max_pixels: u32, - ) -> Result { - let client = reqwest::blocking::Client::builder() - .timeout(REQUEST_TIMEOUT) - .build() - .context("building OCR HTTP client")?; - Ok(Self { - client, - endpoint: endpoint.into(), - model: model.into(), languages, - max_pixels: max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE), + max_pixels, }) } @@ -339,8 +351,18 @@ fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, (w, h, img) } else { let scale = max_long_edge as f32 / long as f32; - let new_w = ((w as f32) * scale).round().max(1.0) as u32; - let new_h = ((h as f32) * scale).round().max(1.0) as u32; + let mut new_w = ((w as f32) * scale).round().max(1.0) as u32; + let mut new_h = ((h as f32) * scale).round().max(1.0) as u32; + // Independent rounding of the two axes can let `f32`'s nearest + // round push the long axis one pixel past `max_long_edge` for + // irrational scales (e.g. `max=1601, long=4001`). Pin the long + // axis to exactly `max_long_edge` so the doc-comment's + // "long edge is at most max_long_edge" stays a strict bound. + if w >= h { + new_w = new_w.min(max_long_edge); + } else { + new_h = new_h.min(max_long_edge); + } let resized = img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); (new_w, new_h, resized) diff --git a/crates/kebab-parse-image/tests/ocr.rs b/crates/kebab-parse-image/tests/ocr.rs index 149cfd9..f6c0d9b 100644 --- a/crates/kebab-parse-image/tests/ocr.rs +++ b/crates/kebab-parse-image/tests/ocr.rs @@ -346,23 +346,20 @@ fn from_parts_clamps_max_pixels_into_legal_range() { // ── Integration test against real Ollama (opt-in) ──────────────────────── /// End-to-end OCR against the workspace's real Ollama daemon. Skipped -/// unless `KEBAB_OCR_INTEGRATION=1` so the regular `cargo test` run -/// stays hermetic. +/// by default via `#[ignore]` (matching the `kebab-llm-local` +/// convention); a developer who explicitly opts in via `--ignored` is +/// signalling they want the network call. Endpoint / model can still +/// be overridden via env to point at a non-default Ollama host. /// /// Run with: /// /// ```sh -/// KEBAB_OCR_INTEGRATION=1 \ /// KEBAB_IMAGE_OCR_ENDPOINT=http://192.168.0.47:11434 \ -/// cargo test -p kebab-parse-image ocr_integration -- --ignored +/// cargo test -p kebab-parse-image --test ocr ocr_integration -- --ignored /// ``` #[tokio::test] -#[ignore = "hits a real Ollama daemon; gated behind KEBAB_OCR_INTEGRATION=1"] +#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"] async fn ocr_integration_real_ollama_transcribes_text() { - if std::env::var("KEBAB_OCR_INTEGRATION").ok().as_deref() != Some("1") { - eprintln!("skipping ocr_integration: KEBAB_OCR_INTEGRATION != 1"); - return; - } let endpoint = std::env::var("KEBAB_IMAGE_OCR_ENDPOINT") .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string()); let model = -- 2.49.1 From 15393676929d12506c728b0442f7cebde8ddf111 Mon Sep 17 00:00:00 2001 From: altair823 Date: Sat, 2 May 2026 05:51:00 +0000 Subject: [PATCH 4/4] =?UTF-8?q?review(p6-2):=20=ED=9A=8C=EC=B0=A8=203=20co?= =?UTF-8?q?smetic=20=E2=80=94=20build()=20=ED=9A=8C=EA=B7=80=20=ED=85=8C?= =?UTF-8?q?=EC=8A=A4=ED=8A=B8=20+=20lib=20doc=20trust=20note?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - src/ocr.rs: • `OllamaVisionOcr` 에 `#[derive(Debug)]` 추가 (test 의 expect_err 바운드 충족용; reqwest::blocking::Client 도 Debug 구현). • 신규 unit 테스트 3건 (`build_rejects_empty_endpoint`, `build_rejects_empty_model_after_trim`, `build_clamps_max_pixels_outside_legal_range`) — 회차 2 에서 추가된 `fn build` 가드의 회귀 신호. - src/lib.rs: • 모듈-레벨 doc-comment 에 OCR 트러스트 정책 한 줄 추가 (\"LLM-driven default can hallucinate; OcrText.engine carries source identity\"). lib 사용자가 ocr 모듈 doc 까지 안 들어가도 의도 캐치 가능. cargo test -p kebab-parse-image — 31 pass + 1 ignored (11 unit + 12 P6-1 integration + 8 P6-2 integration). cargo clippy -p kebab-parse-image --all-targets -- -D warnings — pass. --- crates/kebab-parse-image/src/lib.rs | 5 +++- crates/kebab-parse-image/src/ocr.rs | 41 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index dfcd171..b5d826b 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -8,7 +8,10 @@ //! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an //! [`OllamaVisionOcr`] default adapter that talks to a vision-capable //! Ollama model. [`apply_ocr`] is the helper that mutates an -//! [`ImageRefBlock`] in place. +//! [`ImageRefBlock`] in place. Trust note — the LLM-driven default +//! can hallucinate; `OcrText.engine` carries the source identity so +//! consumers can branch trust by engine (Tesseract / Apple Vision +//! adapters, when added, will write a different `engine` string). //! //! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText / //! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index 9a019c0..8d7f88a 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -116,6 +116,7 @@ pub fn apply_ocr( /// Ollama-vision OCR adapter — POSTs the image (base64) to /// `/api/generate` with a transcription prompt and reads the /// non-streaming response. +#[derive(Debug)] pub struct OllamaVisionOcr { client: reqwest::blocking::Client, endpoint: String, @@ -458,4 +459,44 @@ mod tests { let p = engine.build_prompt(Some(&Lang("und".into()))); assert!(!p.contains("hint:")); } + + /// `from_parts` (and by extension `new`) must reject an empty + /// endpoint string. Pinned so the bail message stays grep-able and + /// the constructor cannot drift to "silently accept a bad config". + #[test] + fn build_rejects_empty_endpoint() { + let r = OllamaVisionOcr::from_parts("", "m", vec![], 1024); + let err = r.expect_err("empty endpoint must bail").to_string(); + assert!( + err.contains("endpoint is empty"), + "bail message missing 'endpoint is empty': {err}" + ); + } + + /// Whitespace-only model id trims to empty and must be rejected — + /// both `new` and `from_parts` route through the shared `build`, + /// so testing `from_parts` covers both. + #[test] + fn build_rejects_empty_model_after_trim() { + let r = OllamaVisionOcr::from_parts("http://x", " ", vec![], 1024); + let err = r.expect_err("empty model must bail").to_string(); + assert!( + err.contains("model is empty"), + "bail message missing 'model is empty': {err}" + ); + } + + /// Out-of-range `max_pixels` is silently clamped (not rejected) so + /// a bad config can't kill ingest. The accessor exposes the clamped + /// value so tests can verify the bound; the warning side-effect is + /// tested implicitly (no panic, no error). + #[test] + fn build_clamps_max_pixels_outside_legal_range() { + let too_small = + OllamaVisionOcr::from_parts("http://x", "m", vec![], 1).unwrap(); + assert_eq!(too_small.max_pixels(), MIN_LONG_EDGE); + let too_big = + OllamaVisionOcr::from_parts("http://x", "m", vec![], u32::MAX).unwrap(); + assert_eq!(too_big.max_pixels(), MAX_LONG_EDGE); + } } -- 2.49.1