feat(kebab-parse-image): P6-2 OCR adapter — Ollama-vision default #33
38
Cargo.lock
generated
@@ -2,6 +2,22 @@
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "ab_glyph"
|
||||
version = "0.2.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01c0457472c38ea5bd1c3b5ada5e368271cb550be7a4ca4a0b4634e9913f6cc2"
|
||||
dependencies = [
|
||||
"ab_glyph_rasterizer",
|
||||
"owned_ttf_parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ab_glyph_rasterizer"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618"
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.1"
|
||||
@@ -3552,15 +3568,22 @@ dependencies = [
|
||||
name = "kebab-parse-image"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
"blake3",
|
||||
"image",
|
||||
"kamadak-exif",
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"time",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5128,6 +5151,15 @@ dependencies = [
|
||||
"ureq",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "owned_ttf_parser"
|
||||
version = "0.25.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36820e9051aca1014ddc75770aab4d68bc1e9e632f0f5627c4086bc216fb583b"
|
||||
dependencies = [
|
||||
"ttf-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ownedbytes"
|
||||
version = "0.9.0"
|
||||
@@ -7450,6 +7482,12 @@ version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "ttf-parser"
|
||||
version = "0.25.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "2.1.2"
|
||||
|
||||
@@ -21,6 +21,12 @@ pub struct Config {
|
||||
pub models: ModelsCfg,
|
||||
pub search: SearchCfg,
|
||||
pub rag: RagCfg,
|
||||
/// Image-pipeline settings (P6: OCR, captioning). Tagged
|
||||
/// `#[serde(default)]` so pre-P6 config files that predate the
|
||||
/// `[image]` section still load — defaults disable OCR / caption
|
||||
/// (they cost a model call per asset).
|
||||
#[serde(default = "ImageCfg::defaults")]
|
||||
pub image: ImageCfg,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -98,6 +104,64 @@ pub struct RagCfg {
|
||||
pub max_context_tokens: usize,
|
||||
}
|
||||
|
||||
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
|
||||
/// behaviour; future fields (e.g. `caption`) will join here as P6-3
|
||||
/// lands.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ImageCfg {
|
||||
#[serde(default = "OcrCfg::defaults")]
|
||||
pub ocr: OcrCfg,
|
||||
}
|
||||
|
||||
impl ImageCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
ocr: OcrCfg::defaults(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// OCR settings (P6-2). v1 ships a single Ollama-vision adapter; the
|
||||
/// `OcrEngine` trait in `kebab-parse-image` keeps the door open for
|
||||
/// Tesseract / Apple Vision / PaddleOCR engines as feature-gated
|
||||
/// alternatives in P+. See `tasks/HOTFIXES.md` (2026-05-02) for the
|
||||
/// rationale on dropping the original Tesseract default.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct OcrCfg {
|
||||
/// Run OCR on every image during ingest. Default `false` because
|
||||
/// OCR adds one model call per asset.
|
||||
pub enabled: bool,
|
||||
/// Engine identifier. v1 only ships `"ollama-vision"`.
|
||||
pub engine: String,
|
||||
/// Model id passed to the engine (e.g. `"gemma4:e4b"` for
|
||||
/// Ollama-vision).
|
||||
pub model: String,
|
||||
/// HTTP endpoint for the OCR engine. `None` (or a missing key in
|
||||
/// TOML) means "fall back to `models.llm.endpoint`" — convenient
|
||||
/// when the same Ollama host serves both LLM and vision.
|
||||
#[serde(default)]
|
||||
|
|
||||
pub endpoint: Option<String>,
|
||||
/// BCP-47 language hints (e.g. `["eng", "kor"]`). The adapter
|
||||
/// renders them into the prompt; the LLM honours them probabilistically.
|
||||
pub languages: Vec<String>,
|
||||
/// Cap the long edge of the image (in pixels) before sending. Larger
|
||||
/// images bloat prompt cost. Default `1600`.
|
||||
pub max_pixels: u32,
|
||||
}
|
||||
|
||||
impl OcrCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
enabled: false,
|
||||
engine: "ollama-vision".to_string(),
|
||||
model: "gemma4:e4b".to_string(),
|
||||
endpoint: None,
|
||||
languages: vec!["eng".to_string(), "kor".to_string()],
|
||||
max_pixels: 1600,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Defaults per design §6.4.
|
||||
pub fn defaults() -> Self {
|
||||
@@ -162,6 +226,7 @@ impl Config {
|
||||
explain_default: false,
|
||||
max_context_tokens: 8000,
|
||||
},
|
||||
image: ImageCfg::defaults(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,6 +388,35 @@ impl Config {
|
||||
}
|
||||
}
|
||||
|
||||
// image.ocr
|
||||
"KEBAB_IMAGE_OCR_ENABLED" => {
|
||||
self.image.ocr.enabled = parse_bool(v);
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(),
|
||||
"KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(),
|
||||
"KEBAB_IMAGE_OCR_ENDPOINT" => {
|
||||
// Empty env value is treated the same as "fall back
|
||||
// to models.llm.endpoint" — i.e. set None.
|
||||
self.image.ocr.endpoint = if v.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(v.clone())
|
||||
};
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_LANGUAGES" => {
|
||||
// Comma-separated list, e.g. "eng,kor".
|
||||
self.image.ocr.languages = v
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_MAX_PIXELS" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.image.ocr.max_pixels = n;
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown KEBAB_* keys are silently ignored — see
|
||||
// `env_unknown_key_is_ignored` test.
|
||||
_ => {}
|
||||
@@ -471,6 +565,122 @@ mod tests {
|
||||
assert!(c.indexing.watch_filesystem);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn image_ocr_defaults_disabled_with_ollama_vision() {
|
||||
let c = Config::defaults();
|
||||
assert!(!c.image.ocr.enabled);
|
||||
assert_eq!(c.image.ocr.engine, "ollama-vision");
|
||||
assert_eq!(c.image.ocr.model, "gemma4:e4b");
|
||||
assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]);
|
||||
assert_eq!(c.image.ocr.max_pixels, 1600);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn image_ocr_env_overrides() {
|
||||
let mut env = HashMap::new();
|
||||
env.insert("KEBAB_IMAGE_OCR_ENABLED".to_string(), "true".to_string());
|
||||
env.insert(
|
||||
"KEBAB_IMAGE_OCR_MODEL".to_string(),
|
||||
"gemma4:31b".to_string(),
|
||||
);
|
||||
env.insert(
|
||||
"KEBAB_IMAGE_OCR_ENDPOINT".to_string(),
|
||||
"http://192.168.0.47:11434".to_string(),
|
||||
);
|
||||
// Empty env value should map to None (= fall back to llm.endpoint).
|
||||
// We exercise that branch in a separate test.
|
||||
env.insert(
|
||||
"KEBAB_IMAGE_OCR_LANGUAGES".to_string(),
|
||||
"eng, kor, jpn".to_string(),
|
||||
);
|
||||
env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
|
||||
let c = Config::defaults().apply_env(&env);
|
||||
assert!(c.image.ocr.enabled);
|
||||
assert_eq!(c.image.ocr.model, "gemma4:31b");
|
||||
assert_eq!(
|
||||
c.image.ocr.endpoint.as_deref(),
|
||||
Some("http://192.168.0.47:11434")
|
||||
);
|
||||
assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]);
|
||||
assert_eq!(c.image.ocr.max_pixels, 2048);
|
||||
}
|
||||
|
||||
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
|
||||
/// rather than to `Some("")` so the fallback to `models.llm.endpoint`
|
||||
/// kicks in. Covers the env-equivalent of a missing TOML key.
|
||||
#[test]
|
||||
fn image_ocr_endpoint_empty_env_value_is_none() {
|
||||
let mut env = HashMap::new();
|
||||
env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new());
|
||||
let c = Config::defaults().apply_env(&env);
|
||||
assert_eq!(c.image.ocr.endpoint, None);
|
||||
}
|
||||
|
||||
/// Pre-P6 config files don't have an `[image]` section. The
|
||||
/// `#[serde(default)]` attribute on `Config::image` must let those
|
||||
/// files load with `ImageCfg::defaults()` instead of erroring.
|
||||
#[test]
|
||||
fn pre_p6_config_without_image_section_loads_with_defaults() {
|
||||
let toml_text = r#"
|
||||
schema_version = 1
|
||||
|
||||
[workspace]
|
||||
root = "/tmp/x"
|
||||
include = ["**/*.md"]
|
||||
exclude = []
|
||||
|
||||
[storage]
|
||||
data_dir = "/tmp/d"
|
||||
sqlite = "{data_dir}/x.sqlite"
|
||||
vector_dir = "{data_dir}/v"
|
||||
asset_dir = "{data_dir}/a"
|
||||
artifact_dir = "{data_dir}/r"
|
||||
model_dir = "{data_dir}/m"
|
||||
runs_dir = "{data_dir}/u"
|
||||
copy_threshold_mb = 100
|
||||
|
||||
[indexing]
|
||||
max_parallel_extractors = 2
|
||||
max_parallel_embeddings = 1
|
||||
watch_filesystem = false
|
||||
|
||||
[chunking]
|
||||
target_tokens = 500
|
||||
overlap_tokens = 80
|
||||
respect_markdown_headings = true
|
||||
chunker_version = "md-heading-v1"
|
||||
|
||||
[models.embedding]
|
||||
provider = "fastembed"
|
||||
model = "multilingual-e5-small"
|
||||
version = "v1"
|
||||
dimensions = 384
|
||||
batch_size = 64
|
||||
|
||||
[models.llm]
|
||||
provider = "ollama"
|
||||
model = "qwen2.5:14b-instruct"
|
||||
context_tokens = 32768
|
||||
endpoint = "http://127.0.0.1:11434"
|
||||
temperature = 0.0
|
||||
seed = 0
|
||||
|
||||
[search]
|
||||
default_k = 10
|
||||
hybrid_fusion = "rrf"
|
||||
rrf_k = 60
|
||||
snippet_chars = 220
|
||||
|
||||
[rag]
|
||||
prompt_template_version = "rag-v1"
|
||||
score_gate = 0.30
|
||||
explain_default = false
|
||||
max_context_tokens = 8000
|
||||
"#;
|
||||
let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
|
||||
assert_eq!(c.image, ImageCfg::defaults());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn xdg_paths_honor_env() {
|
||||
// Must restore env after the test to avoid polluting other tests.
|
||||
|
||||
@@ -5,11 +5,13 @@ edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Image extractor — produces a single-block CanonicalDocument with EXIF metadata (P6-1)"
|
||||
description = "Image extractor + EXIF + OCR (Ollama-vision) for the kebab pipeline (P6-1, P6-2)"
|
||||
|
||||
[dependencies]
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
@@ -21,7 +23,22 @@ image = { version = "0.25", default-features = false, features = ["png", "jpeg",
|
||||
# kamadak-exif: pure-Rust EXIF reader. Used for the whitelisted tag
|
||||
# extraction (DateTimeOriginal, GPS, Make, Model, Orientation, Software).
|
||||
kamadak-exif = "0.6"
|
||||
# Ollama-vision OCR adapter (P6-2) talks HTTP directly. We keep the
|
||||
# feature surface identical to `kebab-llm-local` (blocking + json +
|
||||
# rustls-tls) so both crates share the same TLS backend and the
|
||||
# transitive tokio runtime is brought in once.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
base64 = "0.22"
|
||||
|
claude-reviewer-01
commented
(칭찬 + 작은 권장) `reqwest` / `base64` / `wiremock` / `tokio` / `ab_glyph` 의 feature flag 와 버전을 `kebab-llm-local` 과 의식적으로 맞춘 게 좋습니다 — TLS 백엔드 / 런타임이 워크스페이스에서 일관되게 한 번만 빌드됩니다.
같은 버전이 두 crate 에서 두 번 선언되어 있으니 `Cargo.toml` (workspace) 의 `[workspace.dependencies]` 로 끌어 올리는 follow-up 을 권장합니다. 본 PR 의 scope 는 아니지만, P6-3 caption adapter 도 같은 reqwest / base64 를 쓸 가능성이 커서 그때까지 묻어 두면 세 crate 분산이 됩니다.
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
# Shared test infrastructure with `kebab-llm-local`: wiremock under
|
||||
# tokio for HTTP fixtures.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
# Used by `tests/common/mod.rs` to render the opt-in OCR integration
|
||||
# fixture. Only loaded for tests; the production crate doesn't need
|
||||
# font rendering.
|
||||
ab_glyph = "0.2"
|
||||
base64 = "0.22"
|
||||
|
||||
@@ -1,17 +1,27 @@
|
||||
//! `kebab-parse-image` — image extractor (P6-1).
|
||||
//! `kebab-parse-image` — image extractor (P6-1) + OCR adapter (P6-2).
|
||||
//!
|
||||
//! Implements [`kebab_core::Extractor`] for `MediaType::Image(_)`. One asset
|
||||
//! produces one [`CanonicalDocument`] with a single
|
||||
//! [`Block::ImageRef`](kebab_core::Block::ImageRef). EXIF is captured into
|
||||
//! `metadata.user["exif"]`, dimensions into `metadata.user["dimensions"]`.
|
||||
//! OCR / caption fields stay `None`; later tasks (P6-2 / P6-3) populate
|
||||
//! them.
|
||||
//! P6-1 implements [`kebab_core::Extractor`] for `MediaType::Image(_)`,
|
||||
//! producing a single-block [`CanonicalDocument`] (`ImageRefBlock` with
|
||||
//! EXIF + dimensions in `metadata.user`). OCR / caption fields stay
|
||||
//! `None` until populated by the OCR / caption adapters.
|
||||
//!
|
||||
//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an
|
||||
//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable
|
||||
//! Ollama model. [`apply_ocr`] is the helper that mutates an
|
||||
//! [`ImageRefBlock`] in place. Trust note — the LLM-driven default
|
||||
//! can hallucinate; `OcrText.engine` carries the source identity so
|
||||
//! consumers can branch trust by engine (Tesseract / Apple Vision
|
||||
//! adapters, when added, will write a different `engine` string).
|
||||
|
claude-reviewer-01
commented
(작은 권장) `lib.rs` 의 모듈-레벨 doc-comment 가 OCR 어댑터를 소개할 때 "P6-2 adds the [`ocr`] module" 한 줄로만 짧게 끝납니다. `OcrText` 의 trust 정책 (관찰된 텍스트 vs 모델 생성) 이 본 워크스페이스의 핵심 분류라서 `lib.rs` 부터 한 줄 노출되면 lib 사용자가 ocr 모듈 doc 까지 안 들어가도 의도를 빠르게 잡습니다:
```text
//! P6-2 adds the [`ocr`] module: an [`OcrEngine`] trait and an
//! [`OllamaVisionOcr`] default adapter that talks to a vision-capable
//! Ollama model. [`apply_ocr`] is the helper that mutates an
//! [`ImageRefBlock`] in place. Trust note: the LLM-driven default can
//! hallucinate — `OcrText.engine` carries the source identity so
//! consumers can branch trust by engine.
```
사소한 doc 변경이지만 P6-3 caption 모듈이 `lib.rs` doc 에 합류할 때 같은 톤을 유지하는 단서가 됩니다.
claude-reviewer-01
commented
(칭찬) Trust note 가 lib doc-comment 까지 한 줄 노출됐습니다 — `cargo doc -p kebab-parse-image --open` 시 사용자가 모듈 트리를 들어가지 않아도 OCR 출력의 신뢰 분류를 즉시 인지하게 됩니다. 워크스페이스의 "observed vs generated" 분류가 P6-3 caption / 미래 PDF OCR 까지 일관되게 흘러가는 첫 도큐먼트 단서입니다.
|
||||
//!
|
||||
//! Per design §3.4 (Block::ImageRef + ImageRefBlock), §3.7a (OcrText /
|
||||
//! ModelCaption stubs), §9.1 (image extraction policy), §9 (versioning).
|
||||
//! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption
|
||||
//! provenance), §9 (versioning).
|
||||
|
||||
mod dims;
|
||||
mod exif_extract;
|
||||
pub mod ocr;
|
||||
|
||||
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
|
||||
502
crates/kebab-parse-image/src/ocr.rs
Normal file
@@ -0,0 +1,502 @@
|
||||
//! OCR adapter (P6-2).
|
||||
//!
|
||||
//! [`OcrEngine`] is a small trait for "image bytes → [`OcrText`]". v1 ships
|
||||
//! a single implementation, [`OllamaVisionOcr`], which delegates to a
|
||||
//! vision-capable Ollama model (`gemma4:e4b` by default).
|
||||
//!
|
||||
//! ## Spec deviation (Tesseract → Ollama-vision)
|
||||
//!
|
||||
//! The original P6-2 spec named Tesseract as the default engine. The dev
|
||||
//! / CI environment intentionally avoids system-package installs, so the
|
||||
//! Tesseract Rust crate (which links `libtesseract`) is impractical
|
||||
//! today. We keep the [`OcrEngine`] trait as the abstraction the spec
|
||||
//! demanded — Tesseract / Apple Vision / PaddleOCR plug in as future
|
||||
//! feature-gated alternatives without touching the extractor or
|
||||
//! chunker. See `tasks/HOTFIXES.md` (2026-05-02) for the full
|
||||
//! rationale.
|
||||
//!
|
||||
//! ## Trust note
|
||||
//!
|
||||
//! The original spec marked `OcrText` as "observed text (high trust)"
|
||||
//! to distinguish it from `ModelCaption`. With an LLM-driven OCR engine
|
||||
//! the line blurs — the model can hallucinate. Downstream consumers
|
||||
//! that surface OCR text should still treat it as a hint, not ground
|
||||
//! truth, and prefer the asset bytes when verifying. The `engine`
|
||||
//! field on [`OcrText`] makes the source explicit, so a caller can
|
||||
//! decide whether to trust based on which engine produced the text.
|
||||
|
||||
use std::io::Cursor;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use base64::Engine as _;
|
||||
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
|
||||
use image::{ImageFormat, ImageReader};
|
||||
use kebab_core::{ImageRefBlock, Lang, OcrRegion, OcrText, ProvenanceEvent, ProvenanceKind};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Engine name written into `OcrText.engine` for the Ollama-vision adapter.
|
||||
pub const OLLAMA_VISION_ENGINE: &str = "ollama-vision";
|
||||
|
||||
/// Hard ceiling on the OCR HTTP exchange. Cold-loading a vision model on
|
||||
/// first call can take ~30s; 5 minutes is generous without being open-ended.
|
||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
|
||||
/// Lower bound on `config.image.ocr.max_pixels`. Anything below this is
|
||||
/// bumped (with a `tracing::warn!`) to keep the model from receiving an unreadable thumbnail.
|
||||
const MIN_LONG_EDGE: u32 = 256;
|
||||
|
||||
/// Hard cap on `max_pixels` — the spec mentions "downscale aggressively"
|
||||
/// for vision LMs because input dimension translates directly into
|
||||
/// prompt cost. 4096 is generous for legibility and still bounded.
|
||||
const MAX_LONG_EDGE: u32 = 4096;
|
||||
|
||||
/// Image-bytes → [`OcrText`] interface. Implementations may shell out
|
||||
/// (Apple Vision sidecar), call a local library (Tesseract), or — in v1
|
||||
/// — talk HTTP to a vision LM (Ollama).
|
||||
pub trait OcrEngine: Send + Sync {
|
||||
/// Stable identifier written into `OcrText.engine`. Used by callers
|
||||
/// to decide trust level (observed vs. generated).
|
||||
fn engine_name(&self) -> &'static str;
|
||||
|
||||
/// Engine version string written into `OcrText.engine_version`.
|
||||
/// Adapters that depend on a remote service may include the model
|
||||
/// id / version here.
|
||||
fn engine_version(&self) -> String;
|
||||
|
||||
/// Run OCR on `image_bytes`. `lang_hint` (BCP-47) can be passed
|
||||
/// through to engines that benefit from it (Tesseract languages,
|
||||
/// LLM prompt steering); ignore otherwise.
|
||||
fn recognize(
|
||||
&self,
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
) -> Result<OcrText>;
|
||||
}
|
||||
|
||||
/// Mutate `block.ocr` in place by running `engine` over `image_bytes`,
|
||||
/// then append a [`ProvenanceKind::OcrApplied`] event to `events` so the
|
||||
/// caller (which owns the `CanonicalDocument`) can splice it into
|
||||
/// `provenance.events`.
|
||||
///
|
||||
/// Returns the engine error verbatim on failure so the caller can decide
|
||||
/// whether to skip the asset or surface it. `block.ocr` is left
|
||||
/// untouched on error — partial state is never written.
|
||||
pub fn apply_ocr(
|
||||
engine: &dyn OcrEngine,
|
||||
image_bytes: &[u8],
|
||||
block: &mut ImageRefBlock,
|
||||
lang_hint: Option<&Lang>,
|
||||
events: &mut Vec<ProvenanceEvent>,
|
||||
) -> Result<()> {
|
||||
let text = engine.recognize(image_bytes, lang_hint).with_context(|| {
|
||||
format!(
|
||||
"OCR failed (engine={}, version={})",
|
||||
engine.engine_name(),
|
||||
engine.engine_version()
|
||||
)
|
||||
})?;
|
||||
let region_count = text.regions.len();
|
||||
block.ocr = Some(text);
|
||||
events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-image".to_string(),
|
||||
kind: ProvenanceKind::OcrApplied,
|
||||
note: Some(format!(
|
||||
"engine={} version={} regions={}",
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
region_count
|
||||
)),
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ollama-vision OCR adapter — POSTs the image (base64) to
|
||||
/// `<endpoint>/api/generate` with a transcription prompt and reads the
|
||||
/// non-streaming response.
|
||||
#[derive(Debug)]
|
||||
pub struct OllamaVisionOcr {
|
||||
client: reqwest::blocking::Client,
|
||||
endpoint: String,
|
||||
model: String,
|
||||
languages: Vec<String>,
|
||||
max_pixels: u32,
|
||||
}
|
||||
|
||||
impl OllamaVisionOcr {
|
||||
/// Build an adapter from a workspace [`kebab_config::Config`].
|
||||
/// Reads `config.image.ocr.{model, endpoint, languages, max_pixels}`;
|
||||
/// when `endpoint` is `None` or empty, falls back to `config.models.llm.endpoint`
|
||||
/// so the same Ollama host serves both LLM and OCR by default.
|
||||
///
|
||||
/// Construction does NOT touch the network — the first HTTP call
|
||||
/// happens inside [`OcrEngine::recognize`].
|
||||
pub fn new(config: &kebab_config::Config) -> Result<Self> {
|
||||
let ocr = &config.image.ocr;
|
||||
let endpoint = match ocr.endpoint.as_deref() {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => config.models.llm.endpoint.clone(),
|
||||
};
|
||||
Self::build(endpoint, ocr.model.clone(), ocr.languages.clone(), ocr.max_pixels)
|
||||
}
|
||||
|
||||
/// Build directly from explicit fields. Useful for tests that need
|
||||
/// to point at a wiremock host without going through `Config`.
|
||||
/// Shares the same input validation as [`Self::new`] so the two
|
||||
/// constructors agree on what counts as a legal `OllamaVisionOcr` —
|
||||
/// callers cannot smuggle an empty endpoint or empty model id past
|
||||
/// `from_parts`.
|
||||
|
claude-reviewer-01
commented
`max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE)` 가 silent 입니다. 사용자가 `image.ocr.max_pixels = 99999` 를 지정하면 4096 으로 조용히 떨어지는데 — 디버깅 시 "왜 내 OCR 이 항상 4096 px 로 떨어지지?" 시나리오가 발생합니다. clamp 가 실제로 발동했을 때만 한 줄 경고:
```rust
let requested = ocr.max_pixels;
let max_pixels = requested.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE);
if max_pixels != requested {
tracing::warn!(
target: "kebab-parse-image",
"image.ocr.max_pixels = {requested} clamped to {max_pixels} (legal range [{MIN_LONG_EDGE}, {MAX_LONG_EDGE}])"
);
}
```
`from_parts` 도 같은 패턴 적용. tracing 은 이미 의존성이라 비용 무시 가능.
|
||||
pub fn from_parts(
|
||||
endpoint: impl Into<String>,
|
||||
model: impl Into<String>,
|
||||
languages: Vec<String>,
|
||||
max_pixels: u32,
|
||||
) -> Result<Self> {
|
||||
Self::build(endpoint.into(), model.into(), languages, max_pixels)
|
||||
}
|
||||
|
||||
/// Shared validation + construction. Centralised so `new` and
|
||||
/// `from_parts` cannot drift on what they accept.
|
||||
fn build(
|
||||
endpoint: String,
|
||||
model: String,
|
||||
languages: Vec<String>,
|
||||
requested_max_pixels: u32,
|
||||
|
claude-reviewer-01
commented
`from_parts` 가 endpoint / model 의 빈 문자열 가드를 갖고 있지 않습니다. `new()` 는 빈 endpoint / 빈 model 을 모두 `bail!` 로 거절하지만 `from_parts` 는 어떤 입력이든 통과합니다. 두 생성자의 invariant 가 다르면 "테스트는 통과하지만 실서비스 코드는 panic" 같은 swarm 이 생기기 쉽습니다.
같은 가드를 추가하거나, `from_parts` doc-comment 에 "test-seam — 입력 검증을 caller 가 책임진다" 를 명시해 주세요. 둘 다 짧지만 첫 번째가 더 안전합니다.
|
||||
) -> Result<Self> {
|
||||
if endpoint.is_empty() {
|
||||
anyhow::bail!(
|
||||
"OllamaVisionOcr: endpoint is empty (set image.ocr.endpoint or models.llm.endpoint)"
|
||||
);
|
||||
}
|
||||
let model = model.trim().to_string();
|
||||
if model.is_empty() {
|
||||
anyhow::bail!("OllamaVisionOcr: model is empty");
|
||||
}
|
||||
let max_pixels = requested_max_pixels.clamp(MIN_LONG_EDGE, MAX_LONG_EDGE);
|
||||
if max_pixels != requested_max_pixels {
|
||||
tracing::warn!(
|
||||
target: "kebab-parse-image",
|
||||
"image.ocr.max_pixels = {requested_max_pixels} clamped to {max_pixels} \
|
||||
(legal range [{MIN_LONG_EDGE}, {MAX_LONG_EDGE}])"
|
||||
);
|
||||
}
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(REQUEST_TIMEOUT)
|
||||
.build()
|
||||
.context("building OCR HTTP client")?;
|
||||
Ok(Self {
|
||||
client,
|
||||
endpoint,
|
||||
model,
|
||||
languages,
|
||||
max_pixels,
|
||||
})
|
||||
|
claude-reviewer-01
commented
회차 2 에서 추가된 `fn build` 가드 (`endpoint.is_empty()` → bail, `model.trim().is_empty()` → bail, `max_pixels` clamp + warn) 가 unit/integration 테스트로 커버되지 않습니다. 회귀 신호가 비어 있으면 누군가 미래에 `bail!` 메시지나 클램프 범위를 무심코 바꿔도 그린 테스트로 머지됩니다.
간단한 회귀 테스트 3건 추가 권장:
```rust
#[test]
fn build_rejects_empty_endpoint() {
let r = OllamaVisionOcr::from_parts("", "m", vec![], 1024);
let err = r.unwrap_err().to_string();
assert!(err.contains("endpoint is empty"), "{err}");
}
#[test]
fn build_rejects_empty_model() {
let r = OllamaVisionOcr::from_parts("http://x", " ", vec![], 1024);
let err = r.unwrap_err().to_string();
assert!(err.contains("model is empty"), "{err}");
}
#[test]
fn new_falls_back_to_llm_endpoint_when_ocr_endpoint_is_none() {
let mut cfg = kebab_config::Config::defaults();
cfg.image.ocr.endpoint = None;
cfg.models.llm.endpoint = "http://llm.example".to_string();
// Construction must not panic; we don't expose endpoint() but
// engine_version is observable.
let engine = OllamaVisionOcr::new(&cfg).unwrap();
assert_eq!(engine.engine_name(), "ollama-vision");
}
```
첫 두 개는 `src/ocr.rs` 의 `#[cfg(test)] mod tests` 안에, 세 번째는 통합 테스트 (`tests/ocr.rs`) 어느 쪽이든 OK.
|
||||
}
|
||||
|
||||
/// Effective `max_pixels` after the `[MIN_LONG_EDGE, MAX_LONG_EDGE]`
|
||||
/// clamp. Exposed so tests can verify the clamp result without
|
||||
/// reaching into the private field; production callers don't need
|
||||
/// it.
|
||||
pub fn max_pixels(&self) -> u32 {
|
||||
self.max_pixels
|
||||
}
|
||||
|
||||
fn build_prompt(&self, lang_hint: Option<&Lang>) -> String {
|
||||
let langs = if self.languages.is_empty() {
|
||||
"any".to_string()
|
||||
} else {
|
||||
self.languages.join(", ")
|
||||
};
|
||||
let hint = match lang_hint.map(|l| l.0.as_str()) {
|
||||
Some(h) if !h.is_empty() && h != "und" => format!(" (hint: dominant language is {h})"),
|
||||
_ => String::new(),
|
||||
};
|
||||
format!(
|
||||
"You are an OCR engine. Transcribe ALL text visible in the image, \
|
||||
preserving line breaks. Output only the transcription, no commentary, \
|
||||
no markdown fences, no quotes. Expected languages: {langs}{hint}. \
|
||||
If the image contains no text, output an empty line."
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrEngine for OllamaVisionOcr {
|
||||
fn engine_name(&self) -> &'static str {
|
||||
OLLAMA_VISION_ENGINE
|
||||
}
|
||||
|
||||
fn engine_version(&self) -> String {
|
||||
// Compose engine + model id so the wire form is self-describing
|
||||
// ("ollama/gemma4:e4b") — the Ollama daemon does not
|
||||
// expose a stable per-model revision string we could pin.
|
||||
format!("ollama/{}", self.model)
|
||||
}
|
||||
|
||||
fn recognize(
|
||||
&self,
|
||||
image_bytes: &[u8],
|
||||
lang_hint: Option<&Lang>,
|
||||
) -> Result<OcrText> {
|
||||
let (prepared, w, h) = downscale_to_long_edge(image_bytes, self.max_pixels)
|
||||
.context("preparing image for OCR")?;
|
||||
let b64 = BASE64_STANDARD.encode(&prepared);
|
||||
|
||||
let prompt = self.build_prompt(lang_hint);
|
||||
let body = OllamaGenerateRequest {
|
||||
model: &self.model,
|
||||
prompt: &prompt,
|
||||
images: [b64.as_str()],
|
||||
stream: false,
|
||||
options: OllamaOptions {
|
||||
temperature: 0.0,
|
||||
seed: 0,
|
||||
},
|
||||
};
|
||||
|
||||
let url = format!("{}/api/generate", self.endpoint.trim_end_matches('/'));
|
||||
let resp = self
|
||||
.client
|
||||
.post(&url)
|
||||
.json(&body)
|
||||
.send()
|
||||
.with_context(|| format!("POST {url}"))?;
|
||||
let status = resp.status();
|
||||
if !status.is_success() {
|
||||
let body_text = resp.text().unwrap_or_default();
|
||||
anyhow::bail!(
|
||||
"OllamaVisionOcr: {status} from {url} — body={}",
|
||||
truncate(&body_text, 512)
|
||||
);
|
||||
}
|
||||
let parsed: OllamaGenerateResponse = resp
|
||||
.json()
|
||||
.context("parsing Ollama OCR response as JSON")?;
|
||||
if let Some(err) = parsed.error {
|
||||
anyhow::bail!("OllamaVisionOcr: server error — {}", truncate(&err, 512));
|
||||
}
|
||||
let raw = parsed.response.unwrap_or_default();
|
||||
let joined = raw.trim().to_string();
|
||||
|
||||
let regions = if joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
// Ollama-vision returns prose, not bbox-annotated regions.
|
||||
// We synthesize a single region covering the whole prepared
|
||||
// image (post-downscale dimensions) so the `OcrText` shape
|
||||
// remains compatible with consumers that expect at least
|
||||
// one region. Confidence is left at 1.0 — there's no
|
||||
// per-token score available from the LM.
|
||||
vec![OcrRegion {
|
||||
bbox: (0, 0, w, h),
|
||||
text: joined.clone(),
|
||||
confidence: 1.0,
|
||||
}]
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-parse-image",
|
||||
"ollama-vision OCR ok (model={}, dims={w}x{h}, chars={})",
|
||||
self.model,
|
||||
joined.chars().count()
|
||||
);
|
||||
|
claude-reviewer-01
commented
`downscale_to_long_edge` 가 다운스케일이 필요한 케이스에서 동일 bytes 를 두 번 읽습니다 — 첫 번째 `ImageReader` 로 dims 를 알아내고 (소비), 그다음 새 `ImageReader` 를 만들어 `decode()` 합니다. 이미지가 큰 경우 (4000×3000) base64 + decode 가 메모리에 두 번 들어옵니다.
`ImageReader::new(...).with_guessed_format()?.decode()` 한 번으로 `DynamicImage` 를 얻은 다음 `.dimensions()` 로 (w, h) 를 빼고 같은 객체에서 `.resize_exact()` 까지 처리하면 한 번의 디코드로 끝납니다:
```rust
let img = ImageReader::new(Cursor::new(bytes))
.with_guessed_format()
.context("reading image header")?
.decode()
.context("decoding image")?;
let (w, h) = (img.width(), img.height());
let long = w.max(h);
if long <= max_long_edge {
// 이미 PNG 인지 cheap-check 후 passthrough 또는 re-encode
...
}
let (new_w, new_h) = ...;
let resized = img.resize_exact(new_w, new_h, ...);
```
사이드 이펙트: PNG passthrough 단축 경로를 잃습니다 (현재는 PNG 이고 size OK 면 `bytes.to_vec()` 로 끝). passthrough 가 hot path 라면 첫 단계에서 format 만 sniff (`with_guessed_format` 만 호출, dims 는 안 봄) 한 다음 분기시키는 절충안도 있습니다. 둘 중 어느 쪽이든 "같은 bytes 를 두 번 디코드" 만 정리되면 충분합니다.
|
||||
|
||||
Ok(OcrText {
|
||||
joined,
|
||||
regions,
|
||||
engine: self.engine_name().to_string(),
|
||||
engine_version: self.engine_version(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ── Image preparation ─────────────────────────────────────────────────────
|
||||
|
||||
/// Decode `bytes`, downscale so the long edge is at most `max_long_edge`,
|
||||
/// and re-encode as PNG. Returns `(png_bytes, final_w, final_h)`.
|
||||
///
|
||||
/// PNG sources that already fit the cap are passthrough (zero decodes,
|
||||
/// just a `Vec` clone). Every other path decodes the image exactly
|
||||
/// once: the cheap header sniff peeks at the format / dimensions before
|
||||
/// committing to a decode, so non-PNG passthrough and downscale share
|
||||
/// the same `decode → optionally resize → re-encode` tail.
|
||||
fn downscale_to_long_edge(bytes: &[u8], max_long_edge: u32) -> Result<(Vec<u8>, u32, u32)> {
|
||||
let reader = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("reading image header for OCR")?;
|
||||
let format = reader.format();
|
||||
let (w, h) = reader
|
||||
.into_dimensions()
|
||||
.context("reading image dimensions for OCR")?;
|
||||
|
||||
let long = w.max(h);
|
||||
|
||||
// Hot path — PNG within budget already matches the wire format we
|
||||
// send Ollama, so we ship the bytes verbatim without paying for a
|
||||
// decode + re-encode round-trip.
|
||||
if long <= max_long_edge && format == Some(ImageFormat::Png) {
|
||||
return Ok((bytes.to_vec(), w, h));
|
||||
}
|
||||
|
||||
// Every remaining branch needs the pixels — either to re-encode as
|
||||
// PNG (non-PNG within budget) or to resize first (over budget).
|
||||
|
claude-reviewer-01
commented
downscale 의 `(w as f32 * scale).round()` 두 축이 독립적으로 round 되어 long-edge 가 `max_long_edge` 를 1px 초과할 수 있는 코너 케이스가 있습니다. `scale = max_long_edge / long` 이 무리수 (예: max=1601, long=4001) 인 경우 long-axis 도 +1 px 로 round-up 가능. wire 측에선 1px 차이가 의미 없지만, doc-comment 가 "long edge is at most max_long_edge" 라고 단정하므로 실제 동작과 살짝 어긋납니다.
간단한 후행 클램프 한 줄로 invariant 정확히 보장:
```rust
let new_w = ((w as f32) * scale).round().max(1.0) as u32;
let new_h = ((h as f32) * scale).round().max(1.0) as u32;
let (new_w, new_h) = if w >= h {
(new_w.min(max_long_edge), new_h)
} else {
(new_w, new_h.min(max_long_edge))
};
```
또는 doc-comment 를 "approximately at most" 로 완화. 둘 중 어느 쪽이든.
|
||||
// One decode covers both.
|
||||
let img = ImageReader::new(Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.context("re-reading image for OCR decode")?
|
||||
.decode()
|
||||
.context("decoding image for OCR")?;
|
||||
|
||||
let (final_w, final_h, final_img) = if long <= max_long_edge {
|
||||
(w, h, img)
|
||||
} else {
|
||||
let scale = max_long_edge as f32 / long as f32;
|
||||
let mut new_w = ((w as f32) * scale).round().max(1.0) as u32;
|
||||
let mut new_h = ((h as f32) * scale).round().max(1.0) as u32;
|
||||
// Independent rounding of the two axes can let `f32`'s nearest
|
||||
// round push the long axis one pixel past `max_long_edge` for
|
||||
// irrational scales (e.g. `max=1601, long=4001`). Pin the long
|
||||
// axis to exactly `max_long_edge` so the doc-comment's
|
||||
// "long edge is at most max_long_edge" stays a strict bound.
|
||||
if w >= h {
|
||||
new_w = new_w.min(max_long_edge);
|
||||
} else {
|
||||
|
claude-reviewer-01
commented
`images: Vec<&'a str>` 가 항상 길이 1 입니다. wire 형태도 `"images": [<one>]` 단일 원소 배열로 고정 — `Vec` 알로케이션 + `vec![&b64]` 매크로 호출이 함수 호출당 1회 발생합니다. `[&'a str; 1]` 로 박으면 알로케이션 없이 동일 JSON 출력이 나옵니다:
```rust
#[derive(Serialize)]
struct OllamaGenerateRequest<'a> {
model: &'a str,
prompt: &'a str,
images: [&'a str; 1],
stream: bool,
options: OllamaOptions,
}
// 호출: images: [b64.as_str()],
```
향후 multi-image batch 가 필요해지면 그때 `Vec` 으로 되돌리면 됩니다 (트레이트 시그니처가 단일 이미지를 받기 때문에 batch 는 별개 RFC).
|
||||
new_h = new_h.min(max_long_edge);
|
||||
}
|
||||
let resized =
|
||||
img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle);
|
||||
(new_w, new_h, resized)
|
||||
};
|
||||
|
||||
let mut out = Cursor::new(Vec::new());
|
||||
final_img
|
||||
.write_to(&mut out, ImageFormat::Png)
|
||||
.context("encoding image as PNG for OCR")?;
|
||||
Ok((out.into_inner(), final_w, final_h))
|
||||
}
|
||||
|
||||
fn truncate(s: &str, n: usize) -> String {
|
||||
if s.chars().count() <= n {
|
||||
return s.to_string();
|
||||
}
|
||||
|
claude-reviewer-01
commented
`_other: HashMap<String, Value>` + `#[serde(flatten)]` 가 어떤 호출자에서도 읽히지 않는 dead 필드입니다. `OllamaGenerateResponse` 의 다른 필드들이 모두 `#[serde(default)]` 인데, serde 는 알 수 없는 키를 기본적으로 silently drop 하므로 capture-all 이 굳이 필요하지 않습니다. 통째로 제거해 wire type 을 더 가볍게:
```rust
#[derive(Deserialize)]
struct OllamaGenerateResponse {
#[serde(default)]
response: Option<String>,
#[serde(default)]
error: Option<String>,
}
```
그리고 `serde_json::Value` import 도 같이 정리할 수 있습니다.
|
||||
let mut out: String = s.chars().take(n).collect();
|
||||
out.push_str(&format!("... (truncated, original {} chars)", s.chars().count()));
|
||||
out
|
||||
}
|
||||
|
||||
// ── Wire types ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Request payload serialized for the Ollama generate call.
///
/// All string fields borrow from the caller (`'a`), so building the
/// request does not copy the model name, prompt, or encoded image.
#[derive(Serialize)]
|
||||
struct OllamaGenerateRequest<'a> {
|
||||
/// Name of the model that should handle the request.
model: &'a str,
|
||||
/// Instruction text sent alongside the image.
prompt: &'a str,
|
||||
/// Always exactly one image — the `OcrEngine` trait takes a single
|
||||
/// `&[u8]`, so multi-image batching is out of scope until a future
|
||||
/// trait extension. Fixed-size array avoids the `vec![]`
|
||||
/// allocation per call.
|
||||
images: [&'a str; 1],
|
||||
// NOTE(review): presumably sent as `false`, since the reply is parsed as
// a single JSON object — confirm at the call site.
stream: bool,
|
||||
/// Generation options forwarded with the request.
options: OllamaOptions,
|
||||
}
|
||||
|
||||
/// Generation options nested under `options` in the generate request.
// NOTE(review): the concrete values are chosen at the call site, which is
// not visible here; presumably fixed so repeated runs are reproducible —
// confirm.
#[derive(Serialize)]
|
||||
struct OllamaOptions {
|
||||
/// Sampling temperature passed to the model.
temperature: f32,
|
||||
/// Sampling seed passed to the model.
seed: u64,
|
||||
}
|
||||
|
||||
/// Subset of the generate reply that the adapter reads.
///
/// Both fields are optional and fall back to `None` when the key is
/// absent, so a success body and an error body parse with the same type.
/// No catch-all field is declared; other keys in the reply are not
/// captured.
#[derive(Deserialize)]
|
||||
struct OllamaGenerateResponse {
|
||||
#[serde(default)]
|
||||
/// Text produced by the model, when present.
response: Option<String>,
|
||||
#[serde(default)]
|
||||
/// Error message reported by the server, when present.
error: Option<String>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Input that already fits the cap comes back verbatim.
    #[test]
    fn truncate_under_cap_unchanged() {
        let kept = truncate("abc", 5);
        assert_eq!(kept, "abc");
    }

    /// Input over the cap keeps its first `n` chars and gains a marker
    /// that reports the original length.
    #[test]
    fn truncate_over_cap_appends_marker() {
        let long_input = "x".repeat(20);
        let shortened = truncate(&long_input, 5);
        assert!(shortened.starts_with("xxxxx"));
        assert!(shortened.contains("(truncated, original 20 chars)"));
    }

    /// The prompt names every configured language and carries the
    /// dominant-language hint when one is passed in.
    #[test]
    fn build_prompt_lists_languages_and_hint() {
        let hint = Lang("ko".into());
        let prompt = OllamaVisionOcr::from_parts(
            "http://x",
            "m",
            vec!["eng".into(), "kor".into()],
            1024,
        )
        .unwrap()
        .build_prompt(Some(&hint));

        assert!(prompt.contains("eng, kor"));
        assert!(prompt.contains("hint: dominant language is ko"));
    }

    /// An `und` language hint must not produce a hint line in the prompt.
    #[test]
    fn build_prompt_omits_hint_when_lang_und() {
        let hint = Lang("und".into());
        let prompt = OllamaVisionOcr::from_parts("http://x", "m", vec!["eng".into()], 1024)
            .unwrap()
            .build_prompt(Some(&hint));

        assert!(!prompt.contains("hint:"));
    }

    /// An empty endpoint string is rejected at construction time. The
    /// message fragment is pinned so it stays grep-able.
    #[test]
    fn build_rejects_empty_endpoint() {
        let err = OllamaVisionOcr::from_parts("", "m", vec![], 1024)
            .expect_err("empty endpoint must bail")
            .to_string();
        assert!(
            err.contains("endpoint is empty"),
            "bail message missing 'endpoint is empty': {err}"
        );
    }

    /// A whitespace-only model id is rejected at construction time with
    /// the "model is empty" message.
    #[test]
    fn build_rejects_empty_model_after_trim() {
        let err = OllamaVisionOcr::from_parts("http://x", "   ", vec![], 1024)
            .expect_err("empty model must bail")
            .to_string();
        assert!(
            err.contains("model is empty"),
            "bail message missing 'model is empty': {err}"
        );
    }

    /// Out-of-range `max_pixels` values are clamped rather than rejected:
    /// construction succeeds and the accessor reports the nearest bound.
    #[test]
    fn build_clamps_max_pixels_outside_legal_range() {
        for (requested, expected) in [(1, MIN_LONG_EDGE), (u32::MAX, MAX_LONG_EDGE)] {
            let engine = OllamaVisionOcr::from_parts("http://x", "m", vec![], requested).unwrap();
            assert_eq!(engine.max_pixels(), expected);
        }
    }
}
|
||||
@@ -43,6 +43,65 @@ pub fn no_exif_png() -> Vec<u8> {
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
/// 4000×3000 solid-blue PNG (long edge 4000) used to exercise the OCR
|
||||
/// adapter's downscale path. Solid-colour PNGs compress aggressively, so
|
||||
/// the on-disk size stays well under 1 MB despite the large dimensions.
|
||||
pub fn large_blue_4000x3000_png() -> Vec<u8> {
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(4000, 3000, |_, _| Rgb([0, 0, 255]));
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, image::ImageFormat::Png)
|
||||
.expect("encoding 4000x3000 PNG must not fail");
|
||||
buf.into_inner()
|
||||
}
|
||||
|
||||
/// PNG with the literal text `"Hello World 2026"` rendered in black
|
||||
/// against a white background. Used by the opt-in
|
||||
/// `ocr_integration_real_ollama_transcribes_text` integration test —
|
||||
/// regular hermetic tests never call it.
|
||||
///
|
||||
/// Returns `Err` (not panic) if the DejaVu Sans Bold font is missing
|
||||
/// from the standard Linux path, so dev boxes without the font can
|
||||
/// gracefully skip the integration test rather than crashing the
|
||||
/// process.
|
||||
pub fn hello_world_png() -> anyhow::Result<Vec<u8>> {
|
||||
|
claude-reviewer-01
commented
`hello_world_png` 가 `/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf` 를 강제로 읽고 panic 메시지로 "DejaVu Sans Bold required for OCR integration fixture" 를 띄웁니다. 그런데 실제로 이 함수는 `#[ignore]` 가 붙은 통합 테스트 한 곳에서만 호출되고, 일반 hermetic 테스트는 절대 부르지 않습니다.
패닉 메시지가 "전반적으로 DejaVu 가 필요" 처럼 들려서 misleading 합니다. 두 가지 중 하나를 권장:
1. (선호) 함수가 `Result<Vec<u8>, _>` 를 반환하게 바꾸고, 통합 테스트가 폰트 부재 시 우아하게 skip:
```rust
pub fn hello_world_png() -> anyhow::Result<Vec<u8>> {
let font_bytes = std::fs::read("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf")
.context("DejaVu Sans Bold required only for the OCR integration fixture")?;
...
}
```
2. panic 메시지를 "DejaVu Sans Bold is required only for the OCR integration fixture" 로 다듬어 일반 사용자가 함수 호출시 헷갈리지 않게 만들기.
현재 통합 테스트가 dev 박스에서 동작 검증된 상태라 1번이 더 안전한 선택입니다 (다른 dev 박스의 폰트 부재가 즉시 panic 으로 가는 걸 방지).
|
||||
use ab_glyph::{Font, FontRef, ScaleFont};
|
||||
use anyhow::Context;
|
||||
|
||||
let mut img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(400, 100, |_, _| Rgb([255, 255, 255]));
|
||||
let font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf";
|
||||
let font_bytes = std::fs::read(font_path).with_context(|| {
|
||||
format!(
|
||||
"{font_path} not found — only the opt-in OCR integration fixture needs this font"
|
||||
)
|
||||
})?;
|
||||
let font = FontRef::try_from_slice(&font_bytes).context("DejaVu font parses")?;
|
||||
let scaled = font.as_scaled(40.0);
|
||||
let text = "Hello World 2026";
|
||||
let mut x = 10.0_f32;
|
||||
let y = 60.0_f32;
|
||||
for ch in text.chars() {
|
||||
let glyph = scaled.scaled_glyph(ch);
|
||||
if let Some(outlined) = scaled.outline_glyph(glyph.clone()) {
|
||||
let bb = outlined.px_bounds();
|
||||
outlined.draw(|gx, gy, c| {
|
||||
let px = (x + bb.min.x + gx as f32) as i32;
|
||||
let py = (y + bb.min.y + gy as f32) as i32;
|
||||
if px >= 0 && py >= 0 && (px as u32) < 400 && (py as u32) < 100 {
|
||||
let v = ((1.0 - c) * 255.0) as u8;
|
||||
img.put_pixel(px as u32, py as u32, Rgb([v, v, v]));
|
||||
}
|
||||
});
|
||||
}
|
||||
x += scaled.h_advance(scaled.glyph_id(ch));
|
||||
}
|
||||
let mut buf = Cursor::new(Vec::new());
|
||||
img.write_to(&mut buf, image::ImageFormat::Png)
|
||||
.context("encoding hello-world PNG")?;
|
||||
Ok(buf.into_inner())
|
||||
}
|
||||
|
||||
/// JPEG with embedded EXIF APP1 segment carrying GPS + Make + Model +
|
||||
/// DateTimeOriginal + Orientation + Software. The base image is a 4×4
|
||||
/// solid white square — pixel content is irrelevant; the test cares about
|
||||
|
||||
395
crates/kebab-parse-image/tests/ocr.rs
Normal file
@@ -0,0 +1,395 @@
|
||||
//! Integration tests for the OCR adapter (P6-2).
|
||||
//!
|
||||
//! Pattern mirrors `kebab-llm-local/tests/streaming.rs` — `wiremock` is
|
||||
//! async, so test fns are `#[tokio::test]` and the sync adapter is
|
||||
//! invoked from `spawn_blocking`.
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_config::Config;
|
||||
use kebab_core::{
|
||||
AssetId, BlockId, CommonBlock, ImageRefBlock, Lang, ProvenanceEvent, ProvenanceKind,
|
||||
SourceSpan,
|
||||
};
|
||||
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_ocr};
|
||||
use serde_json::json;
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
use crate::common::red_100x50_png;
|
||||
|
||||
fn cfg_for_endpoint(endpoint: &str) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.image.ocr.endpoint = Some(endpoint.to_string());
|
||||
cfg.image.ocr.model = "gemma4:e4b".to_string();
|
||||
cfg.image.ocr.languages = vec!["eng".to_string(), "kor".to_string()];
|
||||
cfg.image.ocr.max_pixels = 1024;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn run_recognize(
|
||||
cfg: Config,
|
||||
bytes: Vec<u8>,
|
||||
lang_hint: Option<Lang>,
|
||||
) -> anyhow::Result<kebab_core::OcrText> {
|
||||
let engine = OllamaVisionOcr::new(&cfg)?;
|
||||
engine.recognize(&bytes, lang_hint.as_ref())
|
||||
}
|
||||
|
||||
fn empty_image_block() -> ImageRefBlock {
|
||||
ImageRefBlock {
|
||||
common: CommonBlock {
|
||||
block_id: BlockId("0".repeat(32)),
|
||||
heading_path: Vec::new(),
|
||||
source_span: SourceSpan::Region {
|
||||
x: 0,
|
||||
y: 0,
|
||||
w: 100,
|
||||
h: 50,
|
||||
},
|
||||
},
|
||||
asset_id: Some(AssetId("a".repeat(32))),
|
||||
src: "img/x.png".to_string(),
|
||||
alt: "x.png".to_string(),
|
||||
ocr: None,
|
||||
caption: None,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Happy path ────────────────────────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn ocr_recognize_decodes_response_into_ocr_text() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(json!({
|
||||
"model": "gemma4:e4b",
|
||||
"response": "Hello World 2026",
|
||||
"done": true,
|
||||
"done_reason": "stop"
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
.expect("recognize must succeed");
|
||||
|
||||
assert_eq!(text.joined, "Hello World 2026");
|
||||
assert_eq!(text.engine, "ollama-vision");
|
||||
assert!(text.engine_version.starts_with("ollama/gemma4:e4b"));
|
||||
assert_eq!(text.regions.len(), 1, "non-empty joined → exactly one region");
|
||||
assert_eq!(text.regions[0].text, "Hello World 2026");
|
||||
assert!((text.regions[0].confidence - 1.0).abs() < 1e-6);
|
||||
// Region bbox covers prepared image dimensions (100×50 < max_pixels
|
||||
// 1024 so no downscale, dims preserved).
|
||||
assert_eq!(text.regions[0].bbox, (0, 0, 100, 50));
|
||||
}
|
||||
|
||||
// ── Empty response ────────────────────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn ocr_recognize_empty_response_yields_empty_regions() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(json!({
|
||||
"response": "",
|
||||
"done": true
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
.expect("recognize on empty response must succeed");
|
||||
|
||||
assert_eq!(text.joined, "");
|
||||
assert!(text.regions.is_empty(), "empty joined → no regions");
|
||||
assert_eq!(text.engine, "ollama-vision");
|
||||
}
|
||||
|
||||
// ── Server error mapping ──────────────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn ocr_recognize_500_response_returns_error() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(500).set_body_string("boom"))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
|
||||
.await
|
||||
.expect("blocking task panicked");
|
||||
assert!(r.is_err(), "5xx must surface as Err");
|
||||
let msg = format!("{:#}", r.unwrap_err());
|
||||
assert!(
|
||||
msg.contains("500") && msg.contains("boom"),
|
||||
"error must include status + body: {msg}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── error envelope on 200 stream ─────────────────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn ocr_recognize_error_envelope_on_200_returns_error() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(json!({
|
||||
"error": "model 'gemma4:e4b' not found"
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
let r = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
|
||||
.await
|
||||
.expect("blocking task panicked");
|
||||
assert!(r.is_err(), "server error envelope must surface");
|
||||
let msg = format!("{:#}", r.unwrap_err());
|
||||
assert!(
|
||||
msg.contains("not found"),
|
||||
"error must include server message: {msg}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── apply_ocr mutates block + appends provenance ─────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn apply_ocr_sets_block_ocr_and_appends_provenance() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(json!({
|
||||
"response": "안녕 2026",
|
||||
"done": true
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
|
||||
let (block, events) =
|
||||
tokio::task::spawn_blocking(move || -> anyhow::Result<_> {
|
||||
let engine = OllamaVisionOcr::new(&cfg)?;
|
||||
let mut block = empty_image_block();
|
||||
let mut events: Vec<ProvenanceEvent> = Vec::new();
|
||||
apply_ocr(
|
||||
&engine,
|
||||
&bytes,
|
||||
&mut block,
|
||||
Some(&Lang("ko".to_string())),
|
||||
&mut events,
|
||||
)?;
|
||||
Ok((block, events))
|
||||
})
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
.expect("apply_ocr must succeed");
|
||||
|
||||
let ocr = block.ocr.as_ref().expect("ocr Some after apply_ocr");
|
||||
assert_eq!(ocr.joined, "안녕 2026");
|
||||
assert_eq!(events.len(), 1);
|
||||
assert_eq!(events[0].kind, ProvenanceKind::OcrApplied);
|
||||
assert_eq!(events[0].agent, "kb-parse-image");
|
||||
let note = events[0].note.as_deref().unwrap_or("");
|
||||
assert!(
|
||||
note.contains("engine=ollama-vision") && note.contains("regions=1"),
|
||||
"provenance note must describe engine + region count: {note}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── apply_ocr error leaves block untouched ───────────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn apply_ocr_error_leaves_block_untouched() {
|
||||
let server = MockServer::start().await;
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(503))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let bytes = red_100x50_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
|
||||
let (block, events, err) = tokio::task::spawn_blocking(move || {
|
||||
let engine = OllamaVisionOcr::new(&cfg).expect("engine");
|
||||
let mut block = empty_image_block();
|
||||
let mut events: Vec<ProvenanceEvent> = Vec::new();
|
||||
let res = apply_ocr(&engine, &bytes, &mut block, None, &mut events);
|
||||
(block, events, res.err())
|
||||
})
|
||||
.await
|
||||
.expect("blocking task panicked");
|
||||
|
||||
assert!(err.is_some(), "503 must propagate as Err");
|
||||
assert!(
|
||||
block.ocr.is_none(),
|
||||
"block.ocr stays None when apply_ocr fails — partial state must not leak"
|
||||
);
|
||||
assert!(
|
||||
events.is_empty(),
|
||||
"no Provenance event when OCR fails — kb-normalize would otherwise lie about success"
|
||||
);
|
||||
}
|
||||
|
||||
// ── Downscale: large input shrinks before sending ─────────────────────────
|
||||
|
||||
#[tokio::test]
|
||||
async fn ocr_downscales_large_image_before_sending() {
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
// Capture the request body so we can pull out the base64 image and
|
||||
// measure its dimensions.
|
||||
let captured: Arc<Mutex<Option<Vec<u8>>>> = Arc::new(Mutex::new(None));
|
||||
|
||||
let server = MockServer::start().await;
|
||||
let cap = captured.clone();
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(move |req: &wiremock::Request| {
|
||||
let body = req.body.clone();
|
||||
*cap.lock().unwrap() = Some(body);
|
||||
ResponseTemplate::new(200).set_body_json(json!({
|
||||
"response": "ok",
|
||||
"done": true
|
||||
}))
|
||||
})
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
// 4000×3000 PNG (long edge 4000) — well above the cfg max 1024.
|
||||
let big = common::large_blue_4000x3000_png();
|
||||
let cfg = cfg_for_endpoint(&server.uri());
|
||||
let _ = tokio::task::spawn_blocking({
|
||||
let cfg = cfg.clone();
|
||||
move || run_recognize(cfg, big, None)
|
||||
})
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
.expect("recognize succeeds");
|
||||
|
||||
// Pull the request body, parse JSON, base64-decode the image, and
|
||||
// verify the long edge is at most max_pixels (1024).
|
||||
let raw = captured.lock().unwrap().clone().expect("request captured");
|
||||
let value: serde_json::Value =
|
||||
serde_json::from_slice(&raw).expect("request body is JSON");
|
||||
let imgs = value
|
||||
.get("images")
|
||||
.and_then(|v| v.as_array())
|
||||
.expect("images field present");
|
||||
assert_eq!(imgs.len(), 1, "exactly one image sent");
|
||||
let b64 = imgs[0].as_str().expect("image is base64 string");
|
||||
use base64::Engine as _;
|
||||
let decoded = base64::engine::general_purpose::STANDARD
|
||||
.decode(b64)
|
||||
.expect("base64 decodes");
|
||||
let reader = image::ImageReader::new(std::io::Cursor::new(decoded))
|
||||
.with_guessed_format()
|
||||
.expect("guess format");
|
||||
let (w, h) = reader.into_dimensions().expect("dims");
|
||||
let long = w.max(h);
|
||||
assert!(
|
||||
long <= 1024,
|
||||
"long edge after downscale must be <= max_pixels (got {long})"
|
||||
);
|
||||
// Aspect ratio preserved within rounding.
|
||||
let ratio_in = 4000.0 / 3000.0;
|
||||
let ratio_out = w as f32 / h as f32;
|
||||
assert!(
|
||||
(ratio_in - ratio_out).abs() < 0.02,
|
||||
"aspect ratio drift: in={ratio_in} out={ratio_out}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── from_parts construction ──────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn from_parts_clamps_max_pixels_into_legal_range() {
|
||||
|
claude-reviewer-01
commented
`from_parts_clamps_max_pixels_into_legal_range` 테스트는 "패닉 안 났음" 만 검증합니다 — 코멘트도 "can't read the private field directly" 라 인정. 두 가지 정리 방향이 있습니다:
1. (선호) `OllamaVisionOcr` 에 `pub fn max_pixels(&self) -> u32` 같은 inspector accessor 를 추가해 실제 클램프 결과 (`MIN_LONG_EDGE` / `MAX_LONG_EDGE` 와 동일) 를 검증.
2. 테스트 자체를 제거 — 클램프 동작은 이미 `ocr_downscales_large_image_before_sending` 가 max_pixels=1024 로 간접 검증하고 있고, 이 추가 테스트는 회귀 신호를 거의 못 줌.
어느 쪽이든 "패닉 안 났음 만 보는" 현 상태는 머지 전에 정리하면 좋겠습니다.
|
||||
// Below MIN_LONG_EDGE — bumped up to the floor.
|
||||
let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 10).unwrap();
|
||||
assert_eq!(
|
||||
too_small.max_pixels(),
|
||||
256,
|
||||
"max_pixels must be raised to MIN_LONG_EDGE"
|
||||
);
|
||||
|
||||
// Above MAX_LONG_EDGE — capped at the ceiling.
|
||||
let too_big =
|
||||
OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999).unwrap();
|
||||
assert_eq!(
|
||||
too_big.max_pixels(),
|
||||
4096,
|
||||
"max_pixels must be capped at MAX_LONG_EDGE"
|
||||
);
|
||||
|
||||
// Inside the legal range — pass through untouched.
|
||||
let in_range = OllamaVisionOcr::from_parts("http://x", "m", vec![], 1024).unwrap();
|
||||
assert_eq!(in_range.max_pixels(), 1024);
|
||||
}
|
||||
|
||||
// ── Integration test against real Ollama (opt-in) ────────────────────────
|
||||
|
||||
/// End-to-end OCR against the workspace's real Ollama daemon. Skipped
|
||||
/// by default via `#[ignore]` (matching the `kebab-llm-local`
|
||||
/// convention); a developer who explicitly opts in via `--ignored` is
|
||||
/// signalling they want the network call. Endpoint / model can still
|
||||
/// be overridden via env to point at a non-default Ollama host.
|
||||
///
|
||||
/// Run with:
|
||||
///
|
||||
/// ```sh
|
||||
/// KEBAB_IMAGE_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
/// cargo test -p kebab-parse-image --test ocr ocr_integration -- --ignored
|
||||
/// ```
|
||||
#[tokio::test]
|
||||
#[ignore = "hits a real Ollama daemon; opt in via `cargo test -- --ignored`"]
|
||||
async fn ocr_integration_real_ollama_transcribes_text() {
|
||||
|
claude-reviewer-01
commented
통합 테스트가 두 겹의 게이트를 가집니다 — `#[ignore]` (cargo 가 default 로 skip) + 함수 내부의 `KEBAB_OCR_INTEGRATION != "1"` 조기 return. `#[ignore]` 만으로 일반 `cargo test` 는 이미 skip 되고, `--ignored` 로 실행할 때는 사용자가 명시적으로 의도를 표현한 거니 env var 추가 검증이 "수단의 중복" 처럼 보입니다.
둘 중 하나로 통일을 권장:
1. (선호) env var 검증 제거 — `--ignored` 가 곧 "실행 의도" 의 단일 신호.
2. `#[ignore]` 제거하고 env var 만으로 게이팅 — `cargo test` 실행 시점에 env 검사로 자동 skip.
둘 다 동작은 같지만, 1번이 다른 워크스페이스 통합 테스트 (`kebab-llm-local`) 의 `#[ignore]` 패턴과 일관됩니다.
|
||||
let endpoint = std::env::var("KEBAB_IMAGE_OCR_ENDPOINT")
|
||||
.unwrap_or_else(|_| "http://192.168.0.47:11434".to_string());
|
||||
let model =
|
||||
std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string());
|
||||
|
||||
// Generate a fixture with known text. If the DejaVu font is
|
||||
// missing from this dev box, skip rather than crash.
|
||||
let bytes = match common::hello_world_png() {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
eprintln!("skipping ocr_integration: {e:#}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let cfg = {
|
||||
let mut c = Config::defaults();
|
||||
c.image.ocr.endpoint = Some(endpoint);
|
||||
c.image.ocr.model = model;
|
||||
c.image.ocr.max_pixels = 1024;
|
||||
c
|
||||
};
|
||||
let text = tokio::task::spawn_blocking(move || run_recognize(cfg, bytes, None))
|
||||
.await
|
||||
.expect("blocking task panicked")
|
||||
.expect("real Ollama OCR must succeed");
|
||||
eprintln!("integration OCR result: {:?}", text.joined);
|
||||
let normalized = text.joined.to_lowercase().replace(",", "").replace(".", "");
|
||||
assert!(
|
||||
normalized.contains("hello") && normalized.contains("world"),
|
||||
"integration OCR did not capture expected text: {:?}",
|
||||
text.joined
|
||||
);
|
||||
}
|
||||
@@ -14,6 +14,24 @@ historical contract that was implemented; this file accumulates the
|
||||
deltas so phase 5+ readers can find the live behavior without diffing
|
||||
git history.
|
||||
|
||||
## 2026-05-02 — P6-2 default OCR engine: Tesseract → Ollama-vision
|
||||
|
||||
**Discovered**: P6-2 implementation start.
|
||||
|
||||
**Symptom**: The original `tasks/p6/p6-2-ocr-adapter.md` spec lists Tesseract as the default OCR engine (`tesseract = "0.13"`, feature `tesseract`, default ON). Bringing Tesseract online requires installing `libtesseract-dev` (and `tesseract-ocr-kor` for the spec-default Korean languages set) on every dev / CI host. The kebab dev environment intentionally avoids system-package installs, so the Tesseract Rust bindings can't link.
|
||||
|
||||
**Root cause**: The spec was written assuming a Linux host with `apt install tesseract-ocr-*` available. In practice, a single-developer, local-first KB runs on the same box as the Ollama vision endpoint already wired by P4-2, so using that endpoint for OCR adds zero new system dependencies.
|
||||
|
||||
**Fix** (PR #33, feat/p6-2-ocr-adapter):
|
||||
- New `OllamaVisionOcr` adapter under `crates/kebab-parse-image/src/ocr.rs`. Implements the spec's `OcrEngine` trait by POSTing the image (base64) to `<endpoint>/api/generate` with a transcription prompt against `gemma4:e4b` (default) or any other vision-capable Ollama model.
|
||||
- New `kebab-config::ImageCfg.ocr` block (`enabled`, `engine`, `model`, `endpoint`, `languages`, `max_pixels`). `enabled` defaults to `false` because OCR adds a model call per asset; `engine` defaults to `"ollama-vision"`. `endpoint` falls back to `models.llm.endpoint` when empty so the same Ollama host serves both LLM and OCR.
|
||||
- The `OcrEngine` trait is unchanged from the spec — Tesseract / Apple Vision / PaddleOCR engines plug in as future feature-gated alternatives without touching the extractor or chunker. The trait abstraction is the part the spec actually demanded; only the choice of default implementation changes.
|
||||
- Tests cover wiremock unit paths (200 happy / 5xx / 200 error envelope / empty response / downscale honours `max_pixels`), `apply_ocr` provenance + error handling, and an opt-in `#[ignore]`d integration test (run with `cargo test -- --ignored`) that hits a real Ollama endpoint with a generated `"Hello World 2026"` PNG. Tesseract feature-gated tests from the original spec are deferred to whenever someone is willing to bring `libtesseract` to CI.
|
||||
|
||||
**Trust note**: The original spec marked `OcrText` as "observed text (high trust)" to distinguish it from `ModelCaption`. With an LLM-driven default the line blurs — vision LMs can hallucinate. We kept `OcrText.engine = "ollama-vision"` so consumers can decide trust by engine identity. Future Tesseract / Apple Vision adapters write a different `engine` string and downstream code can branch.
|
||||
|
||||
**Amends**: tasks/p6/p6-2-ocr-adapter.md (default engine; "Allowed dependencies" list — `reqwest` + `base64` replace `tesseract`; "Apple Vision" feature gate deferred; `min_confidence` config field dropped because the LM doesn't expose per-region confidence).
|
||||
|
||||
## 2026-05-01 — `--config` flag silently ignored across all kebab-cli subcommands
|
||||
|
||||
**Discovered**: post-P3-5 manual smoke at `/tmp/kebab-smoke/`.
|
||||
|
||||
@@ -3,7 +3,7 @@ phase: P6
|
||||
component: kebab-parse-image (OCR adapter)
|
||||
task_id: p6-2
|
||||
title: "OcrEngine trait + Tesseract adapter (Apple Vision feature-gated)"
|
||||
status: planned
|
||||
status: completed
|
||||
depends_on: [p6-1]
|
||||
unblocks: [p6-3]
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
|
||||
`OcrCfg.endpoint: String` 에서 빈 문자열 `""` 을 "`models.llm.endpoint` 로 fallback" 시그널로 사용합니다. 의미가 "실제로 빈 endpoint" 와 "명시되지 않음" 을 구분 못 하는 string-typed boolean 입니다. `Option<String>` 으로 바꾸면 의도가 타입에 새겨집니다:

TOML 측에선 `endpoint` 키 생략 → `None`, 명시 → `Some(...)`. `OllamaVisionOcr::new` 의 fallback 로직도 `match cfg.image.ocr.endpoint.as_deref() { Some(s) if !s.is_empty() => s, _ => &cfg.models.llm.endpoint }` 로 더 정직해집니다.

사소하지만 P6-3 의 `caption.endpoint` 도 같은 길을 갈 텐데, 첫 케이스에서 컨벤션을 잡는 게 비용이 작습니다.