From 901416d8e986f7d48fa378bfd0528ceb735541da Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 4 Jun 2026 08:15:30 +0000 Subject: [PATCH] =?UTF-8?q?feat(ocr):=20T7-T9=20=E2=80=94=20config=20overr?= =?UTF-8?q?ides=20+=20engine=20factory=20+=20signature=20cascade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit T7: OcrCfg gains det_model/rec_model/dict overrides + score_thresh/ unclip_ratio/max_boxes (serde default, KEBAB_IMAGE_OCR_* env). OnnxPaddleOcr::new threads them via ModelPaths::from_config. T8: build_image_ocr_engine / build_pdf_ocr_engine factories return Box; match on engine string (ollama-vision|paddle-onnx|err). ImagePipeline.ocr_engine + pdf_ocr_engine signatures switched to &dyn OcrEngine. OcrEngine gains model() for the progress label. T9: ingest_config_signature image/pdf branches emit |ocr:1:{engine}:{engine_version} (memoized blake3 per asset-triple, m3-safe). Unit tests (a)(b)(c) added. Co-Authored-By: Claude Opus 4.8 --- crates/kebab-app/src/lib.rs | 249 ++++++++++++++++++-- crates/kebab-app/tests/common/mock_ocr.rs | 4 + crates/kebab-config/src/lib.rs | 77 ++++++ crates/kebab-parse-image/src/lib.rs | 4 +- crates/kebab-parse-image/src/ocr.rs | 18 +- crates/kebab-parse-image/src/paddle_onnx.rs | 60 ++++- 6 files changed, 373 insertions(+), 39 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 6891b62..fe16940 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -52,7 +52,10 @@ use kebab_core::{ SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore, }; use kebab_llm_local::OllamaLanguageModel; -use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr}; +use kebab_parse_image::{ + OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, + apply_caption, apply_ocr, engine_version_for_config, +}; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -357,8 +360,8 @@ pub fn ingest_with_config_opts( // loop is correct and cheap. Construction failure (e.g. invalid // endpoint) aborts ingest fail-fast — better than silently disabling // OCR/caption mid-run. - let ocr_engine: Option = if app.config.image.ocr.enabled { - Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?) + let ocr_engine: Option> = if app.config.image.ocr.enabled { + Some(build_image_ocr_engine(&app.config).context("kb-app::ingest: build image OCR engine")?) } else { None }; @@ -370,28 +373,17 @@ pub fn ingest_with_config_opts( None }; let image_pipeline = ImagePipeline { - ocr_engine: ocr_engine.as_ref(), + ocr_engine: ocr_engine.as_deref(), caption_llm: caption_llm.as_deref(), }; // p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution). // image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast. - let pdf_ocr_engine: Option = + let pdf_ocr_engine: Option> = if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on { - let cfg = &app.config.pdf.ocr; - let endpoint = match cfg.endpoint.as_deref() { - Some(s) if !s.is_empty() => s.to_string(), - _ => app.config.models.llm.endpoint.clone(), - }; Some( - OllamaVisionOcr::from_parts( - endpoint, - cfg.model.clone(), - cfg.languages.clone(), - cfg.max_pixels, - cfg.request_timeout_secs, - ) - .context("kb-app::ingest: build OllamaVisionOcr (pdf)")?, + build_pdf_ocr_engine(&app.config) + .context("kb-app::ingest: build pdf OCR engine")?, ) } else { None @@ -488,7 +480,7 @@ pub fn ingest_with_config_opts( &existing_doc_ids, &image_pipeline, force_reingest, - pdf_ocr_engine.as_ref(), + pdf_ocr_engine.as_deref(), progress, opts.cancel.as_ref(), log_writer.clone(), @@ -832,11 +824,73 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String { /// `<… as JobRepo>` to be explicit. type SqliteStoreAlias = kebab_store_sqlite::SqliteStore; +/// v0.27.0 (T8): build the image OCR engine selected by +/// `config.image.ocr.engine`. Returns a boxed trait object so the ingest +/// pipeline is engine-agnostic. Construction is fail-fast (model load / +/// hash / endpoint validation) — mirrors the prior concrete-type behaviour. +/// +/// `--config` facade: the caller threads the explicit [`kebab_config::Config`] +/// in, so `OnnxPaddleOcr::new` honours `image.ocr.{det_model,rec_model,dict,…}` +/// overrides resolved from that config (not a re-loaded XDG default). +fn build_image_ocr_engine( + config: &kebab_config::Config, +) -> anyhow::Result> { + match config.image.ocr.engine.as_str() { + OLLAMA_VISION_ENGINE => Ok(Box::new( + OllamaVisionOcr::new(config).context("build OllamaVisionOcr")?, + )), + PADDLE_ONNX_ENGINE => Ok(Box::new( + OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr")?, + )), + other => anyhow::bail!( + "unknown image.ocr.engine {other:?}; expected \ + {OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}" + ), + } +} + +/// v0.27.0 (T8): build the PDF OCR engine selected by +/// `config.pdf.ocr.engine`. The ollama-vision arm uses the PDF-specific +/// `model` / `languages` / `max_pixels` / `request_timeout_secs` knobs (and +/// endpoint fallback to `models.llm.endpoint`). The paddle-onnx arm shares +/// the same bundled ONNX models as image OCR (resolved from `image.ocr` +/// overrides) — PaddleOCR is page-agnostic and carries no per-engine prompt. +fn build_pdf_ocr_engine( + config: &kebab_config::Config, +) -> anyhow::Result> { + match config.pdf.ocr.engine.as_str() { + OLLAMA_VISION_ENGINE => { + let cfg = &config.pdf.ocr; + let endpoint = match cfg.endpoint.as_deref() { + Some(s) if !s.is_empty() => s.to_string(), + _ => config.models.llm.endpoint.clone(), + }; + Ok(Box::new( + OllamaVisionOcr::from_parts( + endpoint, + cfg.model.clone(), + cfg.languages.clone(), + cfg.max_pixels, + cfg.request_timeout_secs, + ) + .context("build OllamaVisionOcr (pdf)")?, + )) + } + PADDLE_ONNX_ENGINE => Ok(Box::new( + OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr (pdf)")?, + )), + other => anyhow::bail!( + "unknown pdf.ocr.engine {other:?}; expected \ + {OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}" + ), + } +} + /// P6-4: borrowed bundle of the three image-pipeline components built /// once per ingest invocation. Threaded through `ingest_one_asset` so /// the dispatch does not need ten separate parameters. struct ImagePipeline<'a> { - ocr_engine: Option<&'a OllamaVisionOcr>, + ocr_engine: Option<&'a dyn OcrEngine>, caption_llm: Option<&'a dyn LanguageModel>, } @@ -1110,7 +1164,7 @@ fn ingest_one_asset( existing_doc_ids: &std::collections::HashSet, image_pipeline: &ImagePipeline<'_>, force_reingest: bool, - pdf_ocr_engine: Option<&OllamaVisionOcr>, + pdf_ocr_engine: Option<&dyn OcrEngine>, progress: Option<&std::sync::mpsc::Sender>, cancel: Option<&std::sync::Arc>, log_writer: Option>>, @@ -2093,7 +2147,7 @@ fn ingest_one_pdf_asset( vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, force_reingest: bool, - pdf_ocr_engine: Option<&OllamaVisionOcr>, + pdf_ocr_engine: Option<&dyn OcrEngine>, progress: Option<&std::sync::mpsc::Sender>, cancel: Option<&std::sync::Arc>, log_writer: Option>>, @@ -3017,6 +3071,50 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy { /// The output is purely a comparison token — it is never parsed back, so the /// exact format is internal. Field order is fixed and `Vec`s are joined so /// the same `Config` always yields the same string. +/// Process-wide memo of the paddle-onnx `engine_version`, keyed by the +/// resolved (det,rec,dict) override triple. Hashing the ~17 MB of model bytes +/// happens once per triple per process (m3 — never re-hash per asset); the +/// per-asset [`ingest_config_signature`] calls hit this cache. +static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock< + std::sync::Mutex>, +> = std::sync::OnceLock::new(); + +/// T9: resolve the OCR `engine_version` string used inside the ingest config +/// signature. ollama-vision is self-describing from `engine/model` (cheap, no +/// I/O). paddle-onnx hashes the bundled/override model assets (memoized). +fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String { + if engine != PADDLE_ONNX_ENGINE { + // ollama-vision (and any non-paddle engine): the daemon exposes no + // stable per-model revision, so engine/model is the identity. + return format!("ollama/{model}"); + } + let ocr = &config.image.ocr; + let key = format!( + "{}|{}|{}", + ocr.det_model.as_deref().unwrap_or(""), + ocr.rec_model.as_deref().unwrap_or(""), + ocr.dict.as_deref().unwrap_or(""), + ); + let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new())); + if let Some(v) = memo.lock().unwrap().get(&key) { + return v.clone(); + } + // First call for this triple in this process: hash once. In any real + // ingest the engine was already built (fail-fast) so the assets are + // present and this succeeds; the path-derived identity below is an + // unreachable-in-practice guard that keeps the signature total. + let version = engine_version_for_config(config).unwrap_or_else(|e| { + tracing::warn!( + target: "kebab-app::ingest", + error = %e, + "paddle-onnx engine_version hash failed; using path-derived identity for signature" + ); + format!("ppocrv5-mobile-kor-paths:{key}") + }); + memo.lock().unwrap().insert(key, version.clone()); + version +} + fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String { // Common (every media type): chunking parameters that move chunk // boundaries. `target_tokens` / `overlap_tokens` change re-chunking for @@ -3033,7 +3131,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> // a stable empty token so re-running the same config skips. let ocr = &config.image.ocr; if ocr.enabled { - sig.push_str(&format!("|ocr:1:{}", ocr.model)); + // v0.27.0 (T9): engine + engine_version so switching engine + // (ollama-vision ↔ paddle-onnx) OR changing the model/assets + // invalidates downstream chunks (design §9 cascade). + sig.push_str(&format!( + "|ocr:1:{}:{}", + ocr.engine, + ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model) + )); } else { sig.push_str("|ocr:0"); } @@ -3049,9 +3154,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> // (mirrors the ingest gate). `model` only matters when active. let ocr = &config.pdf.ocr; if ocr.enabled || ocr.always_on { + // v0.27.0 (T9): engine + engine_version (same cascade rule as + // image OCR above) alongside the enabled/always_on gate. sig.push_str(&format!( - "|pdfocr:{}:{}:{}", - ocr.enabled, ocr.always_on, ocr.model + "|pdfocr:{}:{}:{}:{}", + ocr.enabled, + ocr.always_on, + ocr.engine, + ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model) )); } else { sig.push_str("|pdfocr:0"); @@ -3816,4 +3926,93 @@ mod ingest_config_signature_tests { ); } } + + // ── v0.27.0 (T9): engine + engine_version cascade ───────────────────── + + /// (a) Switching the engine (ollama-vision → paddle-onnx) with the SAME + /// model id changes the image signature — different engines produce + /// different output even from an identically-named model. + #[test] + fn image_ocr_engine_switch_invalidates_image() { + let mut ollama = Config::defaults(); + ollama.image.ocr.enabled = true; + // same `model` string on both — only the engine differs + let mut paddle = ollama.clone(); + paddle.image.ocr.engine = "paddle-onnx".to_string(); + assert_ne!( + ingest_config_signature(&ollama, &img()), + ingest_config_signature(&paddle, &img()), + "engine switch with identical model must invalidate images" + ); + } + + /// (b) A different engine_version (here: a different ollama model id, which + /// the signature folds into `ollama/{model}`) changes the image signature. + #[test] + fn image_ocr_engine_version_change_invalidates_image() { + let mut a = Config::defaults(); + a.image.ocr.enabled = true; + a.image.ocr.model = "gemma4:e4b".to_string(); + let mut b = a.clone(); + b.image.ocr.model = "qwen2.5vl:3b".to_string(); + assert_ne!( + ingest_config_signature(&a, &img()), + ingest_config_signature(&b, &img()), + "engine_version change must invalidate images" + ); + } + + /// (b') For the paddle-onnx engine, pointing at a different model asset + /// (override path) yields a different engine_version → different signature. + #[test] + fn image_ocr_paddle_model_path_change_invalidates_image() { + let mut base = Config::defaults(); + base.image.ocr.enabled = true; + base.image.ocr.engine = "paddle-onnx".to_string(); + let mut overridden = base.clone(); + overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string()); + assert_ne!( + ingest_config_signature(&base, &img()), + ingest_config_signature(&overridden, &img()), + "paddle-onnx model path change must invalidate images" + ); + } + + /// (c) Unrelated settings leave the paddle-onnx image signature stable + /// (engine_version is memoized + deterministic for a fixed asset triple). + #[test] + fn paddle_image_signature_stable_for_unrelated_change() { + let mut base = Config::defaults(); + base.image.ocr.enabled = true; + base.image.ocr.engine = "paddle-onnx".to_string(); + let mut other = base.clone(); + other.search.default_k += 3; + other.image.ocr.max_pixels += 100; // runtime-only knob + assert_eq!( + ingest_config_signature(&base, &img()), + ingest_config_signature(&other, &img()), + "unrelated/runtime-only changes must not invalidate paddle images" + ); + } + + /// PDF OCR: engine switch with the same model invalidates pdf only. + #[test] + fn pdf_ocr_engine_switch_invalidates_pdf() { + let mut ollama = Config::defaults(); + ollama.pdf.ocr.enabled = true; + let mut paddle = ollama.clone(); + paddle.pdf.ocr.engine = "paddle-onnx".to_string(); + assert_ne!( + ingest_config_signature(&ollama, &pdf()), + ingest_config_signature(&paddle, &pdf()), + "pdf engine switch must invalidate pdf" + ); + for m in [md(), img(), code()] { + assert_eq!( + ingest_config_signature(&ollama, &m), + ingest_config_signature(&paddle, &m), + "pdf engine switch must NOT touch {m:?}" + ); + } + } } diff --git a/crates/kebab-app/tests/common/mock_ocr.rs b/crates/kebab-app/tests/common/mock_ocr.rs index 3632214..144e88d 100644 --- a/crates/kebab-app/tests/common/mock_ocr.rs +++ b/crates/kebab-app/tests/common/mock_ocr.rs @@ -39,6 +39,10 @@ impl OcrEngine for MockOcrEngine { "mock-v1".to_string() } + fn model(&self) -> &str { + "mock-model" + } + fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result { if self.fail { anyhow::bail!("mock failure"); diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index cec773e..0c23e66 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -377,6 +377,36 @@ pub struct OcrCfg { /// `86400`). #[serde(default = "default_ocr_request_timeout_secs")] pub request_timeout_secs: u64, + + // ── paddle-onnx engine overrides (v0.27.0) ────────────────────────── + // Only consulted when `engine == "paddle-onnx"`; the ollama-vision + // engine ignores them. All `#[serde(default)]` so pre-v0.27 config + // files load unchanged. + /// Override path to the detection ONNX model. `None` → bundled + /// `assets/paddleocr-onnx/ppocrv5_mobile_det.onnx` (or the directory + /// named by `KEBAB_IMAGE_OCR_MODEL_DIR`). + #[serde(default)] + pub det_model: Option, + /// Override path to the recognition ONNX model. `None` → bundled + /// `assets/paddleocr-onnx/korean_ppocrv5_mobile_rec.onnx`. + #[serde(default)] + pub rec_model: Option, + /// Override path to the character dictionary. `None` → bundled + /// `assets/paddleocr-onnx/korean_dict.txt`. + #[serde(default)] + pub dict: Option, + /// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean + /// probability is below this are dropped. Default `0.3`. + #[serde(default = "default_ocr_score_thresh")] + pub score_thresh: f32, + /// Polygon unclip ratio applied to each detected box before crop. + /// Larger = more padding around the text. Default `1.5`. + #[serde(default = "default_ocr_unclip_ratio")] + pub unclip_ratio: f32, + /// Hard cap on detected boxes per image (runaway guard). Extra boxes + /// past this count are truncated with a warning. Default `1000`. + #[serde(default = "default_ocr_max_boxes")] + pub max_boxes: usize, } impl OcrCfg { @@ -389,10 +419,29 @@ impl OcrCfg { languages: vec!["eng".to_string(), "kor".to_string()], max_pixels: 1600, request_timeout_secs: default_ocr_request_timeout_secs(), + det_model: None, + rec_model: None, + dict: None, + score_thresh: default_ocr_score_thresh(), + unclip_ratio: default_ocr_unclip_ratio(), + max_boxes: default_ocr_max_boxes(), } } } +/// paddle-onnx DBNet box score threshold default. See [`OcrCfg::score_thresh`]. +fn default_ocr_score_thresh() -> f32 { + 0.3 +} +/// paddle-onnx unclip ratio default. See [`OcrCfg::unclip_ratio`]. +fn default_ocr_unclip_ratio() -> f32 { + 1.5 +} +/// paddle-onnx box-count cap default. See [`OcrCfg::max_boxes`]. +fn default_ocr_max_boxes() -> usize { + 1000 +} + /// v0.17.2 post-dogfood: matches the legacy hard-coded ceiling so /// existing configs that omit the field keep behaving identically. /// Overridable per config / `KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS`. @@ -1098,6 +1147,34 @@ impl Config { self.image.ocr.request_timeout_secs = n; } } + // paddle-onnx engine overrides (v0.27.0). Empty string → None + // (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR). + "KEBAB_IMAGE_OCR_DET_MODEL" => { + self.image.ocr.det_model = + if v.is_empty() { None } else { Some(v.clone()) }; + } + "KEBAB_IMAGE_OCR_REC_MODEL" => { + self.image.ocr.rec_model = + if v.is_empty() { None } else { Some(v.clone()) }; + } + "KEBAB_IMAGE_OCR_DICT" => { + self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) }; + } + "KEBAB_IMAGE_OCR_SCORE_THRESH" => { + if let Ok(f) = v.parse::() { + self.image.ocr.score_thresh = f; + } + } + "KEBAB_IMAGE_OCR_UNCLIP_RATIO" => { + if let Ok(f) = v.parse::() { + self.image.ocr.unclip_ratio = f; + } + } + "KEBAB_IMAGE_OCR_MAX_BOXES" => { + if let Ok(n) = v.parse::() { + self.image.ocr.max_boxes = n; + } + } // image.caption (P6-3) "KEBAB_IMAGE_CAPTION_ENABLED" => { diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index de9fd92..ab93677 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -33,8 +33,8 @@ pub mod ocr; pub mod paddle_onnx; pub use caption::{apply_caption, caption_image}; -pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr}; -pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE}; +pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr}; +pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config}; use anyhow::{Context, Result}; use kebab_core::{ diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index 64ea1dd..f604cbb 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -65,6 +65,13 @@ pub trait OcrEngine: Send + Sync { /// through to engines that benefit from it (Tesseract languages, /// LLM prompt steering); ignore otherwise. fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result; + + /// Human-facing model label for the ingest progress display + /// (`AssetPhase{phase:"ocr", model}`). Distinct from + /// [`engine_version`](Self::engine_version), which is the cache-key + /// hash. E.g. `"gemma4:e4b"` (ollama-vision) or `"ppocrv5-mobile-kor"` + /// (paddle-onnx). + fn model(&self) -> &str; } /// Mutate `block.ocr` in place by running `engine` over `image_bytes`, @@ -209,13 +216,6 @@ impl OllamaVisionOcr { self.max_pixels } - /// The Ollama model id this engine drives (e.g. `gemma4:e4b`). - /// Surfaced so the ingest progress display can name the model - /// running a slow OCR phase (`AssetPhase{phase:"ocr", model}`). - pub fn model(&self) -> &str { - &self.model - } - fn build_prompt(&self, lang_hint: Option<&Lang>) -> String { let langs = if self.languages.is_empty() { "any".to_string() @@ -247,6 +247,10 @@ impl OcrEngine for OllamaVisionOcr { format!("ollama/{}", self.model) } + fn model(&self) -> &str { + &self.model + } + fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result { let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels) .context("preparing image for OCR")?; diff --git a/crates/kebab-parse-image/src/paddle_onnx.rs b/crates/kebab-parse-image/src/paddle_onnx.rs index 54f2aaf..df4aba5 100644 --- a/crates/kebab-parse-image/src/paddle_onnx.rs +++ b/crates/kebab-parse-image/src/paddle_onnx.rs @@ -111,6 +111,22 @@ impl ModelPaths { dict: dir.join("korean_dict.txt"), } } + + /// Resolve model paths from the `image.ocr` config (T7). Each of + /// `det_model` / `rec_model` / `dict` overrides the corresponding bundled + /// path when set; unset fields fall back to [`from_default_dir`], so a + /// caller can override just one asset. + /// + /// [`from_default_dir`]: ModelPaths::from_default_dir + pub fn from_config(config: &kebab_config::Config) -> Self { + let defaults = Self::from_default_dir(); + let ocr = &config.image.ocr; + Self { + det: ocr.det_model.as_ref().map(PathBuf::from).unwrap_or(defaults.det), + rec: ocr.rec_model.as_ref().map(PathBuf::from).unwrap_or(defaults.rec), + dict: ocr.dict.as_ref().map(PathBuf::from).unwrap_or(defaults.dict), + } + } } impl OnnxPaddleOcr { @@ -119,13 +135,14 @@ impl OnnxPaddleOcr { /// Construction loads both ONNX sessions and hashes the assets — failures /// here are fail-fast (matches the Ollama adapter's construction contract). pub fn new(config: &kebab_config::Config) -> Result { - let paths = ModelPaths::from_default_dir(); + let paths = ModelPaths::from_config(config); + let ocr = &config.image.ocr; Self::from_paths( &paths, - 0.3, - 1.5, - 1000, - config.image.ocr.max_pixels, + ocr.score_thresh, + ocr.unclip_ratio, + ocr.max_boxes, + ocr.max_pixels, ) } @@ -209,6 +226,12 @@ impl OcrEngine for OnnxPaddleOcr { self.engine_version.clone() } + fn model(&self) -> &str { + // Static label for the progress display; the per-asset hash lives + // in `engine_version`. + "ppocrv5-mobile-kor" + } + fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result { let img = image::load_from_memory(image_bytes) .context("decoding image for OCR")? @@ -430,6 +453,15 @@ fn load_dict(path: &Path) -> Result> { Ok(lines) } +/// Resolve the paddle-onnx `engine_version` for `config` without loading the +/// ONNX sessions (T9). This is the same blake3-over-assets string that a +/// constructed [`OnnxPaddleOcr`] exposes via [`OcrEngine::engine_version`], so +/// the ingest config signature can include it. Reads ~17 MB of model bytes — +/// callers MUST memoize per (det,rec,dict) triple (m3: never re-hash per asset). +pub fn engine_version_for_config(config: &kebab_config::Config) -> Result { + compute_engine_version(&ModelPaths::from_config(config)) +} + /// blake3 over det + rec + dict bytes → stable `engine_version`. fn compute_engine_version(paths: &ModelPaths) -> Result { let mut hasher = blake3::Hasher::new(); @@ -802,6 +834,24 @@ mod tests { assert!((hi - 20.0).abs() < 1e-3, "long side {hi}"); } + #[test] + fn model_paths_from_config_uses_overrides() { + // T7: unset overrides → bundled default asset paths. + let mut cfg = kebab_config::Config::defaults(); + let def = ModelPaths::from_config(&cfg); + assert!(def.det.ends_with("ppocrv5_mobile_det.onnx"), "{:?}", def.det); + assert!(def.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", def.rec); + assert!(def.dict.ends_with("korean_dict.txt"), "{:?}", def.dict); + + // Override det + dict; rec stays bundled (partial override allowed). + cfg.image.ocr.det_model = Some("/custom/det.onnx".to_string()); + cfg.image.ocr.dict = Some("/custom/dict.txt".to_string()); + let ov = ModelPaths::from_config(&cfg); + assert_eq!(ov.det, PathBuf::from("/custom/det.onnx")); + assert_eq!(ov.dict, PathBuf::from("/custom/dict.txt")); + assert!(ov.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", ov.rec); + } + #[test] fn unclip_expands_box() { let rect = RotRect {