feat(ocr): T7-T9 — config overrides + engine factory + signature cascade

T7: OcrCfg gains det_model/rec_model/dict overrides + score_thresh/
unclip_ratio/max_boxes (serde default, KEBAB_IMAGE_OCR_* env). OnnxPaddleOcr::new
threads them via ModelPaths::from_config.
T8: build_image_ocr_engine / build_pdf_ocr_engine factories return
Box<dyn OcrEngine>; match on engine string (ollama-vision|paddle-onnx|err).
ImagePipeline.ocr_engine + pdf_ocr_engine signatures switched to &dyn OcrEngine.
OcrEngine gains model() for the progress label.
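T9: ingest_config_signature folds {engine}:{engine_version} into the OCR tokens:
|ocr:1:{engine}:{engine_version} for images and
|pdfocr:{enabled}:{always_on}:{engine}:{engine_version} for PDFs (paddle-onnx
engine_version is a blake3 hash memoized per asset-triple, m3-safe). Unit tests (a)(b)(c) added.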

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-04 08:15:30 +00:00
parent b706e3e88c
commit 901416d8e9
6 changed files with 373 additions and 39 deletions

View File

@@ -52,7 +52,10 @@ use kebab_core::{
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_image::{
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
apply_caption, apply_ocr, engine_version_for_config,
};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -357,8 +360,8 @@ pub fn ingest_with_config_opts(
// loop is correct and cheap. Construction failure (e.g. invalid
// endpoint) aborts ingest fail-fast — better than silently disabling
// OCR/caption mid-run.
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
let ocr_engine: Option<Box<dyn OcrEngine>> = if app.config.image.ocr.enabled {
Some(build_image_ocr_engine(&app.config).context("kb-app::ingest: build image OCR engine")?)
} else {
None
};
@@ -370,28 +373,17 @@ pub fn ingest_with_config_opts(
None
};
let image_pipeline = ImagePipeline {
ocr_engine: ocr_engine.as_ref(),
ocr_engine: ocr_engine.as_deref(),
caption_llm: caption_llm.as_deref(),
};
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
// image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast.
let pdf_ocr_engine: Option<OllamaVisionOcr> =
let pdf_ocr_engine: Option<Box<dyn OcrEngine>> =
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
let cfg = &app.config.pdf.ocr;
let endpoint = match cfg.endpoint.as_deref() {
Some(s) if !s.is_empty() => s.to_string(),
_ => app.config.models.llm.endpoint.clone(),
};
Some(
OllamaVisionOcr::from_parts(
endpoint,
cfg.model.clone(),
cfg.languages.clone(),
cfg.max_pixels,
cfg.request_timeout_secs,
)
.context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
build_pdf_ocr_engine(&app.config)
.context("kb-app::ingest: build pdf OCR engine")?,
)
} else {
None
@@ -488,7 +480,7 @@ pub fn ingest_with_config_opts(
&existing_doc_ids,
&image_pipeline,
force_reingest,
pdf_ocr_engine.as_ref(),
pdf_ocr_engine.as_deref(),
progress,
opts.cancel.as_ref(),
log_writer.clone(),
@@ -832,11 +824,73 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
/// `<… as JobRepo>` to be explicit.
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// v0.27.0 (T8): construct the image OCR engine named by
/// `config.image.ocr.engine` and hand it back as a boxed [`OcrEngine`], so
/// the rest of the ingest path never names a concrete engine type.
///
/// The caller's explicit [`kebab_config::Config`] is passed straight to the
/// engine constructor; for `paddle-onnx` that is how the
/// `image.ocr.{det_model,rec_model,dict,…}` overrides reach
/// `OnnxPaddleOcr::new`.
///
/// # Errors
///
/// Fails fast when the selected engine cannot be constructed, or when the
/// engine string is neither [`OLLAMA_VISION_ENGINE`] nor
/// [`PADDLE_ONNX_ENGINE`].
fn build_image_ocr_engine(
    config: &kebab_config::Config,
) -> anyhow::Result<Box<dyn OcrEngine>> {
    let requested = config.image.ocr.engine.as_str();
    let engine: Box<dyn OcrEngine> = match requested {
        OLLAMA_VISION_ENGINE => {
            let ollama = OllamaVisionOcr::new(config).context("build OllamaVisionOcr")?;
            Box::new(ollama)
        }
        PADDLE_ONNX_ENGINE => {
            let paddle = OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr")?;
            Box::new(paddle)
        }
        other => anyhow::bail!(
            "unknown image.ocr.engine {other:?}; expected \
             {OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
        ),
    };
    Ok(engine)
}
/// v0.27.0 (T8): construct the PDF OCR engine named by
/// `config.pdf.ocr.engine`.
///
/// * `ollama-vision` is assembled from the PDF-specific `model`,
///   `languages`, `max_pixels` and `request_timeout_secs` settings. Its
///   endpoint is `pdf.ocr.endpoint` when that is set and non-empty, otherwise
///   `models.llm.endpoint`.
/// * `paddle-onnx` goes through `OnnxPaddleOcr::new(config)`, i.e. the same
///   constructor the image path uses.
///
/// # Errors
///
/// Fails when the selected engine cannot be constructed or the engine string
/// is not one of the two known values.
fn build_pdf_ocr_engine(
    config: &kebab_config::Config,
) -> anyhow::Result<Box<dyn OcrEngine>> {
    let pdf_ocr = &config.pdf.ocr;
    let engine: Box<dyn OcrEngine> = match pdf_ocr.engine.as_str() {
        OLLAMA_VISION_ENGINE => {
            // An unset or empty PDF endpoint falls back to the shared LLM endpoint.
            let endpoint = pdf_ocr
                .endpoint
                .as_deref()
                .filter(|candidate| !candidate.is_empty())
                .map_or_else(
                    || config.models.llm.endpoint.clone(),
                    |candidate| candidate.to_string(),
                );
            let ollama = OllamaVisionOcr::from_parts(
                endpoint,
                pdf_ocr.model.clone(),
                pdf_ocr.languages.clone(),
                pdf_ocr.max_pixels,
                pdf_ocr.request_timeout_secs,
            )
            .context("build OllamaVisionOcr (pdf)")?;
            Box::new(ollama)
        }
        PADDLE_ONNX_ENGINE => {
            let paddle = OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr (pdf)")?;
            Box::new(paddle)
        }
        other => anyhow::bail!(
            "unknown pdf.ocr.engine {other:?}; expected \
             {OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
        ),
    };
    Ok(engine)
}
/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
struct ImagePipeline<'a> {
ocr_engine: Option<&'a OllamaVisionOcr>,
ocr_engine: Option<&'a dyn OcrEngine>,
caption_llm: Option<&'a dyn LanguageModel>,
}
@@ -1110,7 +1164,7 @@ fn ingest_one_asset(
existing_doc_ids: &std::collections::HashSet<String>,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
@@ -2093,7 +2147,7 @@ fn ingest_one_pdf_asset(
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
@@ -3017,6 +3071,50 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
/// The output is purely a comparison token — it is never parsed back, so the
/// exact format is internal. Field order is fixed and `Vec`s are joined so
/// the same `Config` always yields the same string.
// NOTE(review): the `///` lines directly above this item read like the tail
// of `ingest_config_signature`'s documentation. With this static placed
// immediately beneath them, rustdoc would attach them here rather than to
// the function — confirm, and move those lines back next to the function.
/// Process-wide memo of the paddle-onnx `engine_version`, keyed by the
/// `det|rec|dict` override triple from `image.ocr` (an unset override is
/// spelled `<bundled>` in the key). It exists so the ~17 MB of model bytes
/// are not re-hashed for every asset (m3): per-asset
/// [`ingest_config_signature`] calls reuse the stored string.
///
/// Lazily created on first use; the map maps key string → version string.
static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
std::sync::Mutex<std::collections::HashMap<String, String>>,
> = std::sync::OnceLock::new();
/// T9: resolve the OCR `engine_version` string that goes into the ingest
/// config signature.
///
/// * Any engine other than [`PADDLE_ONNX_ENGINE`] is identified by its model
///   id alone and yields `ollama/{model}` — no I/O.
/// * `paddle-onnx` yields the asset hash from [`engine_version_for_config`],
///   memoized in [`PADDLE_OCR_VERSION_MEMO`] under the `det|rec|dict`
///   override triple from `config.image.ocr`.
///
/// The lookup and the insert happen under a single lock acquisition, so the
/// assets are hashed at most once per triple even when several threads ask
/// for the same key at the same time.
///
/// This function never fails: if hashing errors, a warning is logged and a
/// path-derived identity (`ppocrv5-mobile-kor-paths:{key}`) is used — and
/// memoized — instead, so the signature stays stable for the rest of the
/// process.
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
    if engine != PADDLE_ONNX_ENGINE {
        // ollama-vision (and any non-paddle engine): the daemon exposes no
        // stable per-model revision, so engine/model is the identity.
        return format!("ollama/{model}");
    }

    let ocr = &config.image.ocr;
    let key = format!(
        "{}|{}|{}",
        ocr.det_model.as_deref().unwrap_or("<bundled>"),
        ocr.rec_model.as_deref().unwrap_or("<bundled>"),
        ocr.dict.as_deref().unwrap_or("<bundled>"),
    );

    let memo = PADDLE_OCR_VERSION_MEMO
        .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
    // Entries are only ever inserted whole, so a poisoned lock still guards a
    // valid map; recover it instead of panicking on every later signature.
    let mut cache = memo
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    // First call for this triple in this process: hash once, while holding
    // the lock so concurrent callers wait for the result rather than each
    // re-reading the model files. In a real ingest the engine was already
    // built (fail-fast), so the assets are present and hashing succeeds; the
    // path-derived fallback only keeps the signature total.
    cache
        .entry(key)
        .or_insert_with_key(|key| {
            engine_version_for_config(config).unwrap_or_else(|e| {
                tracing::warn!(
                    target: "kebab-app::ingest",
                    error = %e,
                    "paddle-onnx engine_version hash failed; using path-derived identity for signature"
                );
                format!("ppocrv5-mobile-kor-paths:{key}")
            })
        })
        .clone()
}
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
// Common (every media type): chunking parameters that move chunk
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
@@ -3033,7 +3131,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
// a stable empty token so re-running the same config skips.
let ocr = &config.image.ocr;
if ocr.enabled {
sig.push_str(&format!("|ocr:1:{}", ocr.model));
// v0.27.0 (T9): engine + engine_version so switching engine
// (ollama-vision ↔ paddle-onnx) OR changing the model/assets
// invalidates downstream chunks (design §9 cascade).
sig.push_str(&format!(
"|ocr:1:{}:{}",
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
));
} else {
sig.push_str("|ocr:0");
}
@@ -3049,9 +3154,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
// (mirrors the ingest gate). `model` only matters when active.
let ocr = &config.pdf.ocr;
if ocr.enabled || ocr.always_on {
// v0.27.0 (T9): engine + engine_version (same cascade rule as
// image OCR above) alongside the enabled/always_on gate.
sig.push_str(&format!(
"|pdfocr:{}:{}:{}",
ocr.enabled, ocr.always_on, ocr.model
"|pdfocr:{}:{}:{}:{}",
ocr.enabled,
ocr.always_on,
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
));
} else {
sig.push_str("|pdfocr:0");
@@ -3816,4 +3926,93 @@ mod ingest_config_signature_tests {
);
}
}
// ── v0.27.0 (T9): engine + engine_version cascade ─────────────────────
/// (a) Two configs that differ only in `image.ocr.engine` (default engine vs
/// `paddle-onnx`, same `model` string) must produce different image
/// signatures.
#[test]
fn image_ocr_engine_switch_invalidates_image() {
    let ollama = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg
    };
    // `model` is left untouched — the engine is the only difference.
    let paddle = {
        let mut cfg = ollama.clone();
        cfg.image.ocr.engine = "paddle-onnx".to_owned();
        cfg
    };

    let sig_ollama = ingest_config_signature(&ollama, &img());
    let sig_paddle = ingest_config_signature(&paddle, &img());
    assert_ne!(
        sig_ollama, sig_paddle,
        "engine switch with identical model must invalidate images"
    );
}
/// (b) Changing the ollama model id changes the `ollama/{model}`
/// engine_version, and therefore the image signature.
#[test]
fn image_ocr_engine_version_change_invalidates_image() {
    let with_model = |model_id: &str| {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.model = model_id.to_owned();
        cfg
    };
    let a = with_model("gemma4:e4b");
    let b = with_model("qwen2.5vl:3b");

    let sig_a = ingest_config_signature(&a, &img());
    let sig_b = ingest_config_signature(&b, &img());
    assert_ne!(
        sig_a, sig_b,
        "engine_version change must invalidate images"
    );
}
/// (b') With the paddle-onnx engine, overriding the detection model path
/// must change the image signature relative to the un-overridden config.
#[test]
fn image_ocr_paddle_model_path_change_invalidates_image() {
    let base = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.engine = "paddle-onnx".to_owned();
        cfg
    };
    let overridden = {
        let mut cfg = base.clone();
        cfg.image.ocr.det_model = Some("/some/other/det.onnx".to_owned());
        cfg
    };

    let sig_base = ingest_config_signature(&base, &img());
    let sig_overridden = ingest_config_signature(&overridden, &img());
    assert_ne!(
        sig_base, sig_overridden,
        "paddle-onnx model path change must invalidate images"
    );
}
/// (c) Settings outside the signature (`search.default_k`,
/// `image.ocr.max_pixels`) must leave the paddle-onnx image signature
/// unchanged.
#[test]
fn paddle_image_signature_stable_for_unrelated_change() {
    let base = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.engine = "paddle-onnx".to_owned();
        cfg
    };
    let other = {
        let mut cfg = base.clone();
        cfg.search.default_k += 3;
        // runtime-only knob
        cfg.image.ocr.max_pixels += 100;
        cfg
    };

    let sig_base = ingest_config_signature(&base, &img());
    let sig_other = ingest_config_signature(&other, &img());
    assert_eq!(
        sig_base, sig_other,
        "unrelated/runtime-only changes must not invalidate paddle images"
    );
}
/// PDF OCR: changing only `pdf.ocr.engine` must change the PDF signature
/// and leave the markdown, image and code signatures untouched.
#[test]
fn pdf_ocr_engine_switch_invalidates_pdf() {
    let ollama = {
        let mut cfg = Config::defaults();
        cfg.pdf.ocr.enabled = true;
        cfg
    };
    let paddle = {
        let mut cfg = ollama.clone();
        cfg.pdf.ocr.engine = "paddle-onnx".to_owned();
        cfg
    };

    let pdf_before = ingest_config_signature(&ollama, &pdf());
    let pdf_after = ingest_config_signature(&paddle, &pdf());
    assert_ne!(pdf_before, pdf_after, "pdf engine switch must invalidate pdf");

    // Every other media type must be blind to the PDF engine.
    for m in [md(), img(), code()] {
        let before = ingest_config_signature(&ollama, &m);
        let after = ingest_config_signature(&paddle, &m);
        assert_eq!(before, after, "pdf engine switch must NOT touch {m:?}");
    }
}
}

View File

@@ -39,6 +39,10 @@ impl OcrEngine for MockOcrEngine {
"mock-v1".to_string()
}
/// Progress-display label for the mock engine: always the constant
/// `"mock-model"`.
fn model(&self) -> &str {
"mock-model"
}
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
if self.fail {
anyhow::bail!("mock failure");

View File

@@ -377,6 +377,36 @@ pub struct OcrCfg {
/// `86400`).
#[serde(default = "default_ocr_request_timeout_secs")]
pub request_timeout_secs: u64,
// ── paddle-onnx engine overrides (v0.27.0) ──────────────────────────
// Only consulted when `engine == "paddle-onnx"`; the ollama-vision
// engine ignores them. All `#[serde(default)]` so pre-v0.27 config
// files load unchanged.
/// Override path to the detection ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/ppocrv5_mobile_det.onnx` (or the directory
/// named by `KEBAB_IMAGE_OCR_MODEL_DIR`).
#[serde(default)]
pub det_model: Option<String>,
/// Override path to the recognition ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/korean_ppocrv5_mobile_rec.onnx`.
#[serde(default)]
pub rec_model: Option<String>,
/// Override path to the character dictionary. `None` → bundled
/// `assets/paddleocr-onnx/korean_dict.txt`.
#[serde(default)]
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
/// probability is below this are dropped. Default `0.3`.
#[serde(default = "default_ocr_score_thresh")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box before crop.
/// Larger = more padding around the text. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
/// past this count are truncated with a warning. Default `1000`.
#[serde(default = "default_ocr_max_boxes")]
pub max_boxes: usize,
}
impl OcrCfg {
@@ -389,10 +419,29 @@ impl OcrCfg {
languages: vec!["eng".to_string(), "kor".to_string()],
max_pixels: 1600,
request_timeout_secs: default_ocr_request_timeout_secs(),
det_model: None,
rec_model: None,
dict: None,
score_thresh: default_ocr_score_thresh(),
unclip_ratio: default_ocr_unclip_ratio(),
max_boxes: default_ocr_max_boxes(),
}
}
}
/// Serde default for [`OcrCfg::score_thresh`], the paddle-onnx DBNet box
/// score threshold: `0.3`.
fn default_ocr_score_thresh() -> f32 {
    const SCORE_THRESH: f32 = 0.3;
    SCORE_THRESH
}
/// Serde default for [`OcrCfg::unclip_ratio`], the paddle-onnx polygon
/// unclip ratio: `1.5`.
fn default_ocr_unclip_ratio() -> f32 {
    const UNCLIP_RATIO: f32 = 1.5;
    UNCLIP_RATIO
}
/// Serde default for [`OcrCfg::max_boxes`], the paddle-onnx per-image box
/// cap: `1000`.
fn default_ocr_max_boxes() -> usize {
    const MAX_BOXES: usize = 1_000;
    MAX_BOXES
}
/// v0.17.2 post-dogfood: matches the legacy hard-coded ceiling so
/// existing configs that omit the field keep behaving identically.
/// Overridable per config / `KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS`.
@@ -1098,6 +1147,34 @@ impl Config {
self.image.ocr.request_timeout_secs = n;
}
}
// paddle-onnx engine overrides (v0.27.0). Empty string → None
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
"KEBAB_IMAGE_OCR_DET_MODEL" => {
self.image.ocr.det_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_REC_MODEL" => {
self.image.ocr.rec_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_DICT" => {
self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.score_thresh = f;
}
}
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.unclip_ratio = f;
}
}
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
if let Ok(n) = v.parse::<usize>() {
self.image.ocr.max_boxes = n;
}
}
// image.caption (P6-3)
"KEBAB_IMAGE_CAPTION_ENABLED" => {

View File

@@ -33,8 +33,8 @@ pub mod ocr;
pub mod paddle_onnx;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE};
pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr};
pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};
use anyhow::{Context, Result};
use kebab_core::{

View File

@@ -65,6 +65,13 @@ pub trait OcrEngine: Send + Sync {
/// through to engines that benefit from it (Tesseract languages,
/// LLM prompt steering); ignore otherwise.
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText>;
/// Human-facing model label for the ingest progress display
/// (`AssetPhase{phase:"ocr", model}`). Distinct from
/// [`engine_version`](Self::engine_version), which is the cache-key
/// hash. E.g. `"gemma4:e4b"` (ollama-vision) or `"ppocrv5-mobile-kor"`
/// (paddle-onnx).
fn model(&self) -> &str;
}
/// Mutate `block.ocr` in place by running `engine` over `image_bytes`,
@@ -209,13 +216,6 @@ impl OllamaVisionOcr {
self.max_pixels
}
/// The Ollama model id this engine drives (e.g. `gemma4:e4b`).
/// Surfaced so the ingest progress display can name the model
/// running a slow OCR phase (`AssetPhase{phase:"ocr", model}`).
pub fn model(&self) -> &str {
&self.model
}
fn build_prompt(&self, lang_hint: Option<&Lang>) -> String {
let langs = if self.languages.is_empty() {
"any".to_string()
@@ -247,6 +247,10 @@ impl OcrEngine for OllamaVisionOcr {
format!("ollama/{}", self.model)
}
/// Progress-display label: the model id stored on this engine (the same
/// value `engine_version` renders as `ollama/{model}`).
fn model(&self) -> &str {
&self.model
}
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText> {
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
.context("preparing image for OCR")?;

View File

@@ -111,6 +111,22 @@ impl ModelPaths {
dict: dir.join("korean_dict.txt"),
}
}
/// Resolve model paths from the `image.ocr` config (T7). Each of
/// `det_model` / `rec_model` / `dict` overrides the corresponding bundled
/// path when set; unset fields fall back to [`from_default_dir`], so a
/// caller can override just one asset.
///
/// An override that is present but empty (e.g. `det_model = ""` in a config
/// file) is treated as unset, matching how the `KEBAB_IMAGE_OCR_*`
/// environment overrides map an empty value to `None`.
///
/// [`from_default_dir`]: ModelPaths::from_default_dir
pub fn from_config(config: &kebab_config::Config) -> Self {
    /// Use `override_path` when it is set and non-empty, else `bundled`.
    fn pick(override_path: Option<&str>, bundled: PathBuf) -> PathBuf {
        override_path
            .filter(|path| !path.is_empty())
            .map_or(bundled, PathBuf::from)
    }

    let defaults = Self::from_default_dir();
    let ocr = &config.image.ocr;
    Self {
        det: pick(ocr.det_model.as_deref(), defaults.det),
        rec: pick(ocr.rec_model.as_deref(), defaults.rec),
        dict: pick(ocr.dict.as_deref(), defaults.dict),
    }
}
}
impl OnnxPaddleOcr {
@@ -119,13 +135,14 @@ impl OnnxPaddleOcr {
/// Construction loads both ONNX sessions and hashes the assets — failures
/// here are fail-fast (matches the Ollama adapter's construction contract).
pub fn new(config: &kebab_config::Config) -> Result<Self> {
let paths = ModelPaths::from_default_dir();
let paths = ModelPaths::from_config(config);
let ocr = &config.image.ocr;
Self::from_paths(
&paths,
0.3,
1.5,
1000,
config.image.ocr.max_pixels,
ocr.score_thresh,
ocr.unclip_ratio,
ocr.max_boxes,
ocr.max_pixels,
)
}
@@ -209,6 +226,12 @@ impl OcrEngine for OnnxPaddleOcr {
self.engine_version.clone()
}
/// Progress-display label for the paddle-onnx engine: always the constant
/// `"ppocrv5-mobile-kor"`, regardless of any model-path overrides.
fn model(&self) -> &str {
// Static label for the progress display; the hash over the model
// assets is exposed separately through `engine_version`.
"ppocrv5-mobile-kor"
}
fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result<OcrText> {
let img = image::load_from_memory(image_bytes)
.context("decoding image for OCR")?
@@ -430,6 +453,15 @@ fn load_dict(path: &Path) -> Result<Vec<String>> {
Ok(lines)
}
/// Compute the paddle-onnx `engine_version` for `config` without building
/// an engine (T9): resolve the asset paths with [`ModelPaths::from_config`]
/// and hash them with `compute_engine_version`, so the ingest config
/// signature can embed the result.
///
/// This reads the model files (~17 MB), so callers MUST memoize the result
/// per (det,rec,dict) triple (m3: never re-hash per asset).
///
/// # Errors
///
/// Propagates any error from hashing the resolved asset paths.
pub fn engine_version_for_config(config: &kebab_config::Config) -> Result<String> {
    let asset_paths = ModelPaths::from_config(config);
    compute_engine_version(&asset_paths)
}
/// blake3 over det + rec + dict bytes → stable `engine_version`.
fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
let mut hasher = blake3::Hasher::new();
@@ -802,6 +834,24 @@ mod tests {
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
}
#[test]
fn model_paths_from_config_uses_overrides() {
    // T7: with nothing overridden, all three paths are the bundled assets.
    let stock_cfg = kebab_config::Config::defaults();
    let bundled = ModelPaths::from_config(&stock_cfg);
    assert!(bundled.det.ends_with("ppocrv5_mobile_det.onnx"), "{:?}", bundled.det);
    assert!(bundled.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", bundled.rec);
    assert!(bundled.dict.ends_with("korean_dict.txt"), "{:?}", bundled.dict);

    // Partial override: det and dict are replaced, rec keeps its bundled path.
    let mut custom_cfg = stock_cfg;
    custom_cfg.image.ocr.det_model = Some("/custom/det.onnx".to_owned());
    custom_cfg.image.ocr.dict = Some("/custom/dict.txt".to_owned());
    let resolved = ModelPaths::from_config(&custom_cfg);
    assert_eq!(resolved.det, PathBuf::from("/custom/det.onnx"));
    assert_eq!(resolved.dict, PathBuf::from("/custom/dict.txt"));
    assert!(resolved.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", resolved.rec);
}
#[test]
fn unclip_expands_box() {
let rect = RotRect {