feat(ocr): T7-T9 — config overrides + engine factory + signature cascade
T7: OcrCfg gains det_model/rec_model/dict overrides + score_thresh/
unclip_ratio/max_boxes (serde default, KEBAB_IMAGE_OCR_* env). OnnxPaddleOcr::new
threads them via ModelPaths::from_config.
T8: build_image_ocr_engine / build_pdf_ocr_engine factories return
Box<dyn OcrEngine>; match on engine string (ollama-vision|paddle-onnx|err).
ImagePipeline.ocr_engine + pdf_ocr_engine signatures switched to &dyn OcrEngine.
OcrEngine gains model() for the progress label.
T9: ingest_config_signature image/pdf branches emit |ocr:1:{engine}:{engine_version}
(memoized blake3 per asset-triple, m3-safe). Unit tests (a)(b)(c) added.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,10 @@ use kebab_core::{
|
||||
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
|
||||
};
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
|
||||
use kebab_parse_image::{
|
||||
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
|
||||
apply_caption, apply_ocr, engine_version_for_config,
|
||||
};
|
||||
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
@@ -357,8 +360,8 @@ pub fn ingest_with_config_opts(
|
||||
// loop is correct and cheap. Construction failure (e.g. invalid
|
||||
// endpoint) aborts ingest fail-fast — better than silently disabling
|
||||
// OCR/caption mid-run.
|
||||
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
|
||||
Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
|
||||
let ocr_engine: Option<Box<dyn OcrEngine>> = if app.config.image.ocr.enabled {
|
||||
Some(build_image_ocr_engine(&app.config).context("kb-app::ingest: build image OCR engine")?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -370,28 +373,17 @@ pub fn ingest_with_config_opts(
|
||||
None
|
||||
};
|
||||
let image_pipeline = ImagePipeline {
|
||||
ocr_engine: ocr_engine.as_ref(),
|
||||
ocr_engine: ocr_engine.as_deref(),
|
||||
caption_llm: caption_llm.as_deref(),
|
||||
};
|
||||
|
||||
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
|
||||
// image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast.
|
||||
let pdf_ocr_engine: Option<OllamaVisionOcr> =
|
||||
let pdf_ocr_engine: Option<Box<dyn OcrEngine>> =
|
||||
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
|
||||
let cfg = &app.config.pdf.ocr;
|
||||
let endpoint = match cfg.endpoint.as_deref() {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => app.config.models.llm.endpoint.clone(),
|
||||
};
|
||||
Some(
|
||||
OllamaVisionOcr::from_parts(
|
||||
endpoint,
|
||||
cfg.model.clone(),
|
||||
cfg.languages.clone(),
|
||||
cfg.max_pixels,
|
||||
cfg.request_timeout_secs,
|
||||
)
|
||||
.context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
|
||||
build_pdf_ocr_engine(&app.config)
|
||||
.context("kb-app::ingest: build pdf OCR engine")?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -488,7 +480,7 @@ pub fn ingest_with_config_opts(
|
||||
&existing_doc_ids,
|
||||
&image_pipeline,
|
||||
force_reingest,
|
||||
pdf_ocr_engine.as_ref(),
|
||||
pdf_ocr_engine.as_deref(),
|
||||
progress,
|
||||
opts.cancel.as_ref(),
|
||||
log_writer.clone(),
|
||||
@@ -832,11 +824,73 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
|
||||
/// `<… as JobRepo>` to be explicit.
|
||||
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// v0.27.0 (T8): build the image OCR engine selected by
|
||||
/// `config.image.ocr.engine`. Returns a boxed trait object so the ingest
|
||||
/// pipeline is engine-agnostic. Construction is fail-fast (model load /
|
||||
/// hash / endpoint validation) — mirrors the prior concrete-type behaviour.
|
||||
///
|
||||
/// `--config` facade: the caller threads the explicit [`kebab_config::Config`]
|
||||
/// in, so `OnnxPaddleOcr::new` honours `image.ocr.{det_model,rec_model,dict,…}`
|
||||
/// overrides resolved from that config (not a re-loaded XDG default).
|
||||
fn build_image_ocr_engine(
|
||||
config: &kebab_config::Config,
|
||||
) -> anyhow::Result<Box<dyn OcrEngine>> {
|
||||
match config.image.ocr.engine.as_str() {
|
||||
OLLAMA_VISION_ENGINE => Ok(Box::new(
|
||||
OllamaVisionOcr::new(config).context("build OllamaVisionOcr")?,
|
||||
)),
|
||||
PADDLE_ONNX_ENGINE => Ok(Box::new(
|
||||
OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr")?,
|
||||
)),
|
||||
other => anyhow::bail!(
|
||||
"unknown image.ocr.engine {other:?}; expected \
|
||||
{OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// v0.27.0 (T8): build the PDF OCR engine selected by
|
||||
/// `config.pdf.ocr.engine`. The ollama-vision arm uses the PDF-specific
|
||||
/// `model` / `languages` / `max_pixels` / `request_timeout_secs` knobs (and
|
||||
/// endpoint fallback to `models.llm.endpoint`). The paddle-onnx arm shares
|
||||
/// the same bundled ONNX models as image OCR (resolved from `image.ocr`
|
||||
/// overrides) — PaddleOCR is page-agnostic and carries no per-engine prompt.
|
||||
fn build_pdf_ocr_engine(
|
||||
config: &kebab_config::Config,
|
||||
) -> anyhow::Result<Box<dyn OcrEngine>> {
|
||||
match config.pdf.ocr.engine.as_str() {
|
||||
OLLAMA_VISION_ENGINE => {
|
||||
let cfg = &config.pdf.ocr;
|
||||
let endpoint = match cfg.endpoint.as_deref() {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => config.models.llm.endpoint.clone(),
|
||||
};
|
||||
Ok(Box::new(
|
||||
OllamaVisionOcr::from_parts(
|
||||
endpoint,
|
||||
cfg.model.clone(),
|
||||
cfg.languages.clone(),
|
||||
cfg.max_pixels,
|
||||
cfg.request_timeout_secs,
|
||||
)
|
||||
.context("build OllamaVisionOcr (pdf)")?,
|
||||
))
|
||||
}
|
||||
PADDLE_ONNX_ENGINE => Ok(Box::new(
|
||||
OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr (pdf)")?,
|
||||
)),
|
||||
other => anyhow::bail!(
|
||||
"unknown pdf.ocr.engine {other:?}; expected \
|
||||
{OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// P6-4: borrowed bundle of the three image-pipeline components built
|
||||
/// once per ingest invocation. Threaded through `ingest_one_asset` so
|
||||
/// the dispatch does not need ten separate parameters.
|
||||
struct ImagePipeline<'a> {
|
||||
ocr_engine: Option<&'a OllamaVisionOcr>,
|
||||
ocr_engine: Option<&'a dyn OcrEngine>,
|
||||
caption_llm: Option<&'a dyn LanguageModel>,
|
||||
}
|
||||
|
||||
@@ -1110,7 +1164,7 @@ fn ingest_one_asset(
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
image_pipeline: &ImagePipeline<'_>,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&OllamaVisionOcr>,
|
||||
pdf_ocr_engine: Option<&dyn OcrEngine>,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
|
||||
@@ -2093,7 +2147,7 @@ fn ingest_one_pdf_asset(
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&OllamaVisionOcr>,
|
||||
pdf_ocr_engine: Option<&dyn OcrEngine>,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
|
||||
@@ -3017,6 +3071,50 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
|
||||
/// The output is purely a comparison token — it is never parsed back, so the
|
||||
/// exact format is internal. Field order is fixed and `Vec`s are joined so
|
||||
/// the same `Config` always yields the same string.
|
||||
/// Process-wide memo of the paddle-onnx `engine_version`, keyed by the
|
||||
/// resolved (det,rec,dict) override triple. Hashing the ~17 MB of model bytes
|
||||
/// happens once per triple per process (m3 — never re-hash per asset); the
|
||||
/// per-asset [`ingest_config_signature`] calls hit this cache.
|
||||
static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
|
||||
std::sync::Mutex<std::collections::HashMap<String, String>>,
|
||||
> = std::sync::OnceLock::new();
|
||||
|
||||
/// T9: resolve the OCR `engine_version` string used inside the ingest config
|
||||
/// signature. ollama-vision is self-describing from `engine/model` (cheap, no
|
||||
/// I/O). paddle-onnx hashes the bundled/override model assets (memoized).
|
||||
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
|
||||
if engine != PADDLE_ONNX_ENGINE {
|
||||
// ollama-vision (and any non-paddle engine): the daemon exposes no
|
||||
// stable per-model revision, so engine/model is the identity.
|
||||
return format!("ollama/{model}");
|
||||
}
|
||||
let ocr = &config.image.ocr;
|
||||
let key = format!(
|
||||
"{}|{}|{}",
|
||||
ocr.det_model.as_deref().unwrap_or("<bundled>"),
|
||||
ocr.rec_model.as_deref().unwrap_or("<bundled>"),
|
||||
ocr.dict.as_deref().unwrap_or("<bundled>"),
|
||||
);
|
||||
let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
|
||||
if let Some(v) = memo.lock().unwrap().get(&key) {
|
||||
return v.clone();
|
||||
}
|
||||
// First call for this triple in this process: hash once. In any real
|
||||
// ingest the engine was already built (fail-fast) so the assets are
|
||||
// present and this succeeds; the path-derived identity below is an
|
||||
// unreachable-in-practice guard that keeps the signature total.
|
||||
let version = engine_version_for_config(config).unwrap_or_else(|e| {
|
||||
tracing::warn!(
|
||||
target: "kebab-app::ingest",
|
||||
error = %e,
|
||||
"paddle-onnx engine_version hash failed; using path-derived identity for signature"
|
||||
);
|
||||
format!("ppocrv5-mobile-kor-paths:{key}")
|
||||
});
|
||||
memo.lock().unwrap().insert(key, version.clone());
|
||||
version
|
||||
}
|
||||
|
||||
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
|
||||
// Common (every media type): chunking parameters that move chunk
|
||||
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
|
||||
@@ -3033,7 +3131,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
// a stable empty token so re-running the same config skips.
|
||||
let ocr = &config.image.ocr;
|
||||
if ocr.enabled {
|
||||
sig.push_str(&format!("|ocr:1:{}", ocr.model));
|
||||
// v0.27.0 (T9): engine + engine_version so switching engine
|
||||
// (ollama-vision ↔ paddle-onnx) OR changing the model/assets
|
||||
// invalidates downstream chunks (design §9 cascade).
|
||||
sig.push_str(&format!(
|
||||
"|ocr:1:{}:{}",
|
||||
ocr.engine,
|
||||
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
||||
));
|
||||
} else {
|
||||
sig.push_str("|ocr:0");
|
||||
}
|
||||
@@ -3049,9 +3154,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
// (mirrors the ingest gate). `model` only matters when active.
|
||||
let ocr = &config.pdf.ocr;
|
||||
if ocr.enabled || ocr.always_on {
|
||||
// v0.27.0 (T9): engine + engine_version (same cascade rule as
|
||||
// image OCR above) alongside the enabled/always_on gate.
|
||||
sig.push_str(&format!(
|
||||
"|pdfocr:{}:{}:{}",
|
||||
ocr.enabled, ocr.always_on, ocr.model
|
||||
"|pdfocr:{}:{}:{}:{}",
|
||||
ocr.enabled,
|
||||
ocr.always_on,
|
||||
ocr.engine,
|
||||
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
||||
));
|
||||
} else {
|
||||
sig.push_str("|pdfocr:0");
|
||||
@@ -3816,4 +3926,93 @@ mod ingest_config_signature_tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ── v0.27.0 (T9): engine + engine_version cascade ─────────────────────
|
||||
|
||||
/// (a) Switching the engine (ollama-vision → paddle-onnx) with the SAME
|
||||
/// model id changes the image signature — different engines produce
|
||||
/// different output even from an identically-named model.
|
||||
#[test]
|
||||
fn image_ocr_engine_switch_invalidates_image() {
|
||||
let mut ollama = Config::defaults();
|
||||
ollama.image.ocr.enabled = true;
|
||||
// same `model` string on both — only the engine differs
|
||||
let mut paddle = ollama.clone();
|
||||
paddle.image.ocr.engine = "paddle-onnx".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&ollama, &img()),
|
||||
ingest_config_signature(&paddle, &img()),
|
||||
"engine switch with identical model must invalidate images"
|
||||
);
|
||||
}
|
||||
|
||||
/// (b) A different engine_version (here: a different ollama model id, which
|
||||
/// the signature folds into `ollama/{model}`) changes the image signature.
|
||||
#[test]
|
||||
fn image_ocr_engine_version_change_invalidates_image() {
|
||||
let mut a = Config::defaults();
|
||||
a.image.ocr.enabled = true;
|
||||
a.image.ocr.model = "gemma4:e4b".to_string();
|
||||
let mut b = a.clone();
|
||||
b.image.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&a, &img()),
|
||||
ingest_config_signature(&b, &img()),
|
||||
"engine_version change must invalidate images"
|
||||
);
|
||||
}
|
||||
|
||||
/// (b') For the paddle-onnx engine, pointing at a different model asset
|
||||
/// (override path) yields a different engine_version → different signature.
|
||||
#[test]
|
||||
fn image_ocr_paddle_model_path_change_invalidates_image() {
|
||||
let mut base = Config::defaults();
|
||||
base.image.ocr.enabled = true;
|
||||
base.image.ocr.engine = "paddle-onnx".to_string();
|
||||
let mut overridden = base.clone();
|
||||
overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
ingest_config_signature(&overridden, &img()),
|
||||
"paddle-onnx model path change must invalidate images"
|
||||
);
|
||||
}
|
||||
|
||||
/// (c) Unrelated settings leave the paddle-onnx image signature stable
|
||||
/// (engine_version is memoized + deterministic for a fixed asset triple).
|
||||
#[test]
|
||||
fn paddle_image_signature_stable_for_unrelated_change() {
|
||||
let mut base = Config::defaults();
|
||||
base.image.ocr.enabled = true;
|
||||
base.image.ocr.engine = "paddle-onnx".to_string();
|
||||
let mut other = base.clone();
|
||||
other.search.default_k += 3;
|
||||
other.image.ocr.max_pixels += 100; // runtime-only knob
|
||||
assert_eq!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
ingest_config_signature(&other, &img()),
|
||||
"unrelated/runtime-only changes must not invalidate paddle images"
|
||||
);
|
||||
}
|
||||
|
||||
/// PDF OCR: engine switch with the same model invalidates pdf only.
|
||||
#[test]
|
||||
fn pdf_ocr_engine_switch_invalidates_pdf() {
|
||||
let mut ollama = Config::defaults();
|
||||
ollama.pdf.ocr.enabled = true;
|
||||
let mut paddle = ollama.clone();
|
||||
paddle.pdf.ocr.engine = "paddle-onnx".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&ollama, &pdf()),
|
||||
ingest_config_signature(&paddle, &pdf()),
|
||||
"pdf engine switch must invalidate pdf"
|
||||
);
|
||||
for m in [md(), img(), code()] {
|
||||
assert_eq!(
|
||||
ingest_config_signature(&ollama, &m),
|
||||
ingest_config_signature(&paddle, &m),
|
||||
"pdf engine switch must NOT touch {m:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,6 +39,10 @@ impl OcrEngine for MockOcrEngine {
|
||||
"mock-v1".to_string()
|
||||
}
|
||||
|
||||
fn model(&self) -> &str {
|
||||
"mock-model"
|
||||
}
|
||||
|
||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||
if self.fail {
|
||||
anyhow::bail!("mock failure");
|
||||
|
||||
@@ -377,6 +377,36 @@ pub struct OcrCfg {
|
||||
/// `86400`).
|
||||
#[serde(default = "default_ocr_request_timeout_secs")]
|
||||
pub request_timeout_secs: u64,
|
||||
|
||||
// ── paddle-onnx engine overrides (v0.27.0) ──────────────────────────
|
||||
// Only consulted when `engine == "paddle-onnx"`; the ollama-vision
|
||||
// engine ignores them. All `#[serde(default)]` so pre-v0.27 config
|
||||
// files load unchanged.
|
||||
/// Override path to the detection ONNX model. `None` → bundled
|
||||
/// `assets/paddleocr-onnx/ppocrv5_mobile_det.onnx` (or the directory
|
||||
/// named by `KEBAB_IMAGE_OCR_MODEL_DIR`).
|
||||
#[serde(default)]
|
||||
pub det_model: Option<String>,
|
||||
/// Override path to the recognition ONNX model. `None` → bundled
|
||||
/// `assets/paddleocr-onnx/korean_ppocrv5_mobile_rec.onnx`.
|
||||
#[serde(default)]
|
||||
pub rec_model: Option<String>,
|
||||
/// Override path to the character dictionary. `None` → bundled
|
||||
/// `assets/paddleocr-onnx/korean_dict.txt`.
|
||||
#[serde(default)]
|
||||
pub dict: Option<String>,
|
||||
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
|
||||
/// probability is below this are dropped. Default `0.3`.
|
||||
#[serde(default = "default_ocr_score_thresh")]
|
||||
pub score_thresh: f32,
|
||||
/// Polygon unclip ratio applied to each detected box before crop.
|
||||
/// Larger = more padding around the text. Default `1.5`.
|
||||
#[serde(default = "default_ocr_unclip_ratio")]
|
||||
pub unclip_ratio: f32,
|
||||
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
|
||||
/// past this count are truncated with a warning. Default `1000`.
|
||||
#[serde(default = "default_ocr_max_boxes")]
|
||||
pub max_boxes: usize,
|
||||
}
|
||||
|
||||
impl OcrCfg {
|
||||
@@ -389,10 +419,29 @@ impl OcrCfg {
|
||||
languages: vec!["eng".to_string(), "kor".to_string()],
|
||||
max_pixels: 1600,
|
||||
request_timeout_secs: default_ocr_request_timeout_secs(),
|
||||
det_model: None,
|
||||
rec_model: None,
|
||||
dict: None,
|
||||
score_thresh: default_ocr_score_thresh(),
|
||||
unclip_ratio: default_ocr_unclip_ratio(),
|
||||
max_boxes: default_ocr_max_boxes(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// paddle-onnx DBNet box score threshold default. See [`OcrCfg::score_thresh`].
|
||||
fn default_ocr_score_thresh() -> f32 {
|
||||
0.3
|
||||
}
|
||||
/// paddle-onnx unclip ratio default. See [`OcrCfg::unclip_ratio`].
|
||||
fn default_ocr_unclip_ratio() -> f32 {
|
||||
1.5
|
||||
}
|
||||
/// paddle-onnx box-count cap default. See [`OcrCfg::max_boxes`].
|
||||
fn default_ocr_max_boxes() -> usize {
|
||||
1000
|
||||
}
|
||||
|
||||
/// v0.17.2 post-dogfood: matches the legacy hard-coded ceiling so
|
||||
/// existing configs that omit the field keep behaving identically.
|
||||
/// Overridable per config / `KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS`.
|
||||
@@ -1098,6 +1147,34 @@ impl Config {
|
||||
self.image.ocr.request_timeout_secs = n;
|
||||
}
|
||||
}
|
||||
// paddle-onnx engine overrides (v0.27.0). Empty string → None
|
||||
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
|
||||
"KEBAB_IMAGE_OCR_DET_MODEL" => {
|
||||
self.image.ocr.det_model =
|
||||
if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_REC_MODEL" => {
|
||||
self.image.ocr.rec_model =
|
||||
if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_DICT" => {
|
||||
self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
|
||||
if let Ok(f) = v.parse::<f32>() {
|
||||
self.image.ocr.score_thresh = f;
|
||||
}
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
|
||||
if let Ok(f) = v.parse::<f32>() {
|
||||
self.image.ocr.unclip_ratio = f;
|
||||
}
|
||||
}
|
||||
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.image.ocr.max_boxes = n;
|
||||
}
|
||||
}
|
||||
|
||||
// image.caption (P6-3)
|
||||
"KEBAB_IMAGE_CAPTION_ENABLED" => {
|
||||
|
||||
@@ -33,8 +33,8 @@ pub mod ocr;
|
||||
pub mod paddle_onnx;
|
||||
|
||||
pub use caption::{apply_caption, caption_image};
|
||||
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
|
||||
pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE};
|
||||
pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr};
|
||||
pub use paddle_onnx::{OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
|
||||
@@ -65,6 +65,13 @@ pub trait OcrEngine: Send + Sync {
|
||||
/// through to engines that benefit from it (Tesseract languages,
|
||||
/// LLM prompt steering); ignore otherwise.
|
||||
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText>;
|
||||
|
||||
/// Human-facing model label for the ingest progress display
|
||||
/// (`AssetPhase{phase:"ocr", model}`). Distinct from
|
||||
/// [`engine_version`](Self::engine_version), which is the cache-key
|
||||
/// hash. E.g. `"gemma4:e4b"` (ollama-vision) or `"ppocrv5-mobile-kor"`
|
||||
/// (paddle-onnx).
|
||||
fn model(&self) -> &str;
|
||||
}
|
||||
|
||||
/// Mutate `block.ocr` in place by running `engine` over `image_bytes`,
|
||||
@@ -209,13 +216,6 @@ impl OllamaVisionOcr {
|
||||
self.max_pixels
|
||||
}
|
||||
|
||||
/// The Ollama model id this engine drives (e.g. `gemma4:e4b`).
|
||||
/// Surfaced so the ingest progress display can name the model
|
||||
/// running a slow OCR phase (`AssetPhase{phase:"ocr", model}`).
|
||||
pub fn model(&self) -> &str {
|
||||
&self.model
|
||||
}
|
||||
|
||||
fn build_prompt(&self, lang_hint: Option<&Lang>) -> String {
|
||||
let langs = if self.languages.is_empty() {
|
||||
"any".to_string()
|
||||
@@ -247,6 +247,10 @@ impl OcrEngine for OllamaVisionOcr {
|
||||
format!("ollama/{}", self.model)
|
||||
}
|
||||
|
||||
fn model(&self) -> &str {
|
||||
&self.model
|
||||
}
|
||||
|
||||
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText> {
|
||||
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
|
||||
.context("preparing image for OCR")?;
|
||||
|
||||
@@ -111,6 +111,22 @@ impl ModelPaths {
|
||||
dict: dir.join("korean_dict.txt"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve model paths from the `image.ocr` config (T7). Each of
|
||||
/// `det_model` / `rec_model` / `dict` overrides the corresponding bundled
|
||||
/// path when set; unset fields fall back to [`from_default_dir`], so a
|
||||
/// caller can override just one asset.
|
||||
///
|
||||
/// [`from_default_dir`]: ModelPaths::from_default_dir
|
||||
pub fn from_config(config: &kebab_config::Config) -> Self {
|
||||
let defaults = Self::from_default_dir();
|
||||
let ocr = &config.image.ocr;
|
||||
Self {
|
||||
det: ocr.det_model.as_ref().map(PathBuf::from).unwrap_or(defaults.det),
|
||||
rec: ocr.rec_model.as_ref().map(PathBuf::from).unwrap_or(defaults.rec),
|
||||
dict: ocr.dict.as_ref().map(PathBuf::from).unwrap_or(defaults.dict),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OnnxPaddleOcr {
|
||||
@@ -119,13 +135,14 @@ impl OnnxPaddleOcr {
|
||||
/// Construction loads both ONNX sessions and hashes the assets — failures
|
||||
/// here are fail-fast (matches the Ollama adapter's construction contract).
|
||||
pub fn new(config: &kebab_config::Config) -> Result<Self> {
|
||||
let paths = ModelPaths::from_default_dir();
|
||||
let paths = ModelPaths::from_config(config);
|
||||
let ocr = &config.image.ocr;
|
||||
Self::from_paths(
|
||||
&paths,
|
||||
0.3,
|
||||
1.5,
|
||||
1000,
|
||||
config.image.ocr.max_pixels,
|
||||
ocr.score_thresh,
|
||||
ocr.unclip_ratio,
|
||||
ocr.max_boxes,
|
||||
ocr.max_pixels,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -209,6 +226,12 @@ impl OcrEngine for OnnxPaddleOcr {
|
||||
self.engine_version.clone()
|
||||
}
|
||||
|
||||
fn model(&self) -> &str {
|
||||
// Static label for the progress display; the per-asset hash lives
|
||||
// in `engine_version`.
|
||||
"ppocrv5-mobile-kor"
|
||||
}
|
||||
|
||||
fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result<OcrText> {
|
||||
let img = image::load_from_memory(image_bytes)
|
||||
.context("decoding image for OCR")?
|
||||
@@ -430,6 +453,15 @@ fn load_dict(path: &Path) -> Result<Vec<String>> {
|
||||
Ok(lines)
|
||||
}
|
||||
|
||||
/// Resolve the paddle-onnx `engine_version` for `config` without loading the
|
||||
/// ONNX sessions (T9). This is the same blake3-over-assets string that a
|
||||
/// constructed [`OnnxPaddleOcr`] exposes via [`OcrEngine::engine_version`], so
|
||||
/// the ingest config signature can include it. Reads ~17 MB of model bytes —
|
||||
/// callers MUST memoize per (det,rec,dict) triple (m3: never re-hash per asset).
|
||||
pub fn engine_version_for_config(config: &kebab_config::Config) -> Result<String> {
|
||||
compute_engine_version(&ModelPaths::from_config(config))
|
||||
}
|
||||
|
||||
/// blake3 over det + rec + dict bytes → stable `engine_version`.
|
||||
fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
@@ -802,6 +834,24 @@ mod tests {
|
||||
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn model_paths_from_config_uses_overrides() {
|
||||
// T7: unset overrides → bundled default asset paths.
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
let def = ModelPaths::from_config(&cfg);
|
||||
assert!(def.det.ends_with("ppocrv5_mobile_det.onnx"), "{:?}", def.det);
|
||||
assert!(def.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", def.rec);
|
||||
assert!(def.dict.ends_with("korean_dict.txt"), "{:?}", def.dict);
|
||||
|
||||
// Override det + dict; rec stays bundled (partial override allowed).
|
||||
cfg.image.ocr.det_model = Some("/custom/det.onnx".to_string());
|
||||
cfg.image.ocr.dict = Some("/custom/dict.txt".to_string());
|
||||
let ov = ModelPaths::from_config(&cfg);
|
||||
assert_eq!(ov.det, PathBuf::from("/custom/det.onnx"));
|
||||
assert_eq!(ov.dict, PathBuf::from("/custom/dict.txt"));
|
||||
assert!(ov.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", ov.rec);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unclip_expands_box() {
|
||||
let rect = RotRect {
|
||||
|
||||
Reference in New Issue
Block a user