feat(ocr): PP-OCRv5 ONNX Rust 네이티브 OCR 엔진 #206

Merged
altair823 merged 8 commits from feat/rust-native-ocr into main 2026-06-04 09:24:41 +00:00
24 changed files with 14500 additions and 62 deletions

6
.gitattributes vendored Normal file
View File

@@ -0,0 +1,6 @@
# PP-OCRv5 ONNX OCR models (paddle-onnx engine). git-lfs is not installed on
# this host, so they are committed as plain binary blobs. `-text` only disables
# end-of-line normalization (the bytes are stored verbatim); use the `binary`
# macro (`-diff -merge -text`) if textual diff/merge must be disabled explicitly.
# If/when git-lfs becomes available, migrate with
# `git lfs migrate import --include='*.onnx'` and restore the filter line:
# *.onnx filter=lfs diff=lfs merge=lfs -text
*.onnx -text

126
Cargo.lock generated
View File

@@ -4417,6 +4417,24 @@ dependencies = [
"quick-error 2.0.1",
]
[[package]]
name = "imageproc"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "602b4e8a4cc3e98372b766cd184ab532999bc0e839b7469e759511ccabc65d77"
dependencies = [
"ab_glyph",
"approx",
"getrandom 0.2.17",
"image",
"itertools 0.12.1",
"nalgebra",
"num",
"rand 0.8.6",
"rand_distr 0.4.3",
"rayon",
]
[[package]]
name = "imgref"
version = "1.12.1"
@@ -4548,6 +4566,15 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
@@ -4724,7 +4751,7 @@ dependencies = [
[[package]]
name = "kebab-app"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -4772,7 +4799,7 @@ dependencies = [
[[package]]
name = "kebab-chunk"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -4790,7 +4817,7 @@ dependencies = [
[[package]]
name = "kebab-cli"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"clap",
@@ -4811,7 +4838,7 @@ dependencies = [
[[package]]
name = "kebab-config"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"dirs 5.0.1",
@@ -4827,7 +4854,7 @@ dependencies = [
[[package]]
name = "kebab-core"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -4841,7 +4868,7 @@ dependencies = [
[[package]]
name = "kebab-embed"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -4855,7 +4882,7 @@ dependencies = [
[[package]]
name = "kebab-embed-candle"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"candle-core",
@@ -4875,7 +4902,7 @@ dependencies = [
[[package]]
name = "kebab-embed-local"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"fastembed",
@@ -4888,7 +4915,7 @@ dependencies = [
[[package]]
name = "kebab-embed-ollama"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-config",
@@ -4903,7 +4930,7 @@ dependencies = [
[[package]]
name = "kebab-eval"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4922,7 +4949,7 @@ dependencies = [
[[package]]
name = "kebab-llm"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -4931,7 +4958,7 @@ dependencies = [
[[package]]
name = "kebab-llm-local"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-config",
@@ -4948,7 +4975,7 @@ dependencies = [
[[package]]
name = "kebab-mcp"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4966,7 +4993,7 @@ dependencies = [
[[package]]
name = "kebab-nli"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"hf-hub",
@@ -4981,7 +5008,7 @@ dependencies = [
[[package]]
name = "kebab-parse-code"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"gix",
@@ -5004,22 +5031,26 @@ dependencies = [
[[package]]
name = "kebab-parse-image"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"ab_glyph",
"anyhow",
"base64 0.22.1",
"blake3",
"image",
"imageproc",
"kamadak-exif",
"kebab-config",
"kebab-core",
"kebab-llm",
"kebab-llm-local",
"ndarray",
"ort",
"reqwest 0.12.28",
"serde",
"serde_json",
"tempfile",
"thiserror 2.0.18",
"time",
"tokio",
"tracing",
@@ -5028,7 +5059,7 @@ dependencies = [
[[package]]
name = "kebab-parse-md"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -5045,7 +5076,7 @@ dependencies = [
[[package]]
name = "kebab-parse-pdf"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -5060,7 +5091,7 @@ dependencies = [
[[package]]
name = "kebab-rag"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -5082,7 +5113,7 @@ dependencies = [
[[package]]
name = "kebab-search"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"globset",
@@ -5101,7 +5132,7 @@ dependencies = [
[[package]]
name = "kebab-source-fs"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -5119,7 +5150,7 @@ dependencies = [
[[package]]
name = "kebab-store-sqlite"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"blake3",
@@ -5139,7 +5170,7 @@ dependencies = [
[[package]]
name = "kebab-store-vector"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"arrow",
@@ -5163,7 +5194,7 @@ dependencies = [
[[package]]
name = "kebab-tui"
version = "0.26.2"
version = "0.27.0"
dependencies = [
"anyhow",
"crossterm",
@@ -6423,6 +6454,21 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af"
[[package]]
name = "nalgebra"
version = "0.32.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4"
dependencies = [
"approx",
"matrixmultiply",
"num-complex",
"num-rational",
"num-traits",
"simba",
"typenum",
]
[[package]]
name = "native-tls"
version = "0.2.18"
@@ -8238,6 +8284,15 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
[[package]]
name = "safe_arch"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323"
dependencies = [
"bytemuck",
]
[[package]]
name = "safetensors"
version = "0.4.5"
@@ -8615,6 +8670,19 @@ dependencies = [
"libc",
]
[[package]]
name = "simba"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae"
dependencies = [
"approx",
"num-complex",
"num-traits",
"paste",
"wide",
]
[[package]]
name = "simd-adler32"
version = "0.3.9"
@@ -10220,6 +10288,16 @@ version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
[[package]]
name = "wide"
version = "0.7.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ce5da8ecb62bcd8ec8b7ea19f69a51275e91299be594ea5cc6ef7819e16cd03"
dependencies = [
"bytemuck",
"safe_arch",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@@ -32,7 +32,7 @@ edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
version = "0.26.2" # v0.26.2 — ingest 설정 변경 시 영향 자산 자동 재색인: ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/[ingest.code])의 결정적 서명을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 해당 설정 변경 시 `--force-reingest` 없이 영향 자산만 자동 재색인. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout 등)은 제외(과도 무효화 회피). doc_id 는 base parser_version 으로 안정 유지(orphan churn 회피). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정) → patch. — CLAUDE.md §Release
version = "0.27.0" # v0.27.0 — PP-OCRv5 ONNX Rust 네이티브 OCR 엔진: `[image.ocr] engine = "paddle-onnx"` (default 여전히 "ollama-vision") 로 in-process 검출+인식(`ort` =2.0.0-rc.9, Python 런타임 0). DBNet det + CTC rec, 후처리(min-area rect/unclip)는 pure-Rust. e2e CER 0.005(synthetic 한/영, PoC 0.024 대비 우수), 큰 페이지 CPU <4초(Ollama vision ~50초 대비). 신규 config `det_model`/`rec_model`/`dict`/`score_thresh`/`unclip_ratio`/`max_boxes` + `KEBAB_IMAGE_OCR_*` env. ingest 서명 `|ocr:1:{engine}:{engine_version}` 로 engine/모델 변경 시 자동 재색인. 신규 인터페이스(engine 값/config 키) → minor. — CLAUDE.md §Release
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
# intentional allow-list. The allowed lints are either cosmetic (doc style),

View File

@@ -35,6 +35,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
- **2026-06-04 PP-OCRv5 ONNX Rust 네이티브 OCR** — v0.27.0. `[image.ocr] engine = "paddle-onnx"` 로 PP-OCRv5(검출+인식) ONNX 를 in-process(`ort` =2.0.0-rc.9) 실행 — Python 런타임/원격 호출 없이 큰 페이지 CPU <4초(Ollama vision ~50초 대비). default 는 여전히 `"ollama-vision"`. 후처리(min-area rect/unclip)는 pure-Rust. **함정**: unclip 은 corner 를 centroid 에서 방사 확장하면 안 되고 edge 별 polygon offset 이어야 함(방사 확장 시 wide/short 텍스트 박스 높이가 안 커져 글자 윗부분 잘림 → ㄷ→ㄴ, e2e CER 0.26). 수정 후 CER 0.005. 모델 ONNX 는 `crates/kebab-parse-image/assets/paddleocr-onnx/`(git-lfs 미사용 — 일반 binary blob 으로 커밋, `.gitattributes` 참고). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-04 PP-OCRv5 ONNX), spec/plan `docs/superpowers/{specs,plans}/2026-06-04-rust-native-ocr-*.md`.
- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`.
- **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`.
- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.

View File

@@ -184,7 +184,8 @@ nli_threshold = 0.0 # >0 (예: 0.5) 면 mDeBERTa XNLI groundedn
- **파생물 캐시** — embedding 결과를 내용 해시로 자동 캐싱한다 (위 「핵심 기능」 참고). 설정 항목 없음.
- **`[ingest.code]`** — code ingest 의 skip 정책 (`skip_generated_header`, `max_file_bytes`, `extra_skip_globs`). `.gitignore` 자동 honor, `.kebabignore` 는 추가 layer.
- **`[pdf.ocr]`** — scanned PDF 의 page-단위 OCR (default off / opt-in, page 당 ~수십 초 cost). 활성화 후 v0.19 시절 색인분은 `kebab ingest --force-reingest` 로 재처리.
- **`[image.ocr]`** — 이미지 OCR (default off / opt-in). `engine` 으로 백엔드 선택: `"ollama-vision"` (default, 원격 vision LM) 또는 `"paddle-onnx"` (v0.27.0 신규 — PP-OCRv5 ONNX 를 in-process 로 실행, Python 런타임 불필요, 큰 페이지 CPU <4초, 오프라인). `paddle-onnx` 는 워크스페이스에 번들된 모델을 쓰며 `det_model`/`rec_model`/`dict` 로 경로 override, `score_thresh`(0.3)/`unclip_ratio`(1.5)/`max_boxes`(1000) 로 검출 튜닝 가능 (`KEBAB_IMAGE_OCR_*` env 동일 지원). engine 또는 모델을 바꾸면 영향 이미지가 자동 재색인된다.
- **`[pdf.ocr]`** — scanned PDF 의 page-단위 OCR (default off / opt-in, page 당 ~수십 초 cost). `engine` 은 `[image.ocr]` 과 동일하게 `"ollama-vision"`/`"paddle-onnx"` 선택. 활성화 후 v0.19 시절 색인분은 `kebab ingest --force-reingest` 로 재처리.
- **`--config <path>`** — 임시 워크스페이스 / 격리 테스트용 (CLI · TUI 모두 honor).
- **`kebab config migrate`** — 새 버전에서 추가된 config 섹션을 기존 `config.toml` 에 설명 주석과 함께 채워 넣는다 (사용자가 손본 값·주석·순서는 보존, 멱등, 변경 시 자동 `.bak` 백업). `--dry-run` 으로 변경 미리보기. `kebab doctor` 가 갱신 필요 시 안내한다. `kebab init` 으로 새로 생성되는 config.toml 도 섹션별 주석을 포함한다.
- **`KEBAB_*` env** — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN` 등).

View File

@@ -52,7 +52,10 @@ use kebab_core::{
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_image::{
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
apply_caption, apply_ocr, engine_version_for_config,
};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -357,8 +360,8 @@ pub fn ingest_with_config_opts(
// loop is correct and cheap. Construction failure (e.g. invalid
// endpoint) aborts ingest fail-fast — better than silently disabling
// OCR/caption mid-run.
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
let ocr_engine: Option<Box<dyn OcrEngine>> = if app.config.image.ocr.enabled {
Some(build_image_ocr_engine(&app.config).context("kb-app::ingest: build image OCR engine")?)
} else {
None
};
@@ -370,28 +373,17 @@ pub fn ingest_with_config_opts(
None
};
let image_pipeline = ImagePipeline {
ocr_engine: ocr_engine.as_ref(),
ocr_engine: ocr_engine.as_deref(),
caption_llm: caption_llm.as_deref(),
};
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
// image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast.
let pdf_ocr_engine: Option<OllamaVisionOcr> =
let pdf_ocr_engine: Option<Box<dyn OcrEngine>> =
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
let cfg = &app.config.pdf.ocr;
let endpoint = match cfg.endpoint.as_deref() {
Some(s) if !s.is_empty() => s.to_string(),
_ => app.config.models.llm.endpoint.clone(),
};
Some(
OllamaVisionOcr::from_parts(
endpoint,
cfg.model.clone(),
cfg.languages.clone(),
cfg.max_pixels,
cfg.request_timeout_secs,
)
.context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
build_pdf_ocr_engine(&app.config)
.context("kb-app::ingest: build pdf OCR engine")?,
)
} else {
None
@@ -488,7 +480,7 @@ pub fn ingest_with_config_opts(
&existing_doc_ids,
&image_pipeline,
force_reingest,
pdf_ocr_engine.as_ref(),
pdf_ocr_engine.as_deref(),
progress,
opts.cancel.as_ref(),
log_writer.clone(),
@@ -832,11 +824,84 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
/// `<… as JobRepo>` to be explicit.
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// v0.27.0 (T8): build the image OCR engine selected by
/// `config.image.ocr.engine`. Returns a boxed trait object so the ingest
/// pipeline is engine-agnostic. Construction is fail-fast (model load /
/// hash / endpoint validation) — mirrors the prior concrete-type behaviour.
///
/// `--config` facade: the caller threads the explicit [`kebab_config::Config`]
/// in, so `OnnxPaddleOcr::new` honours `image.ocr.{det_model,rec_model,dict,…}`
/// overrides resolved from that config (not a re-loaded XDG default).
fn build_image_ocr_engine(
config: &kebab_config::Config,
) -> anyhow::Result<Box<dyn OcrEngine>> {
match config.image.ocr.engine.as_str() {
OLLAMA_VISION_ENGINE => Ok(Box::new(
OllamaVisionOcr::new(config).context("build OllamaVisionOcr")?,
)),
PADDLE_ONNX_ENGINE => Ok(Box::new(
OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr")?,
)),
other => anyhow::bail!(
"unknown image.ocr.engine {other:?}; expected \
{OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
),
}
}
/// v0.27.0 (T8): build the PDF OCR engine selected by
/// `config.pdf.ocr.engine`. The ollama-vision arm uses the PDF-specific
/// `model` / `languages` / `max_pixels` / `request_timeout_secs` knobs (and
/// endpoint fallback to `models.llm.endpoint`). The paddle-onnx arm shares
/// the same bundled ONNX models as image OCR (resolved from `image.ocr`
/// overrides) — PaddleOCR is page-agnostic and carries no per-engine prompt.
///
/// # Paddle-ONNX asymmetry
///
/// When `pdf.ocr.engine = "paddle-onnx"`, the model paths and tuning knobs
/// (`det_model`, `rec_model`, `dict`, `score_thresh`, `unclip_ratio`,
/// `max_boxes`, `max_pixels`) are read from **`[image.ocr]`**, not
/// `[pdf.ocr]`. PaddleOCR has no PDF-specific prompt or page-level config;
/// `[pdf.ocr]` fields other than `engine` / `enabled` / `always_on` /
/// `valid_ratio_threshold` / `min_char_count` / `lang_hint` are effectively
/// ignored for the paddle path. This asymmetry is intentional — one set of
/// tuned ONNX knobs serves both image and PDF pages.
fn build_pdf_ocr_engine(
config: &kebab_config::Config,
) -> anyhow::Result<Box<dyn OcrEngine>> {
match config.pdf.ocr.engine.as_str() {
OLLAMA_VISION_ENGINE => {
let cfg = &config.pdf.ocr;
let endpoint = match cfg.endpoint.as_deref() {
Some(s) if !s.is_empty() => s.to_string(),
_ => config.models.llm.endpoint.clone(),
};
Ok(Box::new(
OllamaVisionOcr::from_parts(
endpoint,
cfg.model.clone(),
cfg.languages.clone(),
cfg.max_pixels,
cfg.request_timeout_secs,
)
.context("build OllamaVisionOcr (pdf)")?,
))
}
PADDLE_ONNX_ENGINE => Ok(Box::new(
OnnxPaddleOcr::new(config).context("build OnnxPaddleOcr (pdf)")?,
)),
other => anyhow::bail!(
"unknown pdf.ocr.engine {other:?}; expected \
{OLLAMA_VISION_ENGINE:?} or {PADDLE_ONNX_ENGINE:?}"
),
}
}
/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
struct ImagePipeline<'a> {
ocr_engine: Option<&'a OllamaVisionOcr>,
ocr_engine: Option<&'a dyn OcrEngine>,
caption_llm: Option<&'a dyn LanguageModel>,
}
@@ -1110,7 +1175,7 @@ fn ingest_one_asset(
existing_doc_ids: &std::collections::HashSet<String>,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
@@ -2093,7 +2158,7 @@ fn ingest_one_pdf_asset(
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
pdf_ocr_engine: Option<&dyn OcrEngine>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
@@ -3017,6 +3082,50 @@ fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
/// The output is purely a comparison token — it is never parsed back, so the
/// exact format is internal. Field order is fixed and `Vec`s are joined so
/// the same `Config` always yields the same string.
// NOTE(review): the `///` lines directly above this item look like the tail of
// the `ingest_config_signature` doc comment. This static and the helper that
// follows it appear to have been inserted between that doc comment and its
// function, so rustdoc would attach those lines here — consider moving both
// items above the doc comment. TODO confirm against the full file.
/// Process-wide memo of the paddle-onnx `engine_version`, keyed by the
/// resolved (det,rec,dict) override triple. Hashing the ~17 MB of model bytes
/// happens once per triple per process (m3 — never re-hash per asset); the
/// per-asset [`ingest_config_signature`] calls hit this cache.
///
/// Lazily initialised on first use; the map is guarded by a `Mutex` so any
/// thread computing a signature can read or insert.
static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
std::sync::Mutex<std::collections::HashMap<String, String>>,
> = std::sync::OnceLock::new();
/// T9: produce the OCR `engine_version` token folded into the ingest config
/// signature.
///
/// * Any engine other than [`PADDLE_ONNX_ENGINE`] yields `ollama/{model}` —
///   cheap, no I/O; the daemon exposes no stable per-model revision, so the
///   engine/model pair is the identity.
/// * paddle-onnx asks [`engine_version_for_config`] for a version derived from
///   the model assets and memoizes it in [`PADDLE_OCR_VERSION_MEMO`], keyed by
///   the `image.ocr` (det, rec, dict) override triple (`<bundled>` standing in
///   for an unset override).
///
/// The function is total: if the version lookup fails, a warning is logged and
/// a path-derived identity is returned (and memoized) instead.
///
/// # Panics
///
/// Panics if the memo mutex is poisoned.
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
    if engine != PADDLE_ONNX_ENGINE {
        return format!("ollama/{model}");
    }

    // Memo key: the three override paths joined with `|`.
    let paddle_cfg = &config.image.ocr;
    let key = [&paddle_cfg.det_model, &paddle_cfg.rec_model, &paddle_cfg.dict]
        .map(|path| path.as_deref().unwrap_or("<bundled>"))
        .join("|");

    let cache = PADDLE_OCR_VERSION_MEMO
        .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));

    // The guard is released at the end of this statement, before any hashing.
    let cached = cache.lock().unwrap().get(&key).cloned();
    if let Some(version) = cached {
        return version;
    }

    // First request for this triple in this process: hash once. In a real
    // ingest the engine was already built (fail-fast), so the assets exist and
    // this succeeds; the fallback only keeps the signature total.
    let version = match engine_version_for_config(config) {
        Ok(hashed) => hashed,
        Err(e) => {
            tracing::warn!(
                target: "kebab-app::ingest",
                error = %e,
                "paddle-onnx engine_version hash failed; using path-derived identity for signature"
            );
            format!("ppocrv5-mobile-kor-paths:{key}")
        }
    };

    cache.lock().unwrap().insert(key, version.clone());
    version
}
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
// Common (every media type): chunking parameters that move chunk
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
@@ -3033,7 +3142,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
// a stable empty token so re-running the same config skips.
let ocr = &config.image.ocr;
if ocr.enabled {
sig.push_str(&format!("|ocr:1:{}", ocr.model));
// v0.27.0 (T9): engine + engine_version so switching engine
// (ollama-vision ↔ paddle-onnx) OR changing the model/assets
// invalidates downstream chunks (design §9 cascade).
sig.push_str(&format!(
"|ocr:1:{}:{}",
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
));
} else {
sig.push_str("|ocr:0");
}
@@ -3049,9 +3165,14 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
// (mirrors the ingest gate). `model` only matters when active.
let ocr = &config.pdf.ocr;
if ocr.enabled || ocr.always_on {
// v0.27.0 (T9): engine + engine_version (same cascade rule as
// image OCR above) alongside the enabled/always_on gate.
sig.push_str(&format!(
"|pdfocr:{}:{}:{}",
ocr.enabled, ocr.always_on, ocr.model
"|pdfocr:{}:{}:{}:{}",
ocr.enabled,
ocr.always_on,
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
));
} else {
sig.push_str("|pdfocr:0");
@@ -3816,4 +3937,93 @@ mod ingest_config_signature_tests {
);
}
}
// ── v0.27.0 (T9): engine + engine_version cascade ─────────────────────
/// (a) Flipping only the engine (ollama-vision → paddle-onnx) while the
/// `model` id stays the same must yield a different image signature.
#[test]
fn image_ocr_engine_switch_invalidates_image() {
    let ollama = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg
    };
    // Identical `model` string on both sides; `engine` is the only delta.
    let paddle = {
        let mut cfg = ollama.clone();
        cfg.image.ocr.engine = String::from("paddle-onnx");
        cfg
    };

    let sig_ollama = ingest_config_signature(&ollama, &img());
    let sig_paddle = ingest_config_signature(&paddle, &img());
    assert_ne!(
        sig_ollama, sig_paddle,
        "engine switch with identical model must invalidate images"
    );
}
/// (b) Changing the engine_version — here via a different ollama model id,
/// which the signature folds in as `ollama/{model}` — must yield a different
/// image signature.
#[test]
fn image_ocr_engine_version_change_invalidates_image() {
    let with_model = |model: &str| {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.model = model.to_string();
        cfg
    };
    let a = with_model("gemma4:e4b");
    let b = with_model("qwen2.5vl:3b");

    let sig_a = ingest_config_signature(&a, &img());
    let sig_b = ingest_config_signature(&b, &img());
    assert_ne!(
        sig_a, sig_b,
        "engine_version change must invalidate images"
    );
}
/// (b') With the paddle-onnx engine, overriding a model asset path must give
/// a different engine_version and therefore a different image signature.
#[test]
fn image_ocr_paddle_model_path_change_invalidates_image() {
    let base = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.engine = String::from("paddle-onnx");
        cfg
    };
    let overridden = {
        let mut cfg = base.clone();
        cfg.image.ocr.det_model = Some(String::from("/some/other/det.onnx"));
        cfg
    };

    let sig_base = ingest_config_signature(&base, &img());
    let sig_overridden = ingest_config_signature(&overridden, &img());
    assert_ne!(
        sig_base, sig_overridden,
        "paddle-onnx model path change must invalidate images"
    );
}
/// (c) Settings that do not feed the signature leave the paddle-onnx image
/// signature untouched (same asset triple → same engine_version).
#[test]
fn paddle_image_signature_stable_for_unrelated_change() {
    let base = {
        let mut cfg = Config::defaults();
        cfg.image.ocr.enabled = true;
        cfg.image.ocr.engine = String::from("paddle-onnx");
        cfg
    };
    let other = {
        let mut cfg = base.clone();
        cfg.search.default_k += 3;
        // runtime-only knob
        cfg.image.ocr.max_pixels += 100;
        cfg
    };

    let sig_base = ingest_config_signature(&base, &img());
    let sig_other = ingest_config_signature(&other, &img());
    assert_eq!(
        sig_base, sig_other,
        "unrelated/runtime-only changes must not invalidate paddle images"
    );
}
/// PDF OCR: switching `pdf.ocr.engine` with the model unchanged must change
/// the PDF signature and leave the markdown / image / code signatures alone.
#[test]
fn pdf_ocr_engine_switch_invalidates_pdf() {
    let ollama = {
        let mut cfg = Config::defaults();
        cfg.pdf.ocr.enabled = true;
        cfg
    };
    let paddle = {
        let mut cfg = ollama.clone();
        cfg.pdf.ocr.engine = String::from("paddle-onnx");
        cfg
    };

    let pdf_before = ingest_config_signature(&ollama, &pdf());
    let pdf_after = ingest_config_signature(&paddle, &pdf());
    assert_ne!(pdf_before, pdf_after, "pdf engine switch must invalidate pdf");

    // Every non-PDF media type must be unaffected by the PDF engine choice.
    let untouched = [md(), img(), code()];
    for m in untouched {
        let before = ingest_config_signature(&ollama, &m);
        let after = ingest_config_signature(&paddle, &m);
        assert_eq!(before, after, "pdf engine switch must NOT touch {m:?}");
    }
}
}

View File

@@ -39,6 +39,11 @@ impl OcrEngine for MockOcrEngine {
"mock-v1".to_string()
}
// Mock implementation of the `OcrEngine::model` trait method: always reports
// the fixed id "mock-model".
// NOTE(review): the lint is presumably silenced because the `&str` return type
// mirrors the trait method's signature rather than being tightened to
// `&'static str` — confirm against the trait definition.
#[allow(clippy::unnecessary_literal_bound)]
fn model(&self) -> &str {
"mock-model"
}
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
if self.fail {
anyhow::bail!("mock failure");

View File

@@ -377,6 +377,36 @@ pub struct OcrCfg {
/// `86400`).
#[serde(default = "default_ocr_request_timeout_secs")]
pub request_timeout_secs: u64,
// ── paddle-onnx engine overrides (v0.27.0) ──────────────────────────
// Only consulted when `engine == "paddle-onnx"`; the ollama-vision
// engine ignores them. All `#[serde(default)]` so pre-v0.27 config
// files load unchanged.
/// Override path to the detection ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/ppocrv5_mobile_det.onnx` (or the directory
/// named by `KEBAB_IMAGE_OCR_MODEL_DIR`).
#[serde(default)]
pub det_model: Option<String>,
/// Override path to the recognition ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/korean_ppocrv5_mobile_rec.onnx`.
#[serde(default)]
pub rec_model: Option<String>,
/// Override path to the character dictionary. `None` → bundled
/// `assets/paddleocr-onnx/korean_dict.txt`.
#[serde(default)]
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
/// probability is below this are dropped. Default `0.3`.
#[serde(default = "default_ocr_score_thresh")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box before crop.
/// Larger = more padding around the text. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
/// past this count are truncated with a warning. Default `1000`.
#[serde(default = "default_ocr_max_boxes")]
pub max_boxes: usize,
}
impl OcrCfg {
@@ -389,10 +419,29 @@ impl OcrCfg {
languages: vec!["eng".to_string(), "kor".to_string()],
max_pixels: 1600,
request_timeout_secs: default_ocr_request_timeout_secs(),
det_model: None,
rec_model: None,
dict: None,
score_thresh: default_ocr_score_thresh(),
unclip_ratio: default_ocr_unclip_ratio(),
max_boxes: default_ocr_max_boxes(),
}
}
}
/// Serde / constructor default for [`OcrCfg::score_thresh`]: the paddle-onnx
/// DBNet box score threshold used when the config omits the key.
fn default_ocr_score_thresh() -> f32 {
    0.3_f32
}
/// Serde / constructor default for [`OcrCfg::unclip_ratio`]: the paddle-onnx
/// polygon unclip ratio used when the config omits the key.
fn default_ocr_unclip_ratio() -> f32 {
    1.5_f32
}
/// Serde / constructor default for [`OcrCfg::max_boxes`]: the paddle-onnx
/// per-image cap on detected boxes used when the config omits the key.
fn default_ocr_max_boxes() -> usize {
    1_000
}
/// v0.17.2 post-dogfood: matches the legacy hard-coded ceiling so
/// existing configs that omit the field keep behaving identically.
/// Overridable per config / `KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS`.
@@ -512,7 +561,9 @@ pub struct PdfOcrCfg {
/// scanned pages only. `true` — vision LLM 호출 on every page
/// (vector PDF 의 dual-text confidence boost — doubles chunk count).
pub always_on: bool,
/// Engine identifier. v1 only ships `"ollama-vision"`.
/// Engine identifier: `"ollama-vision"` or `"paddle-onnx"`. When set to
/// `"paddle-onnx"`, model paths and tuning knobs are read from
/// `[image.ocr]`, not `[pdf.ocr]` — PaddleOCR has no PDF-specific tuning.
pub engine: String,
/// Vision model id. Default `"qwen2.5vl:3b"` per PoC (§3.5 family
/// asymmetry vs image OCR's gemma4:e4b is acknowledged).
@@ -1098,6 +1149,34 @@ impl Config {
self.image.ocr.request_timeout_secs = n;
}
}
// paddle-onnx engine overrides (v0.27.0). Empty string → None
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
"KEBAB_IMAGE_OCR_DET_MODEL" => {
self.image.ocr.det_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_REC_MODEL" => {
self.image.ocr.rec_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_DICT" => {
self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.score_thresh = f;
}
}
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.unclip_ratio = f;
}
}
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
if let Ok(n) = v.parse::<usize>() {
self.image.ocr.max_boxes = n;
}
}
// image.caption (P6-3)
"KEBAB_IMAGE_CAPTION_ENABLED" => {

View File

@@ -35,6 +35,24 @@ kamadak-exif = "0.6"
# transitive tokio runtime is brought in once.
reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls"] }
base64 = { workspace = true }
thiserror = { workspace = true }
# paddle-onnx OCR engine (PP-OCRv5, in-process). We reuse the workspace ort
# pin (=2.0.0-rc.9) so the ONNX Runtime native lib stays single-versioned with
# fastembed / kebab-nli (oar-ocr is intentionally NOT a dep — it would pull
# ort rc.12 + ndarray 0.17, splitting the native `links` and threatening the
# embedding stack). `download-binaries` extends the pin the same way
# `kebab-nli/Cargo.toml:23` does: this crate isn't in fastembed's build graph,
# so a standalone `cargo test -p kebab-parse-image` needs it to link onnxruntime.
ort = { workspace = true, features = ["ndarray", "download-binaries"] }
ndarray = { workspace = true }
# blake3: engine_version hash over the bundled det/rec/dict assets (computed
# once at OnnxPaddleOcr construction, cached — `ingest_config_signature` calls
# engine_version() per asset).
blake3 = { workspace = true }
# imageproc: connected-components / contours for DBNet det post-processing.
# min-area rotated-rect (rotating calipers) and polygon unclip are implemented
# in pure Rust (clipper2 is C++ FFI — would break the single-binary guarantee).
imageproc = "0.25"
[dev-dependencies]
tempfile = { workspace = true }

View File

@@ -0,0 +1,33 @@
PP-OCRv5 mobile ONNX models bundled with kebab (paddle-onnx OCR engine)
=======================================================================
These model weights and the recognition dictionary are derived from
PaddleOCR (https://github.com/PaddlePaddle/PaddleOCR), licensed under the
Apache License, Version 2.0.
Copyright (c) PaddlePaddle Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use these files except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Files
-----
ppocrv5_mobile_det.onnx PP-OCRv5_mobile detection model (DBNet)
korean_ppocrv5_mobile_rec.onnx korean_PP-OCRv5_mobile recognition model (CTC)
korean_dict.txt recognition dictionary (11,945 chars: KR + Latin + digits + symbols)
These were converted from the official PaddlePaddle inference models to ONNX
via paddle2onnx for in-process execution with onnxruntime (`ort`). No model
architecture or weights were modified; only the serialization format changed.
The recognition CTC class layout (empirically confirmed, see
tests/golden/ctc_rec_golden.json):
index 0 = CTC blank
index 1..11945 = korean_dict.txt line N -> class N (dict[N-1])
index 11946 = space ' '
total classes = 11947 (= 11945 dict + blank + space)
If any post-processing source (min-area-rect / polygon unclip) is later
ported verbatim from oar-ocr (Apache-2.0), record the per-file provenance
here as required by the Apache-2.0 attribution clause.

File diff suppressed because it is too large Load Diff

View File

@@ -30,9 +30,11 @@ mod dims;
mod exif_extract;
mod image_prep;
pub mod ocr;
pub mod paddle_onnx;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr};
pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr};
pub use paddle_onnx::{ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};
use anyhow::{Context, Result};
use kebab_core::{

View File

@@ -65,6 +65,13 @@ pub trait OcrEngine: Send + Sync {
/// through to engines that benefit from it (Tesseract languages,
/// LLM prompt steering); ignore otherwise.
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText>;
/// Human-facing model label for the ingest progress display
/// (`AssetPhase{phase:"ocr", model}`). Distinct from
/// [`engine_version`](Self::engine_version), which is the cache-key
/// hash. E.g. `"gemma4:e4b"` (ollama-vision) or `"ppocrv5-mobile-kor"`
/// (paddle-onnx).
fn model(&self) -> &str;
}
/// Mutate `block.ocr` in place by running `engine` over `image_bytes`,
@@ -209,13 +216,6 @@ impl OllamaVisionOcr {
self.max_pixels
}
/// The Ollama model id this engine drives (e.g. `gemma4:e4b`).
/// Surfaced so the ingest progress display can name the model
/// running a slow OCR phase (`AssetPhase{phase:"ocr", model}`).
pub fn model(&self) -> &str {
&self.model
}
fn build_prompt(&self, lang_hint: Option<&Lang>) -> String {
let langs = if self.languages.is_empty() {
"any".to_string()
@@ -247,6 +247,10 @@ impl OcrEngine for OllamaVisionOcr {
format!("ollama/{}", self.model)
}
fn model(&self) -> &str {
&self.model
}
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText> {
let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels)
.context("preparing image for OCR")?;

View File

@@ -0,0 +1,985 @@
//! PP-OCRv5 ONNX OCR engine — in-process detection + recognition on the
//! workspace-pinned `ort` (=2.0.0-rc.9), no Python runtime, no oar-ocr
//! production dependency (see crate-level rationale + `assets/paddleocr-onnx/NOTICE`).
//!
//! Pipeline (`recognize`):
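//! 1. decode (RGB); only the det input is resized — long edge capped at
//! `min(max_pixels, 960)`, then each side rounded to a multiple of 32
//! (rec crops are taken from the full-resolution image)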
//! 2. det: ImageNet-normalized NCHW → DBNet prob map `[1,1,H,W]` → threshold
//! 0.3 → contours → min-area rect (rotating calipers, pure Rust) →
//! unclip(ratio 1.5, pure Rust) → boxes
//! 3. crop+rectify: perspective warp each rotated box to a horizontal strip
//! 4. rec: 48×W normalized `(x-0.5)/0.5` → `[1,T,11947]` → CTC greedy decode
//! 5. assemble reading-order `OcrText`
//!
//! ## Confirmed CTC facts (empirically derived in T0a, see
//! `tests/golden/ctc_rec_golden.json` — do NOT re-derive):
//! * rec classes = 11947 = dict(11945) + blank + space
//! * index 0 = CTC blank
//! * index 1..=11945 = `korean_dict.txt` line N → class N (i.e. `dict[N-1]`)
//! * index 11946 = space ' '
//!
//! ## rc.9 API notes (differ from rc.12):
//! * `try_extract_tensor::<f32>()` → `ArrayViewD<f32>` (`.shape()` / indexing).
//! * `Session::run` is called through a `Mutex` guard so the engine is
//! `Send + Sync` regardless of `Session`'s own auto-trait status (ingest
//! is serial today; the lock is uncontended).
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use anyhow::{Context, Result};
use kebab_core::{Lang, OcrRegion, OcrText};
use ndarray::Array4;
use ort::session::Session;
use ort::value::Value;
use crate::ocr::OcrEngine;
/// Engine name written into `OcrText.engine`.
pub const PADDLE_ONNX_ENGINE: &str = "paddle-onnx";
/// CTC blank class index (confirmed in T0a).
const CTC_BLANK: usize = 0;
/// Space class index (confirmed in T0a). `1..=DICT_LINES` map to dict entries.
const CTC_SPACE: usize = 11946;
/// `korean_dict.txt` line count (confirmed in T0a).
const DICT_LINES: usize = 11945;
/// rec output class count = dict + blank + space (confirmed in T0a).
const REC_CLASSES: usize = 11947;
/// det long-edge cap before rounding to a multiple of 32 (PaddleOCR default).
const DET_LIMIT_SIDE_LEN: u32 = 960;
/// rec input height (PP-OCRv5 mobile).
const REC_HEIGHT: u32 = 48;
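/// DBNet probability-map binarization threshold: pixels strictly above it
/// become foreground for contour extraction. Looser than Paddle's default
/// `box_thresh` (0.6) to keep recall high on low-contrast Korean text.
/// NOTE(review): `box_thresh` names Paddle's per-box score filter (the role
/// `score_thresh` plays in this file), not its binarization threshold —
/// confirm which Paddle default this comparison is meant against.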
const DET_BIN_THRESH: f32 = 0.3;
/// ImageNet normalization (det preprocessing — RGB).
const IMAGENET_MEAN: [f32; 3] = [0.485, 0.456, 0.406];
const IMAGENET_STD: [f32; 3] = [0.229, 0.224, 0.225];
/// PP-OCRv5 ONNX engine. Holds the two ONNX sessions (loaded once) and the
/// dict. `engine_version` is computed once at construction (blake3 over the
/// three model assets) and cached — `ingest_config_signature` calls
/// `engine_version()` per asset, so re-hashing there would be O(assets).
pub struct OnnxPaddleOcr {
// Detection (DBNet) session; locked around each det inference.
det: Mutex<Session>,
// Recognition (CTC) session; locked around each rec inference.
rec: Mutex<Session>,
// Name of the det model's first input, captured at construction.
det_input_name: String,
// Name of the rec model's first input, captured at construction.
rec_input_name: String,
// Recognition dictionary, one token per dict-file line (class N → dict[N-1]).
dict: Vec<String>,
// Cached asset-hash version string returned by `engine_version()`.
engine_version: String,
// Minimum mean-probability box score; lower-scoring boxes are dropped.
score_thresh: f32,
// Expansion ratio applied to each detected rect before cropping.
unclip_ratio: f32,
// Upper bound on boxes recognized per image; extras are truncated.
max_boxes: usize,
// Long-edge cap fed to the det resize; clamped to 256..=4096 at construction.
max_pixels: u32,
}
/// Manual `Debug`: prints the cached version, the dict size and the tuning
/// knobs, and marks the output non-exhaustive because the ONNX sessions and
/// input names are left out.
impl std::fmt::Debug for OnnxPaddleOcr {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let dict_lines = self.dict.len();
        let mut out = f.debug_struct("OnnxPaddleOcr");
        out.field("engine_version", &self.engine_version);
        out.field("dict_lines", &dict_lines);
        out.field("score_thresh", &self.score_thresh);
        out.field("unclip_ratio", &self.unclip_ratio);
        out.field("max_boxes", &self.max_boxes);
        out.field("max_pixels", &self.max_pixels);
        out.finish_non_exhaustive()
    }
}
/// Resolved model-asset paths. Construction is decoupled from `kebab-config`
/// (T7 adds the `det_model`/`rec_model`/`dict` overrides) so the engine can be
/// built directly in tests.
#[derive(Clone, Debug)]
pub struct ModelPaths {
/// Detection model file (`.onnx`).
pub det: PathBuf,
/// Recognition model file (`.onnx`).
pub rec: PathBuf,
/// Recognition dictionary text file, one token per line.
pub dict: PathBuf,
}
impl ModelPaths {
    /// Paths of the three bundled assets inside the default asset directory.
    ///
    /// The directory is the value of `KEBAB_IMAGE_OCR_MODEL_DIR` when that
    /// variable can be read, otherwise `assets/paddleocr-onnx` under the
    /// crate's compile-time `CARGO_MANIFEST_DIR`.
    pub fn from_default_dir() -> Self {
        let base = match std::env::var("KEBAB_IMAGE_OCR_MODEL_DIR") {
            Ok(custom) => PathBuf::from(custom),
            Err(_) => Path::new(env!("CARGO_MANIFEST_DIR")).join("assets/paddleocr-onnx"),
        };
        let asset = |file_name: &str| base.join(file_name);
        Self {
            det: asset("ppocrv5_mobile_det.onnx"),
            rec: asset("korean_ppocrv5_mobile_rec.onnx"),
            dict: asset("korean_dict.txt"),
        }
    }

    /// Resolve model paths from the `image.ocr` config (T7).
    ///
    /// `det_model`, `rec_model` and `dict` each replace the matching bundled
    /// path when set. Unset fields keep the value from
    /// [`from_default_dir`], so a caller may override a single asset.
    ///
    /// [`from_default_dir`]: ModelPaths::from_default_dir
    pub fn from_config(config: &kebab_config::Config) -> Self {
        let Self { det, rec, dict } = Self::from_default_dir();
        let overrides = &config.image.ocr;
        Self {
            det: overrides.det_model.as_ref().map_or(det, PathBuf::from),
            rec: overrides.rec_model.as_ref().map_or(rec, PathBuf::from),
            dict: overrides.dict.as_ref().map_or(dict, PathBuf::from),
        }
    }
}
impl OnnxPaddleOcr {
/// Build from a workspace [`kebab_config::Config`]. Resolves model paths
/// via [`ModelPaths::from_config`] (per-asset `image.ocr` overrides, bundled
/// defaults otherwise) and forwards the `image.ocr` tuning knobs to
/// [`from_paths`](Self::from_paths).
/// Construction loads both ONNX sessions and hashes the assets — failures
/// here are fail-fast (matches the Ollama adapter's construction contract).
pub fn new(config: &kebab_config::Config) -> Result<Self> {
let paths = ModelPaths::from_config(config);
let ocr = &config.image.ocr;
Self::from_paths(
&paths,
ocr.score_thresh,
ocr.unclip_ratio,
ocr.max_boxes,
ocr.max_pixels,
)
}
/// Build from explicit asset paths + tuning knobs. Used by tests and by
/// `new` after path resolution.
///
/// Steps run in this order: load + length-check the dict, hash the three
/// assets, load the det and rec sessions, capture each model's first input
/// name. The dict check comes first so a wrong dict is reported before any
/// model file is touched.
///
/// # Errors
/// Returns an error when the dict cannot be read or does not have exactly
/// `DICT_LINES` lines, when an asset cannot be hashed, when either session
/// fails to load, or when a model declares no inputs.
pub fn from_paths(
paths: &ModelPaths,
score_thresh: f32,
unclip_ratio: f32,
max_boxes: usize,
max_pixels: u32,
) -> Result<Self> {
let dict = load_dict(&paths.dict)
.with_context(|| format!("loading OCR dict from {}", paths.dict.display()))?;
// bounds-check: dict length must match the rec class layout
// (dict + blank + space). A mismatch means a wrong dict file —
// fail at construction rather than mis-decoding silently.
if dict.len() != DICT_LINES {
anyhow::bail!(
"OnnxPaddleOcr: dict has {} lines, expected {DICT_LINES} \
(rec classes {REC_CLASSES} = dict + blank + space)",
dict.len()
);
}
// Hash once here; `engine_version()` later just clones the cached string.
let engine_version = compute_engine_version(paths)
.context("hashing OCR model assets for engine_version")?;
let det = Session::builder()
.context("ort Session::builder (det)")?
.commit_from_file(&paths.det)
.with_context(|| format!("loading det model {}", paths.det.display()))?;
let rec = Session::builder()
.context("ort Session::builder (rec)")?
.commit_from_file(&paths.rec)
.with_context(|| format!("loading rec model {}", paths.rec.display()))?;
// Input names are read from the models rather than hard-coded.
let det_input_name = det
.inputs
.first()
.map(|i| i.name.clone())
.context("det model has no inputs")?;
let rec_input_name = rec
.inputs
.first()
.map(|i| i.name.clone())
.context("rec model has no inputs")?;
Ok(Self {
det: Mutex::new(det),
rec: Mutex::new(rec),
det_input_name,
rec_input_name,
dict,
engine_version,
score_thresh,
unclip_ratio,
max_boxes,
// Out-of-range values are clamped instead of rejected.
max_pixels: max_pixels.clamp(256, 4096),
})
}
}
impl OcrEngine for OnnxPaddleOcr {
fn engine_name(&self) -> &'static str {
PADDLE_ONNX_ENGINE
}
fn engine_version(&self) -> String {
self.engine_version.clone()
}
// The trait method's elided lifetime ties the return to `&self`; the body
// returns a literal, but the signature must match the trait, so allow the
// `'static`-narrowing lint here.
#[allow(clippy::unnecessary_literal_bound)]
fn model(&self) -> &str {
// Static label for the progress display; the per-asset hash lives
// in `engine_version`.
"ppocrv5-mobile-kor"
}
fn recognize(&self, image_bytes: &[u8], _lang_hint: Option<&Lang>) -> Result<OcrText> {
let img = image::load_from_memory(image_bytes)
.context("decoding image for OCR")?
.to_rgb8();
let (orig_w, orig_h) = (img.width(), img.height());
if orig_w == 0 || orig_h == 0 {
return Ok(empty_ocr(self));
}
// ── det ────────────────────────────────────────────────────────
let (det_w, det_h) = det_target_dims(orig_w, orig_h, self.max_pixels);
let det_img = image::imageops::resize(
&img,
det_w,
det_h,
image::imageops::FilterType::Triangle,
);
let prob = self.run_det(&det_img)?; // (det_h, det_w) prob map
let scale_x = orig_w as f32 / det_w as f32;
let scale_y = orig_h as f32 / det_h as f32;
let mut boxes = det_postprocess(
&prob,
prob.w,
prob.h,
self.score_thresh,
self.unclip_ratio,
);
if boxes.len() > self.max_boxes {
tracing::warn!(
target: "kebab-parse-image",
"paddle-onnx: {} boxes exceeds max_boxes {} — truncating",
boxes.len(),
self.max_boxes
);
boxes.truncate(self.max_boxes);
}
// scale box corners back to original image coordinates
for b in &mut boxes {
for p in &mut b.corners {
p.0 *= scale_x;
p.1 *= scale_y;
}
}
if boxes.is_empty() {
return Ok(empty_ocr(self));
}
// ── rec per box (reading order: top→bottom, left→right) ─────────
boxes.sort_by(|a, b| {
let ay = a.center_y();
let by = b.center_y();
// group into rough rows by 0.5*box height tolerance via y then x
ay.partial_cmp(&by)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| {
a.center_x()
.partial_cmp(&b.center_x())
.unwrap_or(std::cmp::Ordering::Equal)
})
});
let mut regions: Vec<OcrRegion> = Vec::with_capacity(boxes.len());
for b in &boxes {
let crop = rectify_crop(&img, &b.corners);
if crop.width() == 0 || crop.height() == 0 {
continue;
}
let (text, conf) = self.run_rec(&crop)?;
if text.is_empty() {
continue; // rec empty → skip this box, keep the rest
}
let (x, y, w, h) = b.aabb();
regions.push(OcrRegion {
bbox: (x, y, w, h),
text,
confidence: conf,
});
}
let joined = regions
.iter()
.map(|r| r.text.as_str())
.collect::<Vec<_>>()
.join("\n");
Ok(OcrText {
joined,
regions,
engine: PADDLE_ONNX_ENGINE.to_string(),
engine_version: self.engine_version.clone(),
})
}
}
impl OnnxPaddleOcr {
/// Run det session → `(det_h, det_w)` probability map as a row-major Vec.
fn run_det(&self, det_img: &image::RgbImage) -> Result<ProbMap> {
let (w, h) = (det_img.width() as usize, det_img.height() as usize);
let mut arr = Array4::<f32>::zeros((1, 3, h, w));
for (x, y, px) in det_img.enumerate_pixels() {
let (xi, yi) = (x as usize, y as usize);
for c in 0..3 {
let v = f32::from(px[c]) / 255.0;
arr[[0, c, yi, xi]] = (v - IMAGENET_MEAN[c]) / IMAGENET_STD[c];
}
}
let input = Value::from_array(arr).context("det Value::from_array")?;
let sess = self.det.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
let outputs = sess
.run(ort::inputs![self.det_input_name.as_str() => input]?)
.context("det session run")?;
let out_name = sess.outputs[0].name.clone();
let view = outputs[out_name.as_str()]
.try_extract_tensor::<f32>()
.context("det output extract")?;
// shape [1,1,H,W]
let shape = view.shape();
let (oh, ow) = (shape[shape.len() - 2], shape[shape.len() - 1]);
let data: Vec<f32> = view.iter().copied().collect();
Ok(ProbMap { w: ow, h: oh, data })
}
/// Run rec session on a rectified crop → (decoded string, mean confidence).
fn run_rec(&self, crop: &image::RgbImage) -> Result<(String, f32)> {
// resize keep-aspect to height 48, then this single crop is its own batch
let (cw, ch) = (crop.width().max(1), crop.height().max(1));
let new_w = ((REC_HEIGHT as f32 / ch as f32) * cw as f32).round().max(1.0) as u32;
let resized = image::imageops::resize(
crop,
new_w,
REC_HEIGHT,
image::imageops::FilterType::Triangle,
);
let w = new_w as usize;
let h = REC_HEIGHT as usize;
let mut arr = Array4::<f32>::zeros((1, 3, h, w));
for (x, y, px) in resized.enumerate_pixels() {
let (xi, yi) = (x as usize, y as usize);
for c in 0..3 {
let v = f32::from(px[c]) / 255.0;
arr[[0, c, yi, xi]] = (v - 0.5) / 0.5; // [-1, 1]
}
}
let input = Value::from_array(arr).context("rec Value::from_array")?;
let sess = self.rec.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
let outputs = sess
.run(ort::inputs![self.rec_input_name.as_str() => input]?)
.context("rec session run")?;
let out_name = sess.outputs[0].name.clone();
let view = outputs[out_name.as_str()]
.try_extract_tensor::<f32>()
.context("rec output extract")?;
// shape [1, T, C]
let shape = view.shape();
let (t, c) = (shape[shape.len() - 2], shape[shape.len() - 1]);
if c != REC_CLASSES {
anyhow::bail!(
"rec output has {c} classes, expected {REC_CLASSES} \
(dict {DICT_LINES} + blank + space)"
);
}
let data: Vec<f32> = view.iter().copied().collect();
Ok(self.ctc_greedy_decode(&data, t, c))
}
/// CTC greedy decode over `[T, C]` logits/probs (row-major). Delegates to
/// [`ctc_greedy_decode_with_dict`] so the algorithm is testable without
/// loading ONNX sessions (see `tests::ctc_greedy_decode_golden`).
fn ctc_greedy_decode(&self, data: &[f32], t: usize, c: usize) -> (String, f32) {
ctc_greedy_decode_with_dict(data, t, c, &self.dict)
}
}
/// CTC greedy decode: per-timestep argmax → collapse consecutive duplicates →
/// drop blank (index 0) → map class index to string via `dict`.
/// Pure Rust, no I/O — usable in unit tests without loading ONNX sessions.
///
/// * `data` — row-major `[t, c]` scores; at most `t` complete rows are read.
/// * `dict` — class `N` in `1..=DICT_LINES` maps to `dict[N - 1]`.
///
/// Returns the decoded text and the mean of the winning scores of the emitted
/// classes (`0.0` when nothing is emitted).
///
/// Never panics on malformed input: a class with no dict entry (dict shorter
/// than `DICT_LINES`) is skipped like any other unmapped class, rows missing
/// from a short `data` buffer are ignored, and `c == 0` yields an empty result.
fn ctc_greedy_decode_with_dict(data: &[f32], t: usize, c: usize, dict: &[String]) -> (String, f32) {
    if c == 0 {
        // No classes → every timestep is blank (also keeps `chunks_exact` valid).
        return (String::new(), 0.0);
    }
    let class_to_str = |idx: usize| -> Option<&str> {
        match idx {
            CTC_BLANK => None,
            CTC_SPACE => Some(" "),
            i if (1..=DICT_LINES).contains(&i) => dict.get(i - 1).map(String::as_str),
            _ => None,
        }
    };
    let mut out = String::new();
    let mut conf_sum = 0.0f32;
    let mut conf_count = 0usize;
    let mut prev = usize::MAX;
    for row in data.chunks_exact(c).take(t) {
        // First strict maximum wins; a row of NaN / -inf falls back to blank.
        let mut best = 0usize;
        let mut best_v = f32::MIN;
        for (i, &v) in row.iter().enumerate() {
            if v > best_v {
                best_v = v;
                best = i;
            }
        }
        if best != prev && best != CTC_BLANK {
            if let Some(s) = class_to_str(best) {
                out.push_str(s);
                conf_sum += best_v;
                conf_count += 1;
            }
        }
        prev = best;
    }
    let conf = if conf_count == 0 {
        0.0
    } else {
        conf_sum / conf_count as f32
    };
    (out, conf)
}
/// Build the "nothing recognized" result: no text, no regions, stamped with
/// this engine's name and the cached `engine_version` of `e`.
fn empty_ocr(e: &OnnxPaddleOcr) -> OcrText {
    let engine = PADDLE_ONNX_ENGINE.to_string();
    let engine_version = e.engine_version.clone();
    OcrText {
        joined: String::new(),
        regions: Vec::new(),
        engine,
        engine_version,
    }
}
/// Read the recognition dictionary at `path`, one token per `'\n'`-separated
/// line with trailing `'\r'` characters removed.
///
/// Interior empty lines are kept as empty tokens so class indices stay
/// aligned with line numbers; only one empty trailing element (the one a
/// final newline produces) is dropped.
///
/// # Errors
/// Propagates the I/O or UTF-8 error from reading the file.
fn load_dict(path: &Path) -> Result<Vec<String>> {
    let contents = std::fs::read_to_string(path)?;
    let mut tokens = Vec::new();
    for piece in contents.split('\n') {
        tokens.push(piece.trim_end_matches('\r').to_string());
    }
    // A file ending in '\n' yields one empty tail element; remove just that one.
    if matches!(tokens.last(), Some(tail) if tail.is_empty()) {
        tokens.pop();
    }
    Ok(tokens)
}
/// Compute the paddle-onnx `engine_version` for `config` without loading the
/// ONNX sessions (T9).
///
/// Produces the same blake3-over-assets string a constructed
/// [`OnnxPaddleOcr`] returns from [`OcrEngine::engine_version`], so the ingest
/// config signature can include it. Reads ~17 MB of model bytes on every
/// call — callers MUST memoize per (det,rec,dict) triple (m3: never re-hash
/// per asset).
///
/// # Errors
/// Fails when any of the three resolved asset files cannot be read.
pub fn engine_version_for_config(config: &kebab_config::Config) -> Result<String> {
    let paths = ModelPaths::from_config(config);
    compute_engine_version(&paths)
}
/// Derive the stable `engine_version` from the asset bytes: blake3 over the
/// det, rec and dict files (fed in that order), rendered as
/// `ppocrv5-mobile-kor-` followed by the first 12 hex digits of the digest.
///
/// # Errors
/// Fails, naming the file, when an asset cannot be read.
fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
    let mut hasher = blake3::Hasher::new();
    let assets = [&paths.det, &paths.rec, &paths.dict];
    for asset in assets {
        let contents =
            std::fs::read(asset).with_context(|| format!("reading {}", asset.display()))?;
        hasher.update(&contents);
    }
    let digest = hasher.finalize().to_hex();
    let short = &digest.as_str()[..12];
    Ok(format!("ppocrv5-mobile-kor-{short}"))
}
/// Size of the det-model input for a `w`×`h` image.
///
/// The long edge is capped at `min(max(max_pixels, 32), 960)` keeping the
/// aspect ratio (images already within the cap are not scaled), then each side
/// is rounded to the nearest multiple of 32 (DBNet stride) with a floor of 32.
/// Reproduces the T0a golden (192×900 → 192×896).
fn det_target_dims(w: u32, h: u32, max_pixels: u32) -> (u32, u32) {
    /// Nearest multiple of 32, never below 32.
    fn snap_to_stride(v: f32) -> u32 {
        let snapped = (v / 32.0).round() as u32 * 32;
        snapped.max(32)
    }

    let cap = max_pixels.max(32).min(DET_LIMIT_SIDE_LEN);
    let longest = w.max(h);
    let scale = if longest <= cap {
        1.0
    } else {
        cap as f32 / longest as f32
    };
    let scaled = |side: u32| (side as f32 * scale).round().max(1.0);
    (snap_to_stride(scaled(w)), snap_to_stride(scaled(h)))
}
// ── det postprocessing ──────────────────────────────────────────────────────
/// Single-channel detection probability map stored row-major: the value for
/// pixel `(x, y)` lives at `data[y * w + x]`.
struct ProbMap {
    /// Map width in pixels (row stride of `data`).
    w: usize,
    /// Map height in pixels.
    h: usize,
    /// Probabilities in row-major order.
    data: Vec<f32>,
}

impl ProbMap {
    /// Probability at column `x`, row `y`. Panics if the computed offset is
    /// outside `data`.
    #[inline]
    fn at(&self, x: usize, y: usize) -> f32 {
        let offset = y * self.w + x;
        self.data[offset]
    }
}
/// One detected text box, stored as its 4 corners (clockwise from top-left).
/// Coordinates start in det-image space and are later scaled to the original
/// image.
#[derive(Clone, Debug)]
struct DetBox {
    corners: [(f32, f32); 4],
}

impl DetBox {
    /// Mean x of the four corners.
    fn center_x(&self) -> f32 {
        let total: f32 = self.corners.iter().map(|&(x, _)| x).sum();
        total / 4.0
    }

    /// Mean y of the four corners.
    fn center_y(&self) -> f32 {
        let total: f32 = self.corners.iter().map(|&(_, y)| y).sum();
        total / 4.0
    }

    /// Axis-aligned bounding box `(x, y, w, h)`, rounded to whole pixels.
    /// Both the minimum and the maximum of each axis are clamped to be
    /// non-negative before the extent is taken.
    fn aabb(&self) -> (u32, u32, u32, u32) {
        let clamped_span = |vals: [f32; 4]| -> (f32, f32) {
            let lo = vals.iter().copied().fold(f32::MAX, f32::min).max(0.0);
            let hi = vals.iter().copied().fold(f32::MIN, f32::max).max(0.0);
            (lo, hi)
        };
        let (min_x, max_x) = clamped_span(self.corners.map(|p| p.0));
        let (min_y, max_y) = clamped_span(self.corners.map(|p| p.1));
        let width = (max_x - min_x).round().max(0.0) as u32;
        let height = (max_y - min_y).round().max(0.0) as u32;
        (min_x.round() as u32, min_y.round() as u32, width, height)
    }
}
/// DBNet-style postprocess turning a probability map into text boxes.
///
/// Pixels of the `w`×`h` map strictly above `DET_BIN_THRESH` form a binary
/// mask; every contour of that mask with at least 4 points is fitted with a
/// min-area rect, kept only if its mean-probability score is not below
/// `score_thresh`, and expanded by `unclip_ratio`. Boxes are returned in
/// contour order, in map coordinates.
/// Pinned by `tests/golden/det_boxes_clean_paragraph.json` (3 boxes).
fn det_postprocess(
    prob: &ProbMap,
    w: usize,
    h: usize,
    score_thresh: f32,
    unclip_ratio: f32,
) -> Vec<DetBox> {
    use image::{GrayImage, Luma};

    // binarize at the detection threshold
    let mask = GrayImage::from_fn(w as u32, h as u32, |x, y| {
        let foreground = prob.at(x as usize, y as usize) > DET_BIN_THRESH;
        Luma([if foreground { 255u8 } else { 0u8 }])
    });

    let contours = imageproc::contours::find_contours::<u32>(&mask);
    contours
        .iter()
        .filter(|contour| contour.points.len() >= 4)
        .filter_map(|contour| {
            let outline: Vec<(f32, f32)> = contour
                .points
                .iter()
                .map(|p| (p.x as f32, p.y as f32))
                .collect();
            let rect = min_area_rect(&outline)?;
            // mean-prob box score over the AABB of the rotated rect
            if box_score(prob, &rect.corners) < score_thresh {
                return None;
            }
            Some(DetBox {
                corners: unclip_rect(&rect, unclip_ratio),
            })
        })
        .collect()
}
/// Mean probability over the axis-aligned bounding box of `corners` — the
/// `box_thresh` mean-prob filter used by the golden harness.
///
/// Bounds are truncated to whole pixels, clamped to be non-negative, and the
/// upper bounds are clamped to the last column/row of `prob`. A box that is
/// not at least two pixels wide and two pixels tall after clamping scores
/// `0.0`.
fn box_score(prob: &ProbMap, corners: &[(f32, f32); 4]) -> f32 {
    let xs = (*corners).map(|p| p.0);
    let ys = (*corners).map(|p| p.1);
    let lower = |vals: &[f32; 4]| vals.iter().copied().fold(f32::MAX, f32::min).max(0.0) as usize;
    let upper = |vals: &[f32; 4], len: usize| {
        (vals.iter().copied().fold(f32::MIN, f32::max).max(0.0) as usize)
            .min(len.saturating_sub(1))
    };
    let (x0, x1) = (lower(&xs), upper(&xs, prob.w));
    let (y0, y1) = (lower(&ys), upper(&ys, prob.h));
    if x1 <= x0 || y1 <= y0 {
        return 0.0;
    }

    // Row-major accumulation over the inclusive pixel rectangle.
    let (total, count) = (y0..=y1)
        .flat_map(|y| (x0..=x1).map(move |x| prob.at(x, y)))
        .fold((0.0f32, 0usize), |(sum, n), v| (sum + v, n + 1));
    if count == 0 {
        0.0
    } else {
        total / count as f32
    }
}
/// Rotated rect described by its 4 corners + box dims.
#[derive(Clone, Debug)]
struct RotRect {
// Corner order as built by `min_area_rect`:
// (min_u,min_v), (max_u,min_v), (max_u,max_v), (min_u,max_v).
corners: [(f32, f32); 4],
// Extent along the rect's u axis (the direction from corner 0 to corner 1).
width: f32,
// Extent along the rect's v axis (the direction from corner 0 to corner 3).
height: f32,
}
/// Minimum-area enclosing rectangle of a point set, found by trying a
/// rectangle aligned with every convex-hull edge and keeping the smallest
/// (pure Rust — no OpenCV / clipper2).
///
/// Returns `None` when the hull has fewer than 3 points or every hull edge is
/// shorter than `1e-6`. On equal areas the first candidate wins.
fn min_area_rect(points: &[(f32, f32)]) -> Option<RotRect> {
    let hull = convex_hull(points);
    let count = hull.len();
    if count < 3 {
        return None;
    }

    let mut smallest = f32::MAX;
    let mut winner: Option<RotRect> = None;
    for (idx, &start) in hull.iter().enumerate() {
        let end = hull[(idx + 1) % count];
        let (ex, ey) = (end.0 - start.0, end.1 - start.1);
        let norm = (ex * ex + ey * ey).sqrt();
        if norm < 1e-6 {
            continue;
        }
        let along = (ex / norm, ey / norm); // edge direction
        let across = (-along.1, along.0); // normal

        // Project every hull point on the (along, across) basis.
        let (min_u, max_u, min_v, max_v) = hull.iter().fold(
            (f32::MAX, f32::MIN, f32::MAX, f32::MIN),
            |(lo_u, hi_u, lo_v, hi_v), &(px, py)| {
                let pu = px * along.0 + py * along.1;
                let pv = px * across.0 + py * across.1;
                (lo_u.min(pu), hi_u.max(pu), lo_v.min(pv), hi_v.max(pv))
            },
        );

        let area = (max_u - min_u) * (max_v - min_v);
        if area < smallest {
            smallest = area;
            // reconstruct corners in (u,v) basis → world
            let to_world = |u: f32, v: f32| {
                (
                    u * along.0 + v * across.0,
                    u * along.1 + v * across.1,
                )
            };
            winner = Some(RotRect {
                corners: [
                    to_world(min_u, min_v),
                    to_world(max_u, min_v),
                    to_world(max_u, max_v),
                    to_world(min_u, max_v),
                ],
                width: max_u - min_u,
                height: max_v - min_v,
            });
        }
    }
    winner
}
/// Convex hull by Andrew's monotone chain.
///
/// Points are sorted by (x, y) and adjacent duplicates removed; fewer than 3
/// remaining points are returned as-is. Otherwise the lower chain followed by
/// the upper chain is returned, with collinear points dropped and neither
/// chain repeating the other's starting point.
fn convex_hull(points: &[(f32, f32)]) -> Vec<(f32, f32)> {
    type Pt = (f32, f32);

    /// z-component of (a - o) × (b - o); positive for a left turn.
    fn turn(o: Pt, a: Pt, b: Pt) -> f32 {
        (a.0 - o.0) * (b.1 - o.1) - (a.1 - o.1) * (b.0 - o.0)
    }

    /// One monotone chain over `ordered`, without its final point (that point
    /// opens the opposite chain).
    fn half_chain<'a>(ordered: impl Iterator<Item = &'a Pt>) -> Vec<Pt> {
        let mut chain: Vec<Pt> = Vec::new();
        for &p in ordered {
            while chain.len() >= 2
                && turn(chain[chain.len() - 2], chain[chain.len() - 1], p) <= 0.0
            {
                chain.pop();
            }
            chain.push(p);
        }
        chain.pop();
        chain
    }

    let mut sorted: Vec<Pt> = points.to_vec();
    sorted.sort_by(|a, b| {
        let by_x = a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal);
        match by_x {
            std::cmp::Ordering::Equal => {
                a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
            }
            decided => decided,
        }
    });
    sorted.dedup();
    if sorted.len() < 3 {
        return sorted;
    }

    let mut hull = half_chain(sorted.iter());
    hull.extend(half_chain(sorted.iter().rev()));
    hull
}
/// Grow a rotated rect outward on every side by
/// `distance = area * ratio / perimeter` (the PaddleOCR unclip distance), so
/// width and height each increase by `2 * distance`. Pure Rust stand-in for
/// the pyclipper polygon offset PaddleOCR uses.
///
/// A rect whose perimeter is below `1e-6` is returned unchanged.
fn unclip_rect(rect: &RotRect, ratio: f32) -> [(f32, f32); 4] {
    /// Unit vector from `from` to `to`, or `(0, 0)` for a degenerate edge.
    fn direction(from: (f32, f32), to: (f32, f32)) -> (f32, f32) {
        let (dx, dy) = (to.0 - from.0, to.1 - from.1);
        let len = (dx * dx + dy * dy).sqrt();
        if len > 1e-6 {
            (dx / len, dy / len)
        } else {
            (0.0, 0.0)
        }
    }

    let perimeter = 2.0 * (rect.width + rect.height);
    if perimeter < 1e-6 {
        return rect.corners;
    }
    let distance = rect.width * rect.height * ratio / perimeter;

    // Move each corner along the rect's own axes rather than radially from
    // the centroid: on a wide, short text box the diagonal is nearly
    // horizontal, so a radial push would barely add height and would clip
    // character tops and bottoms. Corner order is
    // c0=(min_u,min_v), c1=(max_u,min_v), c2=(max_u,max_v), c3=(min_u,max_v).
    let [c0, c1, _, c3] = rect.corners;
    let u = direction(c0, c1); // +u (along width)
    let v = direction(c0, c3); // +v (along height)

    // Outward sign of each corner along (u, v), in corner order.
    let signs: [(f32, f32); 4] = [(-1.0, -1.0), (1.0, -1.0), (1.0, 1.0), (-1.0, 1.0)];
    let mut grown = rect.corners;
    for (corner, (su, sv)) in grown.iter_mut().zip(signs) {
        *corner = (
            corner.0 + su * distance * u.0 + sv * distance * v.0,
            corner.1 + su * distance * u.1 + sv * distance * v.1,
        );
    }
    grown
}
// ── crop + rectify ───────────────────────────────────────────────────────────
/// Resample the quadrilateral `corners` of `img` into an upright strip.
///
/// The output width/height are the longer of the two opposite edge lengths
/// (rounded, at least 1). Each output pixel is mapped back into the quad by
/// bilinear interpolation of the four ordered corners and filled with the
/// nearest source pixel; source coordinates are clamped to the image.
fn rectify_crop(img: &image::RgbImage, corners: &[(f32, f32); 4]) -> image::RgbImage {
    // order corners: top-left, top-right, bottom-right, bottom-left
    let [tl, tr, br, bl] = order_corners(corners);
    let edge_len =
        |a: (f32, f32), b: (f32, f32)| ((a.0 - b.0).powi(2) + (a.1 - b.1).powi(2)).sqrt();
    let strip_w = edge_len(tl, tr).max(edge_len(bl, br));
    let strip_h = edge_len(tl, bl).max(edge_len(tr, br));
    let out_w = strip_w.round().max(1.0) as u32;
    let out_h = strip_h.round().max(1.0) as u32;

    let max_x = img.width() as f32 - 1.0;
    let max_y = img.height() as f32 - 1.0;
    // Normalizers so the last output row/column lands on the far edge; a
    // one-pixel strip divides by 1 instead of 0.
    let den_x = (out_w as f32 - 1.0).max(1.0);
    let den_y = (out_h as f32 - 1.0).max(1.0);
    let lerp = |a: f32, b: f32, t: f32| a + (b - a) * t;

    // bilinear map from output grid back to the source quad (adequate for
    // near-affine text boxes)
    let mut out = image::RgbImage::new(out_w, out_h);
    for (ox, oy, dst) in out.enumerate_pixels_mut() {
        let fx = ox as f32 / den_x;
        let fy = oy as f32 / den_y;
        let top = (lerp(tl.0, tr.0, fx), lerp(tl.1, tr.1, fx));
        let bot = (lerp(bl.0, br.0, fx), lerp(bl.1, br.1, fx));
        let sx = lerp(top.0, bot.0, fy).clamp(0.0, max_x);
        let sy = lerp(top.1, bot.1, fy).clamp(0.0, max_y);
        *dst = *img.get_pixel(sx.round() as u32, sy.round() as u32);
    }
    out
}
/// Arrange 4 corners as [top-left, top-right, bottom-right, bottom-left]
/// using coordinate sums/diffs (standard PaddleOCR ordering).
///
/// top-left has the smallest `x + y`, bottom-right the largest; top-right has
/// the smallest `y - x`, bottom-left the largest. On ties the earliest corner
/// wins, and the same input corner may fill more than one slot.
fn order_corners(corners: &[(f32, f32); 4]) -> [(f32, f32); 4] {
    /// Corner with the strictly smallest (or, with `want_max`, largest) key;
    /// falls back to the first corner when no key beats the sentinel.
    fn extreme(
        corners: &[(f32, f32); 4],
        key: fn((f32, f32)) -> f32,
        want_max: bool,
    ) -> (f32, f32) {
        let mut chosen = corners[0];
        let mut chosen_key = if want_max { f32::MIN } else { f32::MAX };
        for &p in corners {
            let k = key(p);
            let better = if want_max { k > chosen_key } else { k < chosen_key };
            if better {
                chosen_key = k;
                chosen = p;
            }
        }
        chosen
    }

    let sum: fn((f32, f32)) -> f32 = |p| p.0 + p.1;
    let diff: fn((f32, f32)) -> f32 = |p| p.1 - p.0;
    let tl = extreme(corners, sum, false);
    let br = extreme(corners, sum, true);
    let tr = extreme(corners, diff, false);
    let bl = extreme(corners, diff, true);
    [tl, tr, br, bl]
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn det_target_dims_matches_golden() {
// T0a golden: clean_paragraph 192×900 → det input 192×896.
assert_eq!(det_target_dims(900, 192, 1600), (896, 192));
}
#[test]
fn convex_hull_square() {
let pts = vec![(0.0, 0.0), (10.0, 0.0), (10.0, 10.0), (0.0, 10.0), (5.0, 5.0)];
let hull = convex_hull(&pts);
assert_eq!(hull.len(), 4);
}
#[test]
fn min_area_rect_axis_aligned() {
let pts = vec![(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)];
let r = min_area_rect(&pts).expect("rect");
let (lo, hi) = (r.width.min(r.height), r.width.max(r.height));
assert!((lo - 5.0).abs() < 1e-3, "short side {lo}");
assert!((hi - 20.0).abs() < 1e-3, "long side {hi}");
}
#[test]
fn dict_length_mismatch_is_construction_error() {
// T10: a dict whose line count != DICT_LINES must fail at construction
// (before loading the ONNX sessions) rather than mis-decoding silently.
use std::io::Write;
let dir = tempfile::tempdir().unwrap();
let dict_path = dir.path().join("bad_dict.txt");
let mut f = std::fs::File::create(&dict_path).unwrap();
writeln!(f, "a\nb\nc").unwrap(); // 3 lines, not DICT_LINES
let paths = ModelPaths {
det: dir.path().join("unused_det.onnx"),
rec: dir.path().join("unused_rec.onnx"),
dict: dict_path,
};
let err = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600)
.expect_err("dict mismatch must error");
let msg = format!("{err:#}");
assert!(msg.contains("dict has 3 lines"), "unexpected error: {msg}");
}
#[test]
fn model_paths_from_config_uses_overrides() {
// T7: unset overrides → bundled default asset paths.
let mut cfg = kebab_config::Config::defaults();
let def = ModelPaths::from_config(&cfg);
assert!(def.det.ends_with("ppocrv5_mobile_det.onnx"), "{:?}", def.det);
assert!(def.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", def.rec);
assert!(def.dict.ends_with("korean_dict.txt"), "{:?}", def.dict);
// Override det + dict; rec stays bundled (partial override allowed).
cfg.image.ocr.det_model = Some("/custom/det.onnx".to_string());
cfg.image.ocr.dict = Some("/custom/dict.txt".to_string());
let ov = ModelPaths::from_config(&cfg);
assert_eq!(ov.det, PathBuf::from("/custom/det.onnx"));
assert_eq!(ov.dict, PathBuf::from("/custom/dict.txt"));
assert!(ov.rec.ends_with("korean_ppocrv5_mobile_rec.onnx"), "{:?}", ov.rec);
}
#[test]
fn unclip_expands_box() {
let rect = RotRect {
corners: [(0.0, 0.0), (20.0, 0.0), (20.0, 5.0), (0.0, 5.0)],
width: 20.0,
height: 5.0,
};
let out = unclip_rect(&rect, 1.5);
// unclipped box must be strictly larger than the original
let orig_minx = 0.0;
let new_minx = out.iter().map(|p| p.0).fold(f32::MAX, f32::min);
assert!(new_minx < orig_minx, "expected expansion, got {new_minx}");
}
/// Golden pin for `ctc_greedy_decode_with_dict`.
///
/// Each case in `tests/golden/ctc_rec_golden.json` records a per-timestep
/// argmax sequence and the string it must decode to. The test rebuilds one-hot
/// logits from that sequence, so no ONNX session is involved — only the
/// bundled dictionary is read from disk.
#[test]
fn ctc_greedy_decode_golden() {
    let golden: serde_json::Value =
        serde_json::from_str(include_str!("../tests/golden/ctc_rec_golden.json")).unwrap();

    let bundled_dict = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("assets/paddleocr-onnx/korean_dict.txt");
    let dict = load_dict(&bundled_dict).expect("bundled dict must load");

    for case in golden["rec_cases"].as_array().unwrap() {
        let steps = case["T"].as_u64().unwrap() as usize;
        let classes = case["C"].as_u64().unwrap() as usize;

        let mut fired: Vec<usize> = Vec::new();
        for value in case["argmax_idx"].as_array().unwrap() {
            fired.push(value.as_u64().unwrap() as usize);
        }
        let expected = case["decoded"].as_str().unwrap();

        // Row-major [steps, classes] logits: at each timestep, only the
        // recorded argmax class is set to 1.0.
        let mut logits = vec![0.0f32; steps * classes];
        for (step, class) in fired.iter().copied().enumerate() {
            logits[step * classes + class] = 1.0;
        }

        let (decoded, _conf) = ctc_greedy_decode_with_dict(&logits, steps, classes, &dict);
        assert_eq!(
            decoded, expected,
            "CTC decode mismatch for text={:?}",
            case["text"]
        );
    }
}
/// Golden pin for `box_score` and `unclip_rect`, driven by the polygons stored
/// in `tests/golden/det_boxes_clean_paragraph.json`. No ONNX session is used:
/// the probability map is a synthetic constant.
#[test]
fn det_box_score_golden() {
    let golden: serde_json::Value = serde_json::from_str(include_str!(
        "../tests/golden/det_boxes_clean_paragraph.json"
    ))
    .unwrap();

    let hw = golden["det_input_hw"].as_array().unwrap();
    let h = hw[0].as_u64().unwrap() as usize;
    let w = hw[1].as_u64().unwrap() as usize;
    let thresh = golden["thresh"].as_f64().unwrap() as f32;
    let unclip_ratio = golden["unclip_ratio"].as_f64().unwrap() as f32;

    // Every pixel is 0.9, so each box's score is expected to clear `thresh`.
    let prob = ProbMap { w, h, data: vec![0.9f32; w * h] };

    for box_entry in golden["boxes"].as_array().unwrap() {
        let poly = box_entry["poly"].as_array().unwrap();
        // Reads one coordinate (axis 0 = x, axis 1 = y) of one polygon vertex.
        let coord = |vertex: usize, axis: usize| poly[vertex][axis].as_f64().unwrap() as f32;
        let corners: [(f32, f32); 4] = [
            (coord(0, 0), coord(0, 1)),
            (coord(1, 0), coord(1, 1)),
            (coord(2, 0), coord(2, 1)),
            (coord(3, 0), coord(3, 1)),
        ];

        // Part 1: the score of the box must exceed the detection threshold.
        let score = box_score(&prob, &corners);
        assert!(
            score > thresh,
            "box_score {score:.4} ≤ thresh {thresh} for poly {poly:?}"
        );

        // Part 2: unclipping must push the left-most x strictly further left.
        // Width comes from the first edge's x-span and height from the last
        // edge's y-span, each floored at 1.0.
        let [first, second, _, fourth] = corners;
        let rot = RotRect {
            corners,
            width: (second.0 - first.0).abs().max(1.0),
            height: (fourth.1 - first.1).abs().max(1.0),
        };
        let expanded = unclip_rect(&rot, unclip_ratio);
        let orig_min_x = corners.iter().map(|p| p.0).fold(f32::MAX, f32::min);
        let exp_min_x = expanded.iter().map(|p| p.0).fold(f32::MAX, f32::min);
        assert!(
            exp_min_x < orig_min_x,
            "unclip_rect must expand: orig_min_x={orig_min_x} exp_min_x={exp_min_x}"
        );
    }
}
}

View File

@@ -0,0 +1,516 @@
{
"dict_lines": 11945,
"rec_classes": 11947,
"blank_index": 0,
"space_index": 11946,
"mapping": "idx0=blank; idx 1..N=dict[idx-1]; idx N+1=space; classes=dict+2",
"rec_norm": "RGB, /255 then (x-0.5)/0.5 => [-1,1], height=48 keep-aspect pad",
"det_norm": "RGB, ImageNet mean/std *255 then /std, NCHW",
"rec_cases": [
{
"text": "RAG 시스템 검색 결과",
"decoded": "RAG시스템 검색 결과",
"cer": 0.0769,
"cer_nospace": 0.0,
"mapping_ok": true,
"T": 40,
"C": 11947,
"argmax_idx": [
0,
0,
11553,
0,
11536,
0,
0,
11542,
0,
0,
0,
6185,
0,
0,
6129,
0,
0,
9897,
0,
0,
11946,
0,
461,
0,
0,
0,
5654,
0,
11946,
0,
509,
0,
0,
0,
585,
0,
0,
0,
0,
0
],
"collapsed_idx": [
11553,
11536,
11542,
6185,
6129,
9897,
11946,
461,
5654,
11946,
509,
585
],
"collapsed_conf": [
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002
],
"fired_timesteps": [
2,
4,
7,
11,
14,
17,
20,
22,
26,
28,
30,
34
],
"fired_logit_top5": [
{
"t": 2,
"top5_idx": [
11553,
11583,
11551,
0,
11541
],
"top5_val": [
0.9998,
0.0001,
0.0,
0.0,
0.0
]
},
{
"t": 4,
"top5_idx": [
11536,
11566,
0,
11748,
11551
],
"top5_val": [
0.9998,
0.0001,
0.0,
0.0,
0.0
]
},
{
"t": 7,
"top5_idx": [
11542,
0,
11572,
11946,
11585
],
"top5_val": [
0.9994,
0.0004,
0.0001,
0.0001,
0.0
]
},
{
"t": 11,
"top5_idx": [
6185,
0,
11946,
7949,
11518
],
"top5_val": [
0.9993,
0.0003,
0.0001,
0.0001,
0.0
]
},
{
"t": 14,
"top5_idx": [
6129,
7893,
0,
9069,
11536
],
"top5_val": [
0.9997,
0.0002,
0.0,
0.0,
0.0
]
},
{
"t": 17,
"top5_idx": [
9897,
9882,
9889,
9785,
3429
],
"top5_val": [
0.9999,
0.0,
0.0,
0.0,
0.0
]
},
{
"t": 20,
"top5_idx": [
11946,
0,
11516,
11518,
11579
],
"top5_val": [
0.9026,
0.0971,
0.0002,
0.0001,
0.0
]
},
{
"t": 22,
"top5_idx": [
461,
462,
9281,
349,
0
],
"top5_val": [
0.9995,
0.0003,
0.0001,
0.0,
0.0
]
},
{
"t": 26,
"top5_idx": [
5654,
0,
5766,
8594,
6830
],
"top5_val": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
{
"t": 28,
"top5_idx": [
11946,
0,
11516,
11549,
11564
],
"top5_val": [
0.9422,
0.0576,
0.0001,
0.0,
0.0
]
},
{
"t": 30,
"top5_idx": [
509,
0,
453,
11946,
505
],
"top5_val": [
0.9994,
0.0004,
0.0001,
0.0,
0.0
]
},
{
"t": 34,
"top5_idx": [
585,
641,
0,
10329,
589
],
"top5_val": [
0.9999,
0.0,
0.0,
0.0,
0.0
]
}
]
},
{
"text": "Embedding vector 0123",
"decoded": "Embedding vector 0123",
"cer": 0.0,
"cer_nospace": 0.0,
"mapping_ok": true,
"T": 41,
"C": 11947,
"argmax_idx": [
0,
11540,
0,
0,
11578,
0,
0,
11567,
0,
11570,
0,
11569,
0,
11569,
0,
11574,
0,
11579,
11572,
11572,
11946,
0,
11587,
11570,
0,
11568,
0,
11585,
11580,
0,
11583,
11946,
11946,
11520,
0,
11521,
0,
11522,
0,
11523,
0
],
"collapsed_idx": [
11540,
11578,
11567,
11570,
11569,
11569,
11574,
11579,
11572,
11946,
11587,
11570,
11568,
11585,
11580,
11583,
11946,
11520,
11521,
11522,
11523
],
"collapsed_conf": [
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0001,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002
]
},
{
"text": "한글 OCR 정확도 테스트",
"decoded": "한글 OCR 정확도 테스트",
"cer": 0.0,
"cer_nospace": 0.0,
"mapping_ok": true,
"T": 41,
"C": 11947,
"argmax_idx": [
0,
0,
10921,
0,
0,
0,
845,
0,
11946,
0,
11550,
0,
0,
11538,
0,
11553,
0,
11946,
0,
7522,
0,
0,
11170,
0,
0,
0,
2321,
0,
11946,
11946,
9881,
0,
0,
0,
6129,
0,
0,
0,
10245,
0,
0
],
"collapsed_idx": [
10921,
845,
11946,
11550,
11538,
11553,
11946,
7522,
11170,
2321,
11946,
9881,
6129,
10245
],
"collapsed_conf": [
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002,
0.0002
]
}
],
"det_cases": [
{
"fixture": "clean_paragraph.png",
"orig_hw": [
192,
900
],
"det_input_hw": [
192,
896
],
"prob_shape": [
192,
896
],
"prob_max": 1.0,
"prob_mean": 0.1139,
"positives_at_0.3": 19682,
"positive_frac": 0.1144,
"box_count": 3,
"postproc": "thresh=0.3 -> findContours -> minAreaRect -> unclip(ratio=1.5, area*r/peri); box_thresh=0.5 mean-prob filter; coords scaled back to orig hw"
}
],
"blank_index_confirmed_by_gt": true
}

View File

@@ -0,0 +1,78 @@
{
"fixture": "clean_paragraph.png",
"orig_hw": [
192,
900
],
"det_input_hw": [
192,
896
],
"thresh": 0.3,
"unclip_ratio": 1.5,
"boxes": [
{
"poly": [
[
29,
135
],
[
615,
134
],
[
615,
149
],
[
29,
150
]
],
"score": 0.8724
},
{
"poly": [
[
30,
92
],
[
597,
92
],
[
597,
105
],
[
30,
105
]
],
"score": 0.9627
},
{
"poly": [
[
30,
47
],
[
509,
47
],
[
509,
60
],
[
30,
60
]
],
"score": 0.9304
}
]
}

View File

@@ -0,0 +1,145 @@
//! T11 e2e accuracy gate for the paddle-onnx OCR engine.
//!
//! Runs the full `OnnxPaddleOcr` pipeline (det → rectify → rec → CTC) over the
//! synthetic OCR benchmark fixtures and asserts the mean character error rate
//! (CER) over the clean text set is `<= 0.05`, matching the spec gate.
//!
//! Model assets come from `KEBAB_TEST_OCR_MODEL_DIR` (default: the crate's
//! bundled `assets/paddleocr-onnx/`). Fixtures come from
//! `KEBAB_TEST_OCR_FIXTURE_DIR` (default: the dogfood corpus). If either is
//! absent the test skips with a warning rather than failing — CI without the
//! large models / fixtures stays green (plan T0/M4).
use std::collections::HashMap;
use std::path::PathBuf;
use kebab_parse_image::{ModelPaths, OcrEngine, OnnxPaddleOcr};
/// Normalizes whitespace: leading/trailing whitespace is dropped and every
/// internal whitespace run becomes a single ASCII space. Mirrors the Python
/// `score_lib.norm` so the Rust gate and the bench harness agree.
fn norm(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // `split_whitespace` never yields an empty word, so a non-empty buffer
        // means at least one word has already been written.
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
/// Character error rate: the Levenshtein distance between the whitespace-
/// normalized ground truth and prediction, divided by the number of `char`s in
/// the normalized ground truth.
///
/// An empty ground truth yields `0.0` when the prediction is also empty and
/// `1.0` otherwise.
fn cer(gt: &str, pred: &str) -> f64 {
    let truth: Vec<char> = norm(gt).chars().collect();
    let guess: Vec<char> = norm(pred).chars().collect();
    if truth.is_empty() {
        return if guess.is_empty() { 0.0 } else { 1.0 };
    }

    let cols = guess.len();
    // `above` is the previous DP row, `row` the one being filled; the two
    // buffers are swapped after each ground-truth character.
    let mut above: Vec<usize> = (0..=cols).collect();
    let mut row = vec![0usize; cols + 1];
    for (i, &truth_ch) in truth.iter().enumerate() {
        row[0] = i + 1;
        for (j, &guess_ch) in guess.iter().enumerate() {
            let deletion = above[j + 1] + 1;
            let insertion = row[j] + 1;
            let substitution = above[j] + usize::from(truth_ch != guess_ch);
            row[j + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut above, &mut row);
    }
    above[cols] as f64 / truth.len() as f64
}
/// Directory holding the OCR benchmark fixtures: the value of
/// `KEBAB_TEST_OCR_FIXTURE_DIR` when that variable can be read, otherwise the
/// hard-coded dogfood corpus path.
fn fixture_dir() -> PathBuf {
    match std::env::var("KEBAB_TEST_OCR_FIXTURE_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => PathBuf::from("/build/dogfood/corpus/images/synthetic-ocr-bench"),
    }
}
/// T10: bytes that are not a decodable image must come back from `recognize`
/// as an `Err` whose rendered chain mentions "decoding image" — never a panic
/// or a bogus success. Skips with a notice when the model assets are missing.
#[test]
fn paddle_onnx_decode_failure_is_error() {
    let paths = ModelPaths::from_default_dir();
    let assets_present = paths.det.exists() && paths.rec.exists() && paths.dict.exists();
    if !assets_present {
        eprintln!("SKIP paddle_onnx_decode_failure_is_error: model assets not found");
        return;
    }

    let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600).unwrap();
    let outcome = engine.recognize(b"not a real image", None);
    let err = outcome.expect_err("garbage bytes must fail to decode");

    // `{:#}` renders the error together with its context chain on one line.
    let msg = format!("{err:#}");
    assert!(msg.contains("decoding image"), "unexpected error: {msg}");
}
/// End-to-end accuracy gate: runs the full engine over every fixture listed in
/// `gt.json`, prints a per-fixture CER report, and requires the mean CER over
/// the "clean" gate fixtures to be at most 0.05.
///
/// Skips (with a message on stderr) when the model assets or the fixture
/// ground-truth file are not present.
#[test]
fn paddle_onnx_cer_gate() {
    // Fixtures that count toward the hard gate — the standard, well-formed
    // text images. low_contrast / small_dense are intentionally hard: they are
    // reported but do not contribute to the gate.
    const GATE_SET: [&str; 5] = [
        "clean_paragraph.png",
        "title_body.png",
        "tech_terms.png",
        "korean_heavy.png",
        "numbers_table.png",
    ];

    let paths = ModelPaths::from_default_dir();
    let assets_present = paths.det.exists() && paths.rec.exists() && paths.dict.exists();
    if !assets_present {
        eprintln!(
            "SKIP paddle_onnx_cer_gate: model assets not found (det={}). \
             Set KEBAB_TEST_OCR_MODEL_DIR or place assets/paddleocr-onnx/.",
            paths.det.display()
        );
        return;
    }

    let fdir = fixture_dir();
    let gt_path = fdir.join("gt.json");
    if !gt_path.exists() {
        eprintln!(
            "SKIP paddle_onnx_cer_gate: fixtures not found at {}",
            fdir.display()
        );
        return;
    }

    let gt_json = std::fs::read_to_string(&gt_path).unwrap();
    let gt: HashMap<String, String> = serde_json::from_str(&gt_json).unwrap();
    let engine = OnnxPaddleOcr::from_paths(&paths, 0.3, 1.5, 1000, 1600)
        .expect("build OnnxPaddleOcr from bundled assets");

    // Visit fixtures in name order so the printed report is stable.
    let mut fixtures: Vec<(&String, &String)> = gt.iter().collect();
    fixtures.sort_by(|a, b| a.0.cmp(b.0));

    let dump_requested = || std::env::var("KEBAB_OCR_DUMP").is_ok();
    let mut gate_cers = Vec::new();
    println!("\n=== paddle-onnx CER per fixture ===");
    for (name, truth) in fixtures {
        let img_path = fdir.join(name);
        if !img_path.exists() {
            // Ground truth without a matching image file: nothing to score.
            continue;
        }
        let bytes = std::fs::read(&img_path).unwrap();

        let started = std::time::Instant::now();
        let out = engine.recognize(&bytes, None).expect("recognize");
        let elapsed = started.elapsed();

        let c = cer(truth, &out.joined);
        if dump_requested() {
            println!(" GT [{name}]: {:?}", norm(truth));
            println!(" OUT [{name}]: {:?}", norm(&out.joined));
        }

        let gated = GATE_SET.contains(&name.as_str());
        let marker = if gated { "[gate]" } else { " " };
        println!(
            "{:<22} CER={:.4} {} ({} regions, {} ms)",
            name,
            c,
            marker,
            out.regions.len(),
            elapsed.as_millis()
        );
        if gated {
            gate_cers.push(c);
        }
    }

    assert!(!gate_cers.is_empty(), "no gate fixtures were scored");
    let mean = gate_cers.iter().sum::<f64>() / gate_cers.len() as f64;
    println!("=== mean gate CER = {mean:.4} (threshold 0.05) ===\n");
    assert!(
        mean <= 0.05,
        "paddle-onnx mean CER {mean:.4} exceeds 0.05 gate"
    );
}

View File

@@ -20,7 +20,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab-
| 한국어 형태소분석 | `lindera-ko-dic` (FTS5 외부 tokenizer, v0.20.1) — 2자 이상 한국어 query 지원 |
| LLM | Ollama HTTP (default `gemma4:e4b` ─ OCR / caption 와 family 통일. 사용자가 더 큰 variant `gemma4:26b` 등으로 override 가능) |
| 음성 ASR | `whisper.cpp` (via `whisper-rs`) — P8 보류, 시스템 dep brainstorm 후 |
| OCR (image) | Ollama vision LM (default `gemma4:e4b`) `OcrEngine` trait 으로 Tesseract / Apple Vision 등 future swap (HOTFIXES P6-2) |
| OCR (image) | `OcrEngine` trait, 2 백엔드: **`ollama-vision`** (default, `gemma4:e4b`) / **`paddle-onnx`** (v0.27.0 — PP-OCRv5 ONNX in-process via `ort` =2.0.0-rc.9, DBNet det + CTC rec, 후처리 min-area rect/unclip pure-Rust, Python 런타임 0). engine 선택은 `[image.ocr] engine`, 팩토리는 `kebab-app::build_image_ocr_engine`. e2e CER 0.005 / 큰 페이지 <4초. (HOTFIXES P6-2, 2026-06-04) |
| OCR (PDF, v0.20.0+) | Ollama vision LM (default `qwen2.5vl:3b`) — post-extract enrichment via `kebab-app::pdf_ocr_apply` (H-1 resolution). DCTDecode-only v1 (FlateDecode/CCITTFax skip + warning). family asymmetry vs image OCR: PoC alnum 94.79% (qwen2.5vl) >> 27% (gemma4:e4b 받침), 본 단계에서 PDF OCR 만 qwen2.5vl. |
| Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) |
| RAG groundedness 검증 | `kebab-nli` 의 mDeBERTa-v3 XNLI 가 `(packed_chunks, generated_answer)` entailment 검사 (fb-41). `[rag] nli_threshold > 0` (default 0 = disabled, production 권장 0.5) 일 때 활성 — 미달 시 `refusal_reason = nli_verification_failed` (LLM self-judge ceiling 보완). 첫 호출 시 ~280 MB ONNX 자동 다운로드 |
@@ -212,7 +212,7 @@ kebab/
│ ├── kebab-rag/ # RAG pipeline (P4-3)
│ ├── kebab-nli/ # NLI verifier (mDeBERTa-v3 XNLI, fb-41 PR-9a/9b/9c-1)
│ ├── kebab-eval/ # golden query runner + metrics (P5-1, P5-2)
│ ├── kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6)
│ ├── kebab-parse-image/ # ImageExtractor + OCR (ollama-vision + paddle-onnx ONNX) + caption (P6)
│ ├── kebab-parse-pdf/ # lopdf per-page text extractor (P7-1)
│ ├── kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B), Go (P10-1C-Go), Java + Kotlin (P10-1C-JK — java.rs + kotlin.rs), C + C++ (P10-1D — c.rs + cpp.rs); chunker lives in kebab-chunk
│ ├── kebab-app/ # facade (P0 시그니처 + P3-5/P6-4/P7-3 본체). src/derivation_payload.rs = 캐시 payload 인코딩 (v0.21.0)

View File

@@ -358,6 +358,24 @@ lang_hint = "kor"
이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로.
**v0.27.0 — paddle-onnx 엔진 (오프라인, Ollama 불필요).** `[image.ocr] engine = "paddle-onnx"` 로 바꾸면 PP-OCRv5 ONNX 를 in-process 로 실행한다 (원격 vision LM 불필요, 큰 페이지 CPU <4초). embedding 까지 끄려면 `[models.embedding] provider = "none"` (lexical-only) 로 두면 Ollama 없이 OCR→FTS5 검색 전체 경로를 스모크할 수 있다:
```toml
[models.embedding]
provider = "none" # lexical-only — Ollama 불필요
[image.ocr]
enabled = true
engine = "paddle-onnx" # PP-OCRv5 ONNX in-process (Python/원격 0)
model = "ppocrv5-mobile-kor"
languages = ["kor", "eng"]
max_pixels = 1600
# det_model / rec_model / dict 로 번들 모델 경로 override 가능 (생략 시 번들 사용)
# score_thresh = 0.3 / unclip_ratio = 1.5 / max_boxes = 1000 으로 검출 튜닝
```
스모크: `kebab ingest --config <cfg>` 후 `kebab search --config <cfg> --mode lexical "<이미지 안 한국어 단어>"` 가 그 image chunk 를 반환하면 OCR→FTS5 wiring 정상. engine 또는 모델을 바꾸면 다음 ingest 가 영향 이미지를 자동 재색인한다.
## P7-3 PDF ingestion
`config.toml` 의 `[workspace] include` 에 `**/*.pdf` 를 추가하면 `kebab ingest` 가 텍스트 PDF 자산도 색인합니다. 외부 service 의존 없음 — `kebab-parse-pdf` 가 lopdf 로 페이지 단위 텍스트 추출, `kebab-chunk::PdfPageV1Chunker` 가 페이지 경계를 절대 넘지 않는 chunk 생성.

View File

@@ -0,0 +1,89 @@
# Plan: Rust 네이티브 OCR 엔진 (PP-OCRv5 ONNX) 구현
spec: `docs/superpowers/specs/2026-06-04-rust-native-ocr-spec.md`. 브랜치 `feat/rust-native-ocr`.
빌드 `CARGO_TARGET_DIR=/build/out/cargo-target`, 테스트 **`-j 8`**(절대 `-j 1` 금지), touched 크레이트 위주(`-p kebab-parse-image -p kebab-app -p kebab-config`).
참조 구현: `oar-ocr`(Apache-2.0) 소스 + Python PaddleOCR + 검증된 PoC `/build/cache/ocr-bench/{rust-poc,onnx,rc9-spike}/`(변환 ONNX + rc.9 동작 확인).
## Task 0a — 레퍼런스 골든 하네스 (C1 — 최우선 선행, executor 차단 제거)
**T3/T5 골든은 oar-ocr 로 못 만든다**(중간 텐서 미노출, PoC 는 최종텍스트만). 먼저 Python `onnxruntime` 직접(oar-ocr X)으로 변환 모델을 돌려 fixture 별 중간 산출을 골든으로 덤프:
- 입력: `/build/dogfood/corpus/images/synthetic-ocr-bench/` fixtures + 변환 ONNX(`/build/cache/ocr-bench/onnx/`).
- 덤프(JSON/npy, repo `crates/kebab-parse-image/tests/golden/`): (a) det 확률맵 슬라이스, (b) threshold 후 박스 폴리곤, (c) **rec 원시 logits `[T,C]`**, (d) 디코드 문자열, (e) 전처리 텐서 일부값.
- **M2 해결**: 알려진 텍스트라인 crop 의 logits + argmax 로 **blank 인덱스 + dict 11,945→클래스 11,947 매핑(+2 정체)을 경험적으로 도출**해 plan/주석에 사실로 기록(추정 금지). 경계문자(dict 첫/끝) 포함 골든.
- 도구: 기존 venv `/build/cache/ocr-bench/venv`(onnxruntime 직접 설치) 또는 paddleocr API 의 raw 단계. 하네스 스크립트는 `/build/cache/ocr-bench/` 에 보관(런타임 의존 아님, 골든 생성 전용).
- 수용: 각 fixture 골든 파일 생성 + blank 인덱스 문서화. 이후 T3~T5 가 이 골든에 핀.
## Task 0 — 모델 번들 (결정 C-1: include_bytes, release feature 게이트)
- 변환 ONNX(이미 존재: `/build/cache/ocr-bench/onnx/{ppocrv5_mobile_det.onnx, korean_ppocrv5_mobile_rec.onnx, korean_dict.txt}`)를 repo `crates/kebab-parse-image/assets/paddleocr-onnx/` 에 배치(+NOTICE, Apache-2.0).
- `bundled-ocr-models` cargo feature: on 이면 `include_bytes!` 로 임베드, off(dev 기본)면 config override 경로 필수. release 빌드는 feature on.
- 대안 C-2/C-3 는 빌드/링크 부담 측정 후 폴백(spec §모델 배포). 17MB 임베드의 dev 링크 영향 먼저 측정 — 과하면 C-2(repo 벤더 + OUT_DIR) 전환.
- **assets 17MB 커밋 방식 결정(M4/packaging)**: git-LFS 권장(clone/`cargo package` 비대 회피). `.gitattributes` 에 `*.onnx filter=lfs`. NOTICE(Apache-2.0) 동반.
- **테스트 모델 출처(M4)**: OCR 단위/e2e 테스트는 `bundled-ocr-models` feature 무관하게 `KEBAB_TEST_OCR_MODEL_DIR`(기본 `assets/paddleocr-onnx/`)에서 로드. 모델 없으면 `#[ignore]` 가 아니라 명확 skip+경고(CI 는 assets 존재 가정). dev 빌드 OCR 테스트가 모델 못 찾아 실패하는 모호함 제거.
- 수용: feature on 빌드 임베드 확인, off 빌드 정상, 테스트가 assets 에서 모델 로드.
## Task 1 — 의존성 (kebab-parse-image/Cargo.toml)
- `ort = { workspace = true, features = ["ndarray", "download-binaries"] }`(C1: 단독빌드 링크, nli 선례 주석). `ndarray = { workspace = true }`. `imageproc`(연결요소/윤곽).
- `ort-sys` caret 으로 rc.12 끌려가지 않게 Cargo.lock 정합 확인(rc.9 고정). unclip 다각형 offset 은 **pure-Rust 직접 구현**(clipper2 C++ FFI 회피 — spec).
- 수용: `cargo build -p kebab-parse-image -j 8` 링크 성공(onnxruntime), `cargo tree` 에 ort 단일 rc.9.
## Task 2 — OnnxPaddleOcr 골격 + 전처리 (kebab-parse-image)
- **선행 사실 확인**: rc.9 `ort::Session` 이 `Send+Sync` 인지 먼저 확인(아니면 Mutex 래핑). 결과를 주석에 기록.
- 신규 모듈 `paddle_onnx.rs`. `OcrEngine` 구현. **`engine_version`=생성 시 모델+dict blake3 1회 계산해 String 캐시**(m3: per-asset 재해시 금지 — `ingest_config_signature` 가 자산마다 호출). format 고정(후일 변경 시 mass 재색인 주의).
- det/rec `ort::Session` 2개 1회 로드 후 보관. **max_pixels 자체 bounds 적용**(spec 의 ocr.rs MIN/MAX clamp 은 Ollama private — paddle 은 자기 clamp 명시).
- 전처리: 디코드(image)→긴변 max_pixels 축소→BGR mean/std 정규화→`Array4<f32>`.
- 수용: 단위테스트 — 알려진 이미지→입력텐서 일부 값 골든(T0a).
## Task 3 — det 후처리 (단계 단위, 골든벡터)
- det Session 추론(`[1,1,H,W]` 확률맵, rc.9 `try_extract_tensor` → `ArrayViewD`) → threshold 0.3 이진화 → imageproc 연결요소/윤곽 → **min-area rotated-rect(rotating calipers 직접 구현)** → **unclip(pure-Rust 다각형 offset, ratio 1.5)** → 박스 Vec.
- 수용: 합성 fixture 기대 박스 개수/대략 좌표 골든. min-area rect·unclip 각각 단위테스트.
## Task 4 — crop + rectify
- 회전 박스 → perspective/affine warp 로 수평 정렬(oar-ocr 가 제공하던 부분 이식).
- 수용: 회전 텍스트 fixture → 정렬 crop 골든.
## Task 5 — rec + CTC decode
- crop→48×W 정규화→rec Session(`[1,T,C]`) → CTC greedy(argmax/timestep→연속중복 제거→blank 제거).
- **blank 인덱스 + 11,945→11,947 매핑은 T0a 하네스에서 도출한 사실을 사용**(추정 금지). bounds-check(dict 길이≠클래스 시 생성 에러).
- 수용: T0a 골든 logit→문자열 일치(blank/중복/**경계문자 dict 첫·끝** 포함).
## Task 6 — 조립 + OcrText
- 박스 reading-order(상→하,좌→우) → `OcrText{joined, regions:[OcrRegion{bbox,text,confidence}], engine, engine_version}`. per-region 실제 confidence(Ollama 상수1.0 대비 값 변화 — release note).
- 수용: e2e — 합성 한/영 fixture **CER ≤ 0.05**, bbox>0. PoC 0.976 baseline 대비 회귀 없음.
- **CER 게이트 실패 시 폴백 사다리(M3)**: ① T0a 단계 골든과 diff 해 어느 단계 divergence 인지 국소화 → ② det postproc(unclip/min-area rect)가 원인이면 **oar-ocr 의 해당 함수를 verbatim 이식**(Apache-2.0, NOTICE+파일별 출처 표기 — 코드 파생물) → ③ time-box(예 반나절) 초과 시 리더 escalate. 손수 재유도에 매몰 금지.
## Task 7 — config (kebab-config)
- `OcrCfg`: `engine` 값에 "paddle-onnx" 문서화(기본 "ollama-vision" 유지). 신규 override `det_model`/`rec_model`/`dict`(Option), `score_thresh`(0.3)/`unclip_ratio`(1.5)/`max_boxes`(1000). `KEBAB_IMAGE_OCR_*` env. serde default(forward-compat) + init 템플릿 노출.
- 수용: override 미지정→번들 모델, 지정→그 경로 사용 테스트. config migrate(#198) 무수정 로드 회귀.
## Task 8 — 엔진 팩토리 (kebab-app/lib.rs) — **4개 site 전부(M1)**
구체타입 `OllamaVisionOcr` 가 박힌 곳이 4군데 — 누락 시 타입에러로 막힘:
- `:360` image 엔진 생성 → `Box<dyn OcrEngine>` 팩토리(`match engine`: ollama-vision|paddle-onnx|err).
- `:379` pdf 엔진 생성 → 동일 팩토리.
- `:839` `ImagePipeline.ocr_engine` 필드 → `Option<&dyn OcrEngine>`.
- `:1113`, `:2096` `pdf_ocr_engine: Option<&OllamaVisionOcr>` 함수 시그니처 2곳 → `Option<&dyn OcrEngine>`.
- `apply_ocr_to_pdf_pages`(`pdf_ocr_apply.rs:93`)는 이미 `&dyn OcrEngine` — 스레딩만 변경, 헬퍼 불변. `--config` facade 스레딩(`OnnxPaddleOcr::new(cfg,…)`).
- 수용: 팩토리 단위테스트(선택/미지값 에러). **ollama-vision 경로 출력 동일** 회귀 테스트(구체→dyn 전환 무영향).
## Task 9 — 서명 cascade (C3, kebab-app)
- `ingest_config_signature` image/pdf 브랜치 `|ocr:1:{model}` → `|ocr:1:{engine}:{engine_version}`(engine + 모델/dict blake3).
- 수용: (a)ollama↔paddle 동일model→서명다름 (b)engine_version 다름→다름 (c)search 등 무관→불변. → 엔진/모델 변경 시 v0.26.2 자동 재색인.
## Task 10 — 에러 매트릭스 (spec §에러 처리)
- 다운로드/blake3 실패→fail-fast, 디코드불가→skip+provenance, det 0박스→`OcrText{"",[]}` 성공, rec 빈→박스skip, 박스폭증→max_boxes 절단+로그, dict 불일치→생성에러.
- 수용: 각 케이스 단위/통합 테스트.
## Task 11 — 검증 게이트
- `cargo clippy --workspace --all-targets -j 8 -- -D warnings` 0.
- `cargo test -p kebab-parse-image -p kebab-app -p kebab-config -j 8` 통과(+ `-p kebab-parse-image` 단독 링크 확인).
- 스모크: `engine="paddle-onnx"` 이미지 ingest→FTS5 hit, 큰 페이지 CPU <5초.
## Task 12 — 문서 + 버전 + 도그푸딩
- README(Configuration: `image.ocr.engine`+모델 번들), docs/SMOKE(config 예시), HANDOFF 1줄, docs/ARCHITECTURE(OCR 백엔드/그래프), HOTFIXES dated entry.
- Cargo.toml workspace version **minor bump**(+Cargo.lock). release notes(엔진 추가/per-region confidence/오프라인).
- 도그푸딩: 사용자 실제 이미지·책 스캔 정확도·속도 → HOTFIXES + release notes evidence.
- 결과 요약 `/tmp/rust-ocr-result.md`(게이트 + 스모크 + 도그푸딩 캡처).
## 리뷰 루프
완료 → 리더 clippy/타깃테스트(-j8) 독립 재확인 + paddle-onnx 스모크 → `gitea-pr`(title `feat(ocr): PP-OCRv5 ONNX Rust 네이티브 OCR 엔진`) → 리뷰 루프 → 사용자 머지. 모델 ONNX 는 release feature/asset 로 동반.
## 단계 의존
**T0a(레퍼런스 골든+blank 도출) 최우선 선행** → T0(번들),T1(deps) → T2→T3→T4→T5→T6(파이프라인 순차, 각 T0a 골든에 핀) ∥ T7(config) → T8(팩토리 4site)→T9(서명)→T10(에러) → T11 게이트 → T12 문서. T3~T5 가 핵심 난도(직접 이식), T0a 골든+T6 폴백사다리로 회귀·매몰 차단. T8 의 정확한 라인(:1113/:2096 등)은 구현 시점 grep 으로 재확인(코드 이동 가능).

View File

@@ -0,0 +1,192 @@
# Spec: Rust 네이티브 OCR 엔진 (PP-OCRv5 ONNX, in-process)
**날짜**: 2026-06-04
**유형**: feature (minor) — 신규 OCR 엔진 + config 키 + 동작 변화
**상태**: draft (self-review 대기)
**contract_sections**: design §6 (parse/extract), §8 (deps), §9 (versioning cascade)
## 동기
현재 이미지/PDF OCR 은 Ollama Vision LLM(`gemma4:e4b` 8B) 1콜(`crates/kebab-parse-image/src/ocr.rs`, `OllamaVisionOcr`). 사용자 실측 문제:
- 실제 이미지 한 장당 **~50초**(VLM 은 글자를 토큰 단위로 생성 → 조밀 페이지는 본질적으로 느림). 모델을 바꿔도(qwen2.5vl:3b GPU 20~28초) 사용자 허용치 미달.
- 사용자 결정: **배치 ingest 용도 + Python 의존 불가 + Rust 내장**.
### 근거 벤치 (2026-06-04, `/build/dogfood/logs/2026-06-04-ocr-model-bench.md`)
| 방식 | 작은 이미지 | 초대형 1757×2644 | 정확도 | 비고 |
|---|---|---|---|---|
| gemma4:e4b 8B VLM (GPU) | 11초 | 43초 | 0.65~0.82 | 현재 |
| qwen2.5vl:3b VLM (GPU) | 3.6초 | 20초 | 0.93 | 속도 미달 |
| **PP-OCRv5 mobile ONNX, Rust (CPU)** | **0.05초** | **2.75초** | **0.976** | **PoC 검증됨** |
VLM 은 생성 병목으로 탈락. **검출+인식형 전용 엔진(PP-OCRv5)을 ONNX 로 Rust in-process 실행**이 속도·정확도·한국어·단일바이너리 모두 만족. PoC: `oar-ocr` 0.6.3 + `ort` 로 위 수치 확인(오류는 띄어쓰기뿐, 한국어 오인식 0). PoC 코드/모델: `/build/cache/ocr-bench/{rust-poc,onnx}/`.
## 핵심 설계 결정: oar-ocr 미채택, 핀 ort 위 직접 구현
PoC 는 `oar-ocr` 0.6.3 으로 검증했으나 **프로덕션 의존성으로는 쓰지 않는다**. 이유(load-bearing):
- kebab 은 `ort = "=2.0.0-rc.9"` 를 **의도적 핀**(workspace `Cargo.toml:195-204`): fastembed 4.9 의 ONNX Runtime+tokenizer 스택을 워크스페이스 단일 버전으로 유지. `ndarray = "0.16"` 도 동일.
- `oar-ocr` 0.6.3 은 `ort 2.0.0-rc.12` + `ndarray 0.17` 요구. `ort` 의 `ort-sys` 가 onnxruntime 네이티브 라이브러리를 `links` 하므로 **두 버전 공존 불가** → oar-ocr 채택 시 ort/ndarray 를 bump 해야 하고, 이는 fastembed/kebab-nli/kebab-embed-candle 의 임베딩·NLI 스택을 흔든다(사용자 우선순위인 검색 품질 직결, [[search-quality-dogfood]]).
**→ PaddleOCR 의 전/후처리(검출 DBNet postproc + 인식 CTC decode)를 kebab 의 기존 핀 `ort`(rc.9) 위에 직접 구현.** oar-ocr(Apache-2.0) 소스 + Python PaddleOCR 을 레퍼런스로. 공유 ort 라 새 네이티브 의존성 0, 임베딩 스택 무영향.
### C2 검증 완료 (rc.9 스파이크, 2026-06-04)
PoC 는 oar-ocr 경유 ort **rc.12** 로 돌았으므로, 핀 **rc.9** 가 paddle2onnx 산출 모델을 실제 로드/추론하는지 별도 검증함(`/build/cache/ocr-bench/rc9-spike/`). 결과 **PASS**:
- `ort = "=2.0.0-rc.9"` + `ort-sys = "=2.0.0-rc.9"`(caret 으로 rc.12 끌려가는 것 방지 — kebab Cargo.lock 과 동일) + `ndarray 0.16` + feature `["ndarray","download-binaries"]` 로 빌드/링크/onnxruntime 다운로드 성공.
- det: 입력 `"x"` → 출력 `[1,1,640,640]`(DBNet 확률맵). rec: 출력 `[1,40,11947]`(timestep×클래스; dict 11,945 + blank/특수 = 11,947, CTC 정합 확인).
- `try_extract_tensor::<f32>()` 는 rc.9 에서 `ArrayViewD<f32>` 반환(rc.12 의 `(shape,&[T])` 와 다름) — 구현 시 유의.
- **함의**: 핀 ort 유지(ort/ndarray bump 불필요)로 임베딩 스택 무영향 확정. opset 호환 OK. 출력 형태가 후처리 설계(det threshold→박스 / rec CTC)와 일치.
### 추가 의존성
- `image`(이미 허용), `ndarray`(workspace `=0.16`), `ort`(workspace `=2.0.0-rc.9`, **features `["ndarray","download-binaries"]`**).
- **download-binaries 필수**: `kebab-parse-image` 는 fastembed 빌드그래프에 없어, 단독 빌드(`cargo test -p kebab-parse-image`)시 onnxruntime 링크 위해 명시 필요. `kebab-nli/Cargo.toml:23` 의 동일 선례 주석 그대로 따름.
- `ort-sys` 가 caret 으로 rc.12 로 끌려가지 않도록 workspace 핀과 Cargo.lock 정합 확인.
- `imageproc` — det 확률맵 연결요소/윤곽 추출. **단 min-area rotated-rect 는 imageproc 미제공 → rotating-calipers 직접 구현**.
- DBNet unclip(다각형 확장): **`clipper2` 는 C++ FFI 가능성 → single-binary/pure-Rust 위배 위험. 우선 pure-Rust 다각형 offset 직접 구현 또는 검증된 pure-Rust crate.** (plan 에서 clipper2 가 C++ 링크인지 확인 후 택일.)
## 파이프라인 (OnnxPaddleOcr)
`crates/kebab-parse-image/src/` 에 신규 모듈. `OcrEngine` trait(`ocr.rs:54`) 구현:
```rust
pub trait OcrEngine: Send + Sync {
fn engine_name(&self) -> &'static str; // "paddle-onnx"
fn engine_version(&self) -> String; // "ppocrv5-mobile-kor-v1" (+model hash)
fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result<OcrText>;
}
```
`recognize` 단계 (PoC 와 동일 알고리즘):
1. **디코드+다운스케일**: `image` 로 디코드 → 긴변 `max_pixels`(기본 1600) 로 축소(기존 `OcrCfg.max_pixels` 재사용, qwen 과 달리 PP-OCRv5 는 원본도 안전하나 속도 위해 유지).
2. **검출(det)**: BGR 정규화 → det ONNX(`PP-OCRv5_mobile_det`) → 확률맵 → threshold(0.3) 이진화 → 윤곽(imageproc) → min-area rect → unclip(ratio 1.5) → 텍스트 박스 N개.
3. **인식(rec)**: 각 박스 crop+회전보정 → 48×W 리사이즈/정규화 → rec ONNX(`korean_PP-OCRv5_mobile_rec`) → CTC greedy decode(dict 11,945자, blank 처리) → 텍스트+score.
4. **조립**: 박스를 reading-order(상→하, 좌→우) 정렬 → `OcrText { joined, regions: Vec<OcrRegion{bbox,text,confidence}>, engine, engine_version }`. **Ollama 와 달리 per-line bbox/confidence 제공**(`OcrRegion` 풍부화).
배치: PoC 는 박스별 순차 rec. 성능 충분(초대형 2.75초)하나, rec 를 ort 배치 입력으로 묶으면 추가 향상 가능(plan 에서 측정 후 결정).
### 단계별 분해 (M1 — 각 단계 골든벡터 단위테스트)
후처리가 실제 난도. "쉽다"로 뭉뚱그리지 않고 **각 단계를 독립 테스트 가능 단위**로 쪼갠다. 각 단위는 oar-ocr/Python PaddleOCR 이 **같은 fixture** 에 내는 출력을 골든벡터로 박아 단계별 회귀(0.976 baseline 대비)를 잡는다:
1. **전처리**(resize/pad/normalize): det 입력 정규화(mean/std, /255). 골든: 알려진 이미지→텐서 일부 값.
2. **det 후처리**: 확률맵(`[1,1,H,W]`)→threshold(0.3)→연결요소(imageproc)→**min-area rotated-rect(rotating calipers 직접 구현)**→**unclip(다각형 offset, ratio 1.5)**→박스. 골든: 합성 이미지의 기대 박스 개수/대략 좌표.
3. **crop+rectify**: 회전 박스→perspective/affine warp 로 수평 정렬(oar-ocr 가 공짜 제공하던 부분; 직접 구현 필요). 골든: 회전 텍스트 fixture.
4. **rec 전처리+추론**: crop→48×W 정규화→rec ONNX→`[1,T,C]` logits.
5. **CTC greedy decode**: argmax per timestep→연속중복 제거→blank(인덱스 0 또는 dict 길이 위치, **PaddleOCR 규약 정확 매칭**) 제거→dict 인덱스→char. dict 길이(11,945) vs rec 출력 클래스(11,947) 정합 + **인덱스 bounds-check**(잘못된 dict 길이/빈 줄 방어). 골든: 알려진 logit→문자열.
6. **box reading-order**: 상→하, 좌→우 정렬(가로쓰기 전제; 세로/회전 페이지는 비범위).
각 단계 divergence 를 end-to-end 가 아니라 단위에서 잡는다(M1 권고).
## Config
`OcrCfg`(`kebab-config/src/lib.rs:343`)에 `engine` 필드 **이미 존재**(기본 `"ollama-vision"`). 변경:
- `engine` 값에 `"paddle-onnx"` 추가(문서화). 기본값은 **당장 바꾸지 않음**(default 변경은 별도 결정 — 아래 "기본 엔진" 참조).
- 신규(선택) 필드: `det_model` / `rec_model` / `dict` 경로 override(미지정 시 자동 다운로드 캐시 경로). `score_thresh`(기본 0.3), `unclip_ratio`(기본 1.5) 는 고급 튜닝용(기본값 고정, 노출 최소).
- `pdf.ocr` 도 동일 `engine` 분기 적용(같은 trait).
### 모델 배포 — 결정 C: kebab 와 함께 번들 (HF 미사용, 사용자 확정 2026-06-04)
제3자(HF) 호스팅 의존 제거. 변환본(det 4.7MB + korean rec 13MB + dict ≈ **17MB**)을 kebab 자체에 번들. **구체 기법은 plan 에서 택1**(모두 HF/외부 네트워크 0):
- **C-1 바이너리 임베드(`include_bytes!`)**: 모델을 바이너리에 박음. 진정한 single-binary·완전 오프라인·재현성 100%. 비용: 릴리스 바이너리 +17MB, 그리고 dev/test 빌드마다 17MB 링크 부담 → **release feature(`bundled-ocr-models`) 게이트**로 dev 빌드 제외 가능. 로컬-first 철학 최적합.
- **C-2 repo 벤더링**: `assets/paddleocr-onnx/`(git 또는 git-LFS) 에 두고 빌드 시 `OUT_DIR` 복사 또는 런타임 상대경로. 바이너리 비대 회피하나 배포 시 파일 동반 필요.
- **C-3 gitea 릴리스 에셋 + 첫 실행 다운로드**: `gitea-release --asset` 로 첨부, 첫 실행 시 릴리스 URL 에서 `model_dir/paddleocr-onnx/` 로 받음. 바이너리 lean 하나 첫 실행 시 gitea 네트워크 필요(에어갭 불가) — 로컬-first 와 약간 상충.
**권장 = C-1(release feature 게이트)**: 오프라인·재현성·single-binary 가 kebab 정체성과 가장 정합. plan 에서 빌드/링크 영향 측정 후 확정.
- **무결성**: 임베드(C-1)면 빌드 시점 고정이라 별도 해시 불요(바이너리=정본). C-2/C-3 면 blake3 pin 필수.
- **라이선스**: PP-OCRv5 가중치 Apache-2.0 — 재배포 가능. 번들에 NOTICE 동반.
- **오프라인**: C-1 완전 오프라인. config override(`det_model`/`rec_model`/`dict`)로 로컬 모델 교체 항상 가능.
## 엔진 선택 (kebab-app 팩토리)
현재 `OllamaVisionOcr` 하드코딩(`kebab-app/src/lib.rs:360`(image), `379`(pdf)). 변경:
```rust
let ocr_engine: Option<Box<dyn OcrEngine>> = if cfg.image.ocr.enabled {
match cfg.image.ocr.engine.as_str() {
"ollama-vision" => Some(Box::new(OllamaVisionOcr::new(cfg)?)),
"paddle-onnx" => Some(Box::new(OnnxPaddleOcr::new(cfg)?)),
other => bail!("unknown image.ocr.engine: {other}"),
}
} else { None };
```
- `ImagePipeline.ocr_engine` 을 `Option<&'a dyn OcrEngine>` 로(현재 구체타입 `&OllamaVisionOcr`).
- pdf 경로 동일. `apply_ocr`/`apply_ocr_to_pdf_pages` 는 이미 `&dyn OcrEngine` 받음 → 변경 불필요.
- `OnnxPaddleOcr` 는 한 번 생성(모델 1회 로드) 후 ingest 전체에서 재사용(PoC 모델로드 58ms, 무시 가능).
## 버전/재색인 cascade
OCR 엔진 변경 시 **영향 자산 자동 재색인**되어야 함(v0.26.2 메커니즘). 현재 `ingest_config_signature`(`kebab-app/src/lib.rs:3036` 부근)의 image/pdf 브랜치는 `|ocr:1:{ocr.model}` 만 서명.
**C3 (필수, 권장 아님)**: paddle-onnx 브랜치에서 `model`("gemma4:e4b" 기본) 은 **미사용** — 실제 모델 정체성은 det/rec/dict + engine_version 에 있음. 따라서:
- 서명을 `|ocr:1:{engine}:{engine_version}` 로(엔진 + 모델/dict 식별자). `engine_version()`(spec 의 model+dict blake3 해시 포함, 라인 47)을 **반드시** 서명에 사용.
- 이유: ① `engine="ollama-vision"→"paddle-onnx"` 전환 시 model 이 기본값 그대로면 `{model}` 만으론 서명 불변 → **재색인 안 됨**(silent stale index, v0.26.2 가 없애려던 바로 그 버그). ② 모델 재변환/dict 수정 시 engine_version 변화로 재색인 트리거.
- 단위테스트(필수): (a) `ollama-vision` ↔ `paddle-onnx` 동일 model → 서명 다름. (b) 동일 engine, engine_version 다름 → 서명 다름. (c) 무관 설정(search 등) → 서명 불변.
## 기본 엔진 (default) — 별도 결정
본 spec 은 `paddle-onnx` 를 **선택 가능**하게만 한다. kebab 의 `image.ocr.engine` **기본값을 `paddle-onnx` 로 바꿀지**는 후속 결정:
- 바꾸면: 신규 사용자/기본 동작 변화 + 모델 다운로드 기본화. 강력하나 영향 큼.
- v1 은 기본 `ollama-vision` 유지, opt-in `paddle-onnx`. 도그푸딩 후 기본 전환을 별 PR 로. (사용자 본인 config 는 즉시 `paddle-onnx`.)
## 에러 처리 (M3 — 명시 매트릭스)
배치 ingest 가 미지의 사용자 스캔을 돈다. 각 케이스 동작 확정:
| 케이스 | 동작 | 근거 |
|---|---|---|
| 모델 다운로드 실패 | 엔진 생성 시 **fail-fast**(Ollama 와 동일, `lib.rs:360`) | 색인 시작 전 차단 |
| blake3 불일치 | fail-fast + 사유 | 무결성 |
| 디코드 불가 이미지 | **자산 skip + provenance 노트**(ingest 중단 X) | 기존 `apply_ocr` "skip vs surface" 계약(`ocr.rs:75`) |
| det 0 박스(빈 이미지 등) | **성공, `OcrText{joined:"", regions:[]}`**(에러 아님) | Ollama 빈줄 동작(`ocr.rs:290`) 미러 |
| rec 빈 출력(한 박스) | 그 박스 skip, 나머지 진행 | |
| 박스 폭증(노이즈 스캔) | **`max_boxes` 상한**(기본 예: 1000) 초과분 절단 + 로그 | 메모리/지연 cliff 방지 |
| dict 길이 ≠ rec 클래스 | 생성 시 에러(정합 검증) | bounds-check |
ort `Session` 은 생성 후 1회 로드·재사용. ingest 는 현재 직렬(`lib.rs:460`, rayon 없음)이라 동시접근 없음 — 단 `OcrEngine: Send+Sync` 유지(미래 병렬화 대비, rc.9 Session Send/Sync 확인은 plan).
## 검증 기준
- `cargo clippy --workspace --all-targets -j 8 -- -D warnings` 0.
- `cargo test -p kebab-parse-image -p kebab-app -j 8` 통과(touched 크레이트; `kebab-parse-image` 단독 빌드가 download-binaries 로 링크되는지 포함).
- 신규 단위테스트:
- 단계별 골든벡터(전처리/det후처리/CTC/박스정렬) — baseline 0.976 대비 단계 회귀 감지.
- OnnxPaddleOcr e2e: 합성 한/영 fixture → **CER ≤ 0.05**(=문자정확도 ≥95%), bbox>0. (단 합성 fixture 는 실코퍼스 회귀 미보장 → 도그푸딩 병행.)
- CTC decode: 알려진 logit→문자열(blank/중복 제거, bounds-check).
- 엔진 팩토리: `engine="paddle-onnx"`→OnnxPaddleOcr, 미지 값 에러.
- 서명(C3): 위 (a)(b)(c) 케이스.
- config override(`det_model`/`rec_model`/`dict`) 가 실제 사용됨 + **`--config` facade 스레딩**(CLAUDE.md facade rule, P3-5/P4-3 회귀 전례) — `OnnxPaddleOcr::new(cfg, …)` 가 explicit Config 받음.
- 회귀 가드: `engine="ollama-vision"`(기본) 경로 — 팩토리 리팩터(구체타입→`&dyn`) 후에도 **출력 동일** 핀하는 테스트.
- 스모크: `engine="paddle-onnx"` 이미지 ingest → OCR 텍스트 FTS5 hit. 큰 페이지 CPU <5초.
- 도그푸딩: 사용자 실제 이미지/책 스캔 정확도·속도(HOTFIXES + release notes).
## 의존성 규칙 (design §8)
`kebab-parse-image` allowed: kebab-core, kebab-config, serde, image, tracing, thiserror(task p6-2). 추가: `ort`(workspace, features `["ndarray","download-binaries"]`), `ndarray`(workspace), `imageproc`. **clipper2 미추가**(C++ FFI 회피 — unclip pure-Rust 직접). **hf-hub 미추가**(결정 C: 모델 번들, 외부 다운로드 0). **금지 유지**: kebab-store-*/embed-*/llm-* 미import. UI 크레이트 영향 없음.
## 비범위
- **OCR 텍스트→임베딩 갭**(현재 OCR 은 FTS5 lexical 전용, 벡터 미포함). 사용자 "OCR 모델만 먼저" → 별도 작업.
- **caption** 은 gemma 유지([[project_llm_default]]).
- **GPU provider**(ort CUDA/CoreML): CPU 로 충분(2.75초). 후속 옵션.
- **기본 엔진 전환**(default `paddle-onnx`): 도그푸딩 후 별 PR.
- 다국어 dict 동적 전환(현재 korean dict = 한+영+숫자+기호 11,945자로 한/영 충분).
## 잔여 노트 (critic minors)
- **max_pixels(m1)**: 기존 `[256,4096]` clamp 은 VLM 프롬프트 비용 기준. det/rec 엔진은 비용이 latency 라 trade-off 다름. v1 은 기본 1600 **유지(의도적)** — PoC 에서 1600 대 원본 정확도 차 미미, 속도 이점. plan 에서 paddle-onnx 전용 기본 재검토 가능.
- **config 마이그레이션(m3)**: 신규 키(`det_model` 등)는 serde default 로 forward-compat(기존 파일 무수정 로드). `kebab config migrate`(#198) 가 주석/순서 보존하며 신규 키 추가 — migration 핸들링 불필요(serde default), 단 init 템플릿에 신규 키 노출.
- **per-region confidence(open q)**: Ollama 는 region confidence 상수 1.0, paddle-onnx 는 실제 score. `OcrRegion` 형태 불변이라 wire 호환(값만 의미있어짐) — release note 1줄.
- **세로/회전 페이지**: 비범위(가로쓰기 reading-order 전제). 회전 박스 rectify 는 지원하나 페이지 전체 세로조판은 미지원 명시.
## 버전/문서
- feature(신규 engine 값 + 동작) → **minor bump**.
- README(Configuration: `image.ocr.engine`, 모델 첫 다운로드 안내), docs/SMOKE(config 예시), HANDOFF 1줄, docs/ARCHITECTURE(새 OCR 백엔드 추가 시 그래프/결정), HOTFIXES dated entry(도그푸딩 evidence). wire schema 불변(OcrText 내부, `--json` 표면 동일).

View File

@@ -14,6 +14,39 @@ historical contract that was implemented; this file accumulates the
deltas so phase 5+ readers can find the live behavior without diffing
git history.
## 2026-06-04 — PP-OCRv5 ONNX Rust 네이티브 OCR 엔진 (v0.27.0)
**무엇을 추가했나.** 이미지 OCR 에 두 번째 백엔드 `paddle-onnx` 를 붙였다. 기존 `ollama-vision`
(원격 vision LM, 이미지당 ~50초)은 default 로 유지하고, `[image.ocr] engine = "paddle-onnx"` 로 설정하면
PP-OCRv5(검출 DBNet + 인식 CTC) ONNX 모델을 `ort`(=2.0.0-rc.9) 로 **in-process** 실행한다 —
Python 런타임/원격 호출 없이 큰 페이지 CPU <4초. `OcrEngine` trait 의 두 번째 구현
`OnnxPaddleOcr`(`crates/kebab-parse-image/src/paddle_onnx.rs`), 팩토리는
`kebab-app::build_image_ocr_engine`/`build_pdf_ocr_engine` (`match engine`). 검출 후처리
(min-area rect = rotating calipers, unclip = polygon offset)는 clipper2/OpenCV 없이 pure-Rust.
**T11 e2e 에서 발견·수정한 핵심 버그 (unclip).** 첫 실측 CER 이 0.26(게이트 0.05) 으로 크게
초과. 단계 골든(`crates/kebab-parse-image/tests/golden/`) 와 prediction dump 로 국소화한 결과
`unclip_rect` 가 corner 를 centroid 기준 **방사(radial) 확장**하고 있었다. 텍스트 박스는
wide/short(예 586×15)라 대각선이 거의 수평 → 방사 확장 시 corner 가 수평으로만 ~11px 움직이고
**세로로는 거의 안 커져** 글자 윗/아랫부분이 잘렸다(ㄷ→ㄴ 로 `다`→`나`, ascender 손실).
PaddleOCR pyclipper 처럼 **edge 별로 바깥으로 offset**(width·height 각각 2·distance 증가) 하도록
rect 자체 (u,v) 축 기준 확장으로 재작성. 결과: mean gate CER **0.2585 → 0.0049**
(clean_paragraph/korean_heavy/numbers_table/tech_terms = 0.0), PoC 0.024 baseline 보다 우수.
큰 페이지 3.9초 < 5초 게이트. **교훈**: 회전 사각형 unclip 은 방사 확장이 아니라 polygon edge
offset 이어야 한다.
**Config / 서명 cascade.** `[image.ocr]` 에 `det_model`/`rec_model`/`dict`(Option, override) +
`score_thresh`(0.3)/`unclip_ratio`(1.5)/`max_boxes`(1000) serde-default 필드 + `KEBAB_IMAGE_OCR_*`
env 추가(기존 config 무수정 로드 — forward-compat). `ingest_config_signature` 의 image/pdf 브랜치를
`|ocr:1:{model}` 에서 `|ocr:1:{engine}:{engine_version}` 로 바꿔 engine 전환(ollama↔paddle) 또는
모델 변경 시 영향 자산 자동 재색인. paddle engine_version 은 모델 3-asset blake3 를 **per-process
1회만** 계산(triple 키 memo) — 자산마다 17MB 재해시 회피.
**모델 배포.** ONNX 2개(det 4.7MB / rec 13MB) + dict + NOTICE 를 `crates/kebab-parse-image/
assets/paddleocr-onnx/` 에 둔다(호스트에 git-lfs 가 없어 plain binary blob 으로 커밋 — `.gitattributes` 의 `*.onnx -text`; 추후 Git LFS 로 마이그레이션 가능). 테스트는 `KEBAB_IMAGE_OCR_MODEL_DIR`(기본 = 번들 dir)
에서 로드, e2e(`tests/paddle_e2e.rs`)는 모델/fixture 부재 시 깨끗이 skip(CI green). 자세한 설계:
spec/plan `docs/superpowers/{specs,plans}/2026-06-04-rust-native-ocr-*.md`.
## 2026-06-03 — ingest 출력 영향 설정 변경 시 영향 자산 자동 재색인 (v0.26.2)
**무엇이 깨졌나.** `[image.ocr]` / `[image.caption]` 를 off→색인→on 으로 바꿔도 증분