feat(config): add [pdf.ocr] section — qwen2.5vl:3b default, opt-in + env overrides + doc(app): PdfOcrOpts field doc (Step 4 I-1)
Step 5 (Group F) of v0.20.0 sub-item 1 (scanned PDF OCR) plan +
Step 4 reviewer Important I-1 fix (PdfOcrOpts field doc) 동봉.
F1 — `kebab-config::PdfCfg` + `PdfOcrCfg` + 4 default fn:
- PdfCfg { ocr: PdfOcrCfg }.
- PdfOcrCfg with 11 field (enabled/always_on/engine/model/endpoint/
languages/max_pixels/request_timeout_secs/valid_ratio_threshold/
min_char_count/lang_hint).
- defaults: opt-in (enabled=false), qwen2.5vl:3b, 0.5 threshold, 20 char.
- mirror of image OCR cfg pattern (spec §4.5).
Config struct extension:
- `pdf: PdfCfg` field with `#[serde(default = "PdfCfg::defaults")]`.
11 env var override (parallel to KEBAB_IMAGE_OCR_*):
KEBAB_PDF_OCR_{ENABLED,ALWAYS_ON,ENGINE,MODEL,ENDPOINT,LANGUAGES,
MAX_PIXELS,REQUEST_TIMEOUT_SECS,VALID_RATIO_THRESHOLD,MIN_CHAR_COUNT,
LANG_HINT}.
F2 — `crates/kebab-config/tests/pdf_ocr.rs` (신규):
- toml roundtrip (11 field).
- defaults (opt-in + qwen2.5vl:3b).
- env override (4 key sample + default preservation).
F3 (Step 4 I-1) — `pdf_ocr_apply.rs` 4 public item 의 doc comment:
- PdfOcrOpts struct + 6 field.
- PdfOcrSummary struct + 2 field.
- apply_ocr_to_pdf_pages fn (Errors block 포함).
- PdfOcrProgress enum + 2 variant + 5 field.
body 변경 0, doc-only.
spec: docs/superpowers/specs/2026-05-27-pdf-scanned-ocr-spec.md (§4.5)
plan: docs/superpowers/plans/2026-05-27-pdf-scanned-ocr-plan.md (Step 5 F1+F2)
prior: 9f003ef (Step 4) — code reviewer Important I-1 resolution
contract: §9 (additive minor wire bump — Step 7)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,11 +22,26 @@ use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
|
||||
pub struct PdfOcrOpts {
|
||||
/// Master switch. `false` short-circuits to
|
||||
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
|
||||
pub enabled: bool,
|
||||
/// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push).
|
||||
/// `false` → text-detect block 의 `min_char_count` 또는
|
||||
/// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate).
|
||||
pub always_on: bool,
|
||||
/// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가
|
||||
/// 본 임계 미만이면 OCR fallback. Default `0.5`.
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback.
|
||||
/// empty page (cover, blank separator) 자동 skip. Default `20`.
|
||||
pub min_char_count: u32,
|
||||
/// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`).
|
||||
/// `None` → no hint passed to engine.
|
||||
pub lang_hint: Option<Lang>,
|
||||
/// Optional per-page cancellation handle. checked at start of each page
|
||||
/// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4
|
||||
@@ -34,12 +49,34 @@ pub struct PdfOcrOpts {
|
||||
pub cancel: Option<Arc<AtomicBool>>,
|
||||
}
|
||||
|
||||
/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's
/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2).
///
/// `PdfOcrSummary::default()` is the all-zero summary
/// (`pages_ocrd: 0, ms_total: 0`).
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct PdfOcrSummary {
    /// Number of pages that actually went through the OCR pipeline
    /// (skipped pages are excluded).
    pub pages_ocrd: u32,
    /// Cumulative wall-clock duration of successful OCR engine calls (ms).
    ///
    /// A `u64` millisecond counter cannot realistically overflow
    /// (`u64::MAX` ms is hundreds of millions of years). The accumulation
    /// site is documented as using `saturating_add` — TODO confirm against
    /// the function body, which is not part of this struct.
    pub ms_total: u64,
}
|
||||
|
||||
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
|
||||
/// classifies each page via `text_quality::compute_valid_char_ratio` +
|
||||
/// `min_char_count`, and either:
|
||||
/// - skips (vector PDF + sufficient text + `always_on=false`),
|
||||
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
|
||||
/// (scanned/mojibake page), or
|
||||
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
|
||||
/// vector page).
|
||||
///
|
||||
/// Errors:
|
||||
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
|
||||
/// - lopdf re-parse failure → `Err(...)`.
|
||||
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
|
||||
/// event push + `emit_progress(Finished { skipped: true })` + continue
|
||||
/// (no `Err` propagation).
|
||||
///
|
||||
/// See spec §4.1 + §4.4 for the full pipeline.
|
||||
pub fn apply_ocr_to_pdf_pages<F>(
|
||||
canonical: &mut CanonicalDocument,
|
||||
engine: &dyn OcrEngine,
|
||||
@@ -233,12 +270,26 @@ fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
|
||||
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
|
||||
}
|
||||
|
||||
/// Per-page OCR progress event passed to the caller's `emit_progress`
/// closure. Step 6's `ingest_one_pdf_asset` carries these as
/// `IngestEvent::PdfOcrStarted` / `PdfOcrFinished` (spec §4.6.1 wire schema).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfOcrProgress {
    /// Emitted when OCR of a page starts, immediately before
    /// `engine.recognize` is called.
    Started {
        /// 1-based PDF page number.
        page: u32,
    },
    /// Emitted when OCR of a page ends (success, skip, and failure alike).
    Finished {
        /// 1-based PDF page number.
        page: u32,
        /// `engine.recognize` wall-clock duration. On skip paths the
        /// meaning is mixed: `0` when DCTDecode is absent, the actual
        /// latency before bailing out when the OCR engine fails.
        ms: u64,
        /// Char count of the OCR result text. `0` when skipped.
        chars: u32,
        /// `true` = skipped because DCTDecode is absent or the OCR engine
        /// failed. `false` = OCR completed normally.
        skipped: bool,
    },
}
|
||||
|
||||
@@ -50,6 +50,11 @@ pub struct Config {
|
||||
/// load cleanly with built-in defaults.
|
||||
#[serde(default)]
|
||||
pub ingest: IngestCfg,
|
||||
/// v0.20.0 sub-item 1: PDF ingest pipeline settings. `#[serde(default)]`
|
||||
/// so pre-v0.20 config files without a `[pdf]` section load with
|
||||
/// built-in defaults (OCR disabled — opt-in for scanned PDF KB).
|
||||
#[serde(default = "PdfCfg::defaults")]
|
||||
pub pdf: PdfCfg,
|
||||
/// p9-fb-05: directory of the on-disk config file this `Config`
|
||||
/// was loaded from, if any. Populated by `Config::from_file` /
|
||||
/// `Config::load` — never serialized (`#[serde(skip)]`). Used by
|
||||
@@ -392,6 +397,88 @@ impl CaptionCfg {
|
||||
}
|
||||
}
|
||||
|
||||
/// Settings for the PDF ingest pipeline (P7 + v0.20.0 sub-item 1).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct PdfCfg {
|
||||
#[serde(default = "PdfOcrCfg::defaults")]
|
||||
pub ocr: PdfOcrCfg,
|
||||
}
|
||||
|
||||
impl PdfCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self { ocr: PdfOcrCfg::defaults() }
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PdfCfg {
|
||||
fn default() -> Self { Self::defaults() }
|
||||
}
|
||||
|
||||
/// v0.20.0 sub-item 1: scanned PDF OCR via Ollama vision LLM. Default
|
||||
/// disabled — opt-in because OCR adds ~45-100s per scanned page on CPU
|
||||
/// (qwen2.5vl:3b, remote). Enable for book / paper scan KB.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct PdfOcrCfg {
|
||||
/// Run OCR on scanned PDF pages. Default `false` (opt-in).
|
||||
pub enabled: bool,
|
||||
/// `false` (default) — text-detect first + vision fallback on
|
||||
/// scanned pages only. `true` — vision LLM 호출 on every page
|
||||
/// (vector PDF 의 dual-text confidence boost — doubles chunk count).
|
||||
pub always_on: bool,
|
||||
/// Engine identifier. v1 only ships `"ollama-vision"`.
|
||||
pub engine: String,
|
||||
/// Vision model id. Default `"qwen2.5vl:3b"` per PoC (§3.5 family
|
||||
/// asymmetry vs image OCR's gemma4:e4b is acknowledged).
|
||||
pub model: String,
|
||||
/// HTTP endpoint. `None` → fall back to `models.llm.endpoint`.
|
||||
#[serde(default)]
|
||||
pub endpoint: Option<String>,
|
||||
/// BCP-47 language hints rendered into prompt.
|
||||
pub languages: Vec<String>,
|
||||
/// Long-edge cap (px). Larger images bloat prompt cost.
|
||||
pub max_pixels: u32,
|
||||
/// HTTP request timeout (sec). Same `0` = "fail immediately"
|
||||
/// semantics as `image.ocr.request_timeout_secs` (NOT a disable
|
||||
/// sentinel — see image.ocr docs).
|
||||
#[serde(default = "default_pdf_ocr_request_timeout_secs")]
|
||||
pub request_timeout_secs: u64,
|
||||
/// Valid char ratio threshold (0.0..=1.0). Page with ratio below
|
||||
/// this is classified as scanned/mojibake → OCR fallback. Default
|
||||
/// `0.5`.
|
||||
#[serde(default = "default_pdf_ocr_valid_ratio")]
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// Minimum char count per page below which page is auto-scanned.
|
||||
/// Default `20`.
|
||||
#[serde(default = "default_pdf_ocr_min_char_count")]
|
||||
pub min_char_count: u32,
|
||||
/// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
|
||||
#[serde(default = "default_pdf_ocr_lang_hint")]
|
||||
pub lang_hint: Option<String>,
|
||||
}
|
||||
|
||||
impl PdfOcrCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
enabled: false,
|
||||
always_on: false,
|
||||
engine: "ollama-vision".to_string(),
|
||||
model: "qwen2.5vl:3b".to_string(),
|
||||
endpoint: None,
|
||||
languages: vec!["eng".to_string(), "kor".to_string()],
|
||||
max_pixels: 2048,
|
||||
request_timeout_secs: default_pdf_ocr_request_timeout_secs(),
|
||||
valid_ratio_threshold: default_pdf_ocr_valid_ratio(),
|
||||
min_char_count: default_pdf_ocr_min_char_count(),
|
||||
lang_hint: default_pdf_ocr_lang_hint(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Default `[pdf.ocr]` HTTP request timeout, in seconds.
fn default_pdf_ocr_request_timeout_secs() -> u64 {
    600_u64
}

/// Default valid-char ratio threshold for `[pdf.ocr]`.
fn default_pdf_ocr_valid_ratio() -> f32 {
    0.5_f32
}

/// Default per-page minimum char count for `[pdf.ocr]`.
fn default_pdf_ocr_min_char_count() -> u32 {
    20_u32
}

/// Default single-page language hint for `[pdf.ocr]`.
fn default_pdf_ocr_lang_hint() -> Option<String> {
    let hint = String::from("kor");
    Some(hint)
}
|
||||
|
||||
/// p9-fb-14: TUI-only configuration. Currently a single `theme`
|
||||
/// selector (`"dark"` / `"light"`); future fields (custom role
|
||||
/// overrides, mode-machine cursor shapes, …) extend the same
|
||||
@@ -539,6 +626,7 @@ impl Config {
|
||||
image: ImageCfg::defaults(),
|
||||
ui: UiCfg::defaults(),
|
||||
ingest: IngestCfg::default(),
|
||||
pdf: PdfCfg::defaults(),
|
||||
// p9-fb-05: defaults are not loaded from disk, so no
|
||||
// source_dir. Relative `workspace.root` (rare with
|
||||
// defaults) falls back to caller `cwd` via the
|
||||
@@ -903,6 +991,45 @@ impl Config {
|
||||
self.image.caption.prompt_template_version = v.clone();
|
||||
}
|
||||
|
||||
// pdf.ocr (v0.20.0 sub-item 1)
|
||||
"KEBAB_PDF_OCR_ENABLED" => self.pdf.ocr.enabled = parse_bool(v),
|
||||
"KEBAB_PDF_OCR_ALWAYS_ON" => self.pdf.ocr.always_on = parse_bool(v),
|
||||
"KEBAB_PDF_OCR_ENGINE" => self.pdf.ocr.engine = v.clone(),
|
||||
"KEBAB_PDF_OCR_MODEL" => self.pdf.ocr.model = v.clone(),
|
||||
"KEBAB_PDF_OCR_ENDPOINT" => {
|
||||
self.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
"KEBAB_PDF_OCR_LANGUAGES" => {
|
||||
self.pdf.ocr.languages = v
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
}
|
||||
"KEBAB_PDF_OCR_MAX_PIXELS" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.pdf.ocr.max_pixels = n;
|
||||
}
|
||||
}
|
||||
"KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => {
|
||||
if let Ok(n) = v.parse::<u64>() {
|
||||
self.pdf.ocr.request_timeout_secs = n;
|
||||
}
|
||||
}
|
||||
"KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => {
|
||||
if let Ok(n) = v.parse::<f32>() {
|
||||
self.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
|
||||
}
|
||||
}
|
||||
"KEBAB_PDF_OCR_MIN_CHAR_COUNT" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.pdf.ocr.min_char_count = n;
|
||||
}
|
||||
}
|
||||
"KEBAB_PDF_OCR_LANG_HINT" => {
|
||||
self.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
|
||||
}
|
||||
|
||||
// Unknown KEBAB_* keys are silently ignored — see
|
||||
// `env_unknown_key_is_ignored` test.
|
||||
_ => {}
|
||||
|
||||
80
crates/kebab-config/tests/pdf_ocr.rs
Normal file
80
crates/kebab-config/tests/pdf_ocr.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
// crates/kebab-config/tests/pdf_ocr.rs
|
||||
//
|
||||
// Integration tests for [pdf.ocr] config section (v0.20.0 sub-item 1).
|
||||
|
||||
use std::collections::HashMap;
|
||||
use kebab_config::{Config, PdfCfg};
|
||||
|
||||
// Test 1: toml roundtrip — spec §4.5 line 1034-1047 example block.
|
||||
// Config requires many required fields; test the [pdf] section via PdfCfg wrapper.
|
||||
#[derive(serde::Deserialize)]
|
||||
struct PdfWrapper { pdf: PdfCfg }
|
||||
|
||||
#[test]
|
||||
fn pdf_ocr_toml_roundtrip() {
|
||||
let toml = r#"
|
||||
[pdf.ocr]
|
||||
enabled = true
|
||||
always_on = false
|
||||
engine = "ollama-vision"
|
||||
model = "qwen2.5vl:7b"
|
||||
endpoint = "http://192.168.0.47:11434"
|
||||
languages = ["eng", "kor"]
|
||||
max_pixels = 3072
|
||||
request_timeout_secs = 900
|
||||
valid_ratio_threshold = 0.6
|
||||
min_char_count = 30
|
||||
lang_hint = "kor"
|
||||
"#;
|
||||
let w: PdfWrapper = toml::from_str(toml).expect("parse toml");
|
||||
let ocr = &w.pdf.ocr;
|
||||
assert!(ocr.enabled);
|
||||
assert!(!ocr.always_on);
|
||||
assert_eq!(ocr.engine, "ollama-vision");
|
||||
assert_eq!(ocr.model, "qwen2.5vl:7b");
|
||||
assert_eq!(ocr.endpoint.as_deref(), Some("http://192.168.0.47:11434"));
|
||||
assert_eq!(ocr.languages, vec!["eng".to_string(), "kor".to_string()]);
|
||||
assert_eq!(ocr.max_pixels, 3072);
|
||||
assert_eq!(ocr.request_timeout_secs, 900);
|
||||
assert!((ocr.valid_ratio_threshold - 0.6).abs() < 1e-6);
|
||||
assert_eq!(ocr.min_char_count, 30);
|
||||
assert_eq!(ocr.lang_hint.as_deref(), Some("kor"));
|
||||
}
|
||||
|
||||
// Test 2: defaults — opt-in, qwen2.5vl:3b model, 0.5 threshold, 20 min_char.
#[test]
fn pdf_ocr_defaults_off_with_qwen_3b() {
    let cfg = Config::defaults();
    let ocr = &cfg.pdf.ocr;

    // Boolean switches: assert directly instead of comparing against a
    // literal (Clippy `bool_assert_comparison`).
    assert!(!ocr.enabled);
    assert!(!ocr.always_on);

    assert_eq!(ocr.engine, "ollama-vision");
    assert_eq!(ocr.model, "qwen2.5vl:3b");
    assert!(ocr.endpoint.is_none());
    assert_eq!(ocr.languages, vec!["eng".to_string(), "kor".to_string()]);
    assert_eq!(ocr.max_pixels, 2048);
    assert_eq!(ocr.request_timeout_secs, 600);
    assert!((ocr.valid_ratio_threshold - 0.5).abs() < 1e-6);
    assert_eq!(ocr.min_char_count, 20);
    assert_eq!(ocr.lang_hint.as_deref(), Some("kor"));
}
|
||||
|
||||
// Test 3: env var override — a typical case overriding 4 keys.
#[test]
fn pdf_ocr_env_overrides() {
    let mut env: HashMap<String, String> = HashMap::new();
    env.insert("KEBAB_PDF_OCR_ENABLED".to_string(), "true".to_string());
    env.insert("KEBAB_PDF_OCR_MODEL".to_string(), "qwen2.5vl:7b".to_string());
    env.insert("KEBAB_PDF_OCR_ALWAYS_ON".to_string(), "true".to_string());
    env.insert("KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD".to_string(), "0.75".to_string());

    let cfg = Config::defaults().apply_env(&env);

    // Overridden keys take the env values. Booleans are asserted directly
    // rather than compared against a literal (Clippy `bool_assert_comparison`).
    assert!(cfg.pdf.ocr.enabled);
    assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:7b");
    assert!(cfg.pdf.ocr.always_on);
    assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6);

    // Keys without an env var keep their defaults.
    assert_eq!(cfg.pdf.ocr.engine, "ollama-vision");
    assert_eq!(cfg.pdf.ocr.min_char_count, 20);
}
|
||||
Reference in New Issue
Block a user