From 3d45994693aea8c6c017bcc70c903f345b93819c Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 4 Jun 2026 12:44:27 +0000 Subject: [PATCH] =?UTF-8?q?refactor(config):=20signature=20paddle=20?= =?UTF-8?q?=EA=B2=BD=EB=A1=9C=20=EB=AF=B8=EB=94=94=EC=96=B4=ED=99=94=20+?= =?UTF-8?q?=20=EB=B0=94=EC=9D=B4=ED=8A=B8=20=EB=B6=88=EB=B3=80=20=EA=B3=A8?= =?UTF-8?q?=EB=93=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ocr_engine_version_for_sig 가 det/rec/dict 를 호출자(미디어별)로부터 받도록 인자화 — image 는 [ingest.image.ocr], pdf 는 [ingest.pdf.ocr]. v2 의 pdf↔image paddle 비대칭 제거. engine_version_for_paths 신설(kebab-parse-image). 출력 문자열은 값 기반이라 v2 와 바이트 동일(불변식 #1). test seam + 골든 추가. Co-Authored-By: Claude Opus 4.8 --- crates/kebab-app/src/lib.rs | 112 +++++++++++------- crates/kebab-app/tests/config_invalidation.rs | 21 ++++ crates/kebab-parse-image/src/lib.rs | 5 +- crates/kebab-parse-image/src/paddle_onnx.rs | 20 ++++ 4 files changed, 116 insertions(+), 42 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index c0e097b..8dcb638 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -54,7 +54,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_parse_image::{ OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, - apply_caption, apply_ocr, engine_version_for_config, + apply_caption, apply_ocr, engine_version_for_paths, }; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -3090,21 +3090,31 @@ static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock< std::sync::Mutex>, > = std::sync::OnceLock::new(); -/// T9: resolve the OCR `engine_version` string used inside the ingest config +/// T9/v3: resolve the OCR `engine_version` string used inside the ingest config /// signature. ollama-vision is self-describing from `engine/model` (cheap, no /// I/O). paddle-onnx hashes the bundled/override model assets (memoized). -fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String { +/// +/// v3: paddle 경로(det/rec/dict)는 **호출자가 미디어별로** 넘긴다 — image 는 +/// `[ingest.image.ocr]`, pdf 는 `[ingest.pdf.ocr]`. v2 의 "pdf 가 image paddle +/// 을 빌려쓰던" 비대칭을 제거한다. 마이그레이션(T5)이 pdf 대칭 키를 image 값 +/// 으로 채우므로 미변환 v2 → v3 의 signature 는 바이트 동일하게 유지된다. +fn ocr_engine_version_for_sig( + engine: &str, + model: &str, + det: Option<&str>, + rec: Option<&str>, + dict: Option<&str>, +) -> String { if engine != PADDLE_ONNX_ENGINE { // ollama-vision (and any non-paddle engine): the daemon exposes no // stable per-model revision, so engine/model is the identity. return format!("ollama/{model}"); } - let ocr = &config.ingest.image.ocr; let key = format!( "{}|{}|{}", - ocr.det_model.as_deref().unwrap_or(""), - ocr.rec_model.as_deref().unwrap_or(""), - ocr.dict.as_deref().unwrap_or(""), + det.unwrap_or(""), + rec.unwrap_or(""), + dict.unwrap_or(""), ); let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new())); if let Some(v) = memo.lock().unwrap().get(&key) { @@ -3114,7 +3124,7 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model // ingest the engine was already built (fail-fast) so the assets are // present and this succeeds; the path-derived identity below is an // unreachable-in-practice guard that keeps the signature total. - let version = engine_version_for_config(config).unwrap_or_else(|e| { + let version = engine_version_for_paths(det, rec, dict).unwrap_or_else(|e| { tracing::warn!( target: "kebab-app::ingest", error = %e, @@ -3126,6 +3136,14 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model version } +/// v3: signature 바이트 불변 골든을 위한 테스트 seam. `ingest_config_signature` +/// 는 private 이라 통합 테스트에서 직접 못 부른다. 값 기반이라 struct 경로가 +/// 바뀌어도(미디어 ingest 통합) 출력 문자열은 v2 와 바이트 동일해야 한다. +#[doc(hidden)] +pub fn test_ingest_config_signature(c: &kebab_config::Config, m: &MediaType) -> String { + ingest_config_signature(c, m) +} + fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String { // Common (every media type): chunking parameters that move chunk // boundaries. `target_tokens` / `overlap_tokens` change re-chunking for @@ -3148,7 +3166,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> sig.push_str(&format!( "|ocr:1:{}:{}", ocr.engine, - ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model) + ocr_engine_version_for_sig( + &ocr.engine, + &ocr.model, + ocr.det_model.as_deref(), + ocr.rec_model.as_deref(), + ocr.dict.as_deref(), + ) )); } else { sig.push_str("|ocr:0"); @@ -3172,7 +3196,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> ocr.enabled, ocr.always_on, ocr.engine, - ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model) + ocr_engine_version_for_sig( + &ocr.engine, + &ocr.model, + ocr.det_model.as_deref(), + ocr.rec_model.as_deref(), + ocr.dict.as_deref(), + ) )); } else { sig.push_str("|pdfocr:0"); @@ -3739,7 +3769,7 @@ mod ingest_config_signature_tests { fn chunking_change_invalidates_all_types() { let base = Config::defaults(); let mut bumped = base.clone(); - bumped.chunking.target_tokens += 100; + bumped.ingest.chunking.target_tokens += 100; for m in [md(), img(), pdf(), code()] { assert_ne!( ingest_config_signature(&base, &m), @@ -3749,14 +3779,14 @@ mod ingest_config_signature_tests { } let mut overlap = base.clone(); - overlap.chunking.overlap_tokens += 10; + overlap.ingest.chunking.overlap_tokens += 10; assert_ne!( ingest_config_signature(&base, &md()), ingest_config_signature(&overlap, &md()) ); let mut headings = base.clone(); - headings.chunking.respect_markdown_headings = !base.chunking.respect_markdown_headings; + headings.ingest.chunking.respect_markdown_headings = !base.ingest.chunking.respect_markdown_headings; assert_ne!( ingest_config_signature(&base, &md()), ingest_config_signature(&headings, &md()) @@ -3768,9 +3798,9 @@ mod ingest_config_signature_tests { #[test] fn image_ocr_toggle_invalidates_image_only() { let base = Config::defaults(); - assert!(!base.image.ocr.enabled, "default OCR is off"); + assert!(!base.ingest.image.ocr.enabled, "default OCR is off"); let mut on = base.clone(); - on.image.ocr.enabled = true; + on.ingest.image.ocr.enabled = true; assert_ne!( ingest_config_signature(&base, &img()), @@ -3792,16 +3822,16 @@ mod ingest_config_signature_tests { fn image_ocr_model_matters_only_when_enabled() { let mut off_a = Config::defaults(); let mut off_b = off_a.clone(); - off_b.image.ocr.model = "some-other-model".to_string(); + off_b.ingest.image.ocr.model = "some-other-model".to_string(); assert_eq!( ingest_config_signature(&off_a, &img()), ingest_config_signature(&off_b, &img()), "OCR model is irrelevant while OCR is off" ); - off_a.image.ocr.enabled = true; + off_a.ingest.image.ocr.enabled = true; let mut on_b = off_a.clone(); - on_b.image.ocr.model = "some-other-model".to_string(); + on_b.ingest.image.ocr.model = "some-other-model".to_string(); assert_ne!( ingest_config_signature(&off_a, &img()), ingest_config_signature(&on_b, &img()), @@ -3814,14 +3844,14 @@ mod ingest_config_signature_tests { fn image_caption_toggle_and_prompt_invalidate_image() { let base = Config::defaults(); let mut on = base.clone(); - on.image.caption.enabled = true; + on.ingest.image.caption.enabled = true; assert_ne!( ingest_config_signature(&base, &img()), ingest_config_signature(&on, &img()) ); let mut prompt = on.clone(); - prompt.image.caption.prompt_template_version = "caption-v9".to_string(); + prompt.ingest.image.caption.prompt_template_version = "caption-v9".to_string(); assert_ne!( ingest_config_signature(&on, &img()), ingest_config_signature(&prompt, &img()), @@ -3835,7 +3865,7 @@ mod ingest_config_signature_tests { fn pdf_ocr_toggle_invalidates_pdf_only() { let base = Config::defaults(); let mut enabled = base.clone(); - enabled.pdf.ocr.enabled = true; + enabled.ingest.pdf.ocr.enabled = true; assert_ne!( ingest_config_signature(&base, &pdf()), ingest_config_signature(&enabled, &pdf()), @@ -3843,7 +3873,7 @@ mod ingest_config_signature_tests { ); let mut always = base.clone(); - always.pdf.ocr.always_on = true; + always.ingest.pdf.ocr.always_on = true; assert_ne!( ingest_config_signature(&base, &pdf()), ingest_config_signature(&always, &pdf()), @@ -3921,13 +3951,13 @@ mod ingest_config_signature_tests { // ui other.ui.theme = "light".to_string(); // image runtime-only (non-output) knobs - other.image.ocr.max_pixels += 100; - other.image.ocr.languages.push("jpn".to_string()); - other.image.ocr.request_timeout_secs += 10; + other.ingest.image.ocr.max_pixels += 100; + other.ingest.image.ocr.languages.push("jpn".to_string()); + other.ingest.image.ocr.request_timeout_secs += 10; // pdf runtime-only knobs - other.pdf.ocr.max_pixels += 100; - other.pdf.ocr.request_timeout_secs += 10; - other.pdf.ocr.languages.push("jpn".to_string()); + other.ingest.pdf.ocr.max_pixels += 100; + other.ingest.pdf.ocr.request_timeout_secs += 10; + other.ingest.pdf.ocr.languages.push("jpn".to_string()); for m in [md(), img(), pdf(), code()] { assert_eq!( @@ -3946,10 +3976,10 @@ mod ingest_config_signature_tests { #[test] fn image_ocr_engine_switch_invalidates_image() { let mut ollama = Config::defaults(); - ollama.image.ocr.enabled = true; + ollama.ingest.image.ocr.enabled = true; // same `model` string on both — only the engine differs let mut paddle = ollama.clone(); - paddle.image.ocr.engine = "paddle-onnx".to_string(); + paddle.ingest.image.ocr.engine = "paddle-onnx".to_string(); assert_ne!( ingest_config_signature(&ollama, &img()), ingest_config_signature(&paddle, &img()), @@ -3962,10 +3992,10 @@ mod ingest_config_signature_tests { #[test] fn image_ocr_engine_version_change_invalidates_image() { let mut a = Config::defaults(); - a.image.ocr.enabled = true; - a.image.ocr.model = "gemma4:e4b".to_string(); + a.ingest.image.ocr.enabled = true; + a.ingest.image.ocr.model = "gemma4:e4b".to_string(); let mut b = a.clone(); - b.image.ocr.model = "qwen2.5vl:3b".to_string(); + b.ingest.image.ocr.model = "qwen2.5vl:3b".to_string(); assert_ne!( ingest_config_signature(&a, &img()), ingest_config_signature(&b, &img()), @@ -3978,10 +4008,10 @@ mod ingest_config_signature_tests { #[test] fn image_ocr_paddle_model_path_change_invalidates_image() { let mut base = Config::defaults(); - base.image.ocr.enabled = true; - base.image.ocr.engine = "paddle-onnx".to_string(); + base.ingest.image.ocr.enabled = true; + base.ingest.image.ocr.engine = "paddle-onnx".to_string(); let mut overridden = base.clone(); - overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string()); + overridden.ingest.image.ocr.det_model = Some("/some/other/det.onnx".to_string()); assert_ne!( ingest_config_signature(&base, &img()), ingest_config_signature(&overridden, &img()), @@ -3994,11 +4024,11 @@ mod ingest_config_signature_tests { #[test] fn paddle_image_signature_stable_for_unrelated_change() { let mut base = Config::defaults(); - base.image.ocr.enabled = true; - base.image.ocr.engine = "paddle-onnx".to_string(); + base.ingest.image.ocr.enabled = true; + base.ingest.image.ocr.engine = "paddle-onnx".to_string(); let mut other = base.clone(); other.search.default_k += 3; - other.image.ocr.max_pixels += 100; // runtime-only knob + other.ingest.image.ocr.max_pixels += 100; // runtime-only knob assert_eq!( ingest_config_signature(&base, &img()), ingest_config_signature(&other, &img()), @@ -4010,9 +4040,9 @@ mod ingest_config_signature_tests { #[test] fn pdf_ocr_engine_switch_invalidates_pdf() { let mut ollama = Config::defaults(); - ollama.pdf.ocr.enabled = true; + ollama.ingest.pdf.ocr.enabled = true; let mut paddle = ollama.clone(); - paddle.pdf.ocr.engine = "paddle-onnx".to_string(); + paddle.ingest.pdf.ocr.engine = "paddle-onnx".to_string(); assert_ne!( ingest_config_signature(&ollama, &pdf()), ingest_config_signature(&paddle, &pdf()), diff --git a/crates/kebab-app/tests/config_invalidation.rs b/crates/kebab-app/tests/config_invalidation.rs index a301f5b..237fdcf 100644 --- a/crates/kebab-app/tests/config_invalidation.rs +++ b/crates/kebab-app/tests/config_invalidation.rs @@ -146,3 +146,24 @@ fn search_setting_change_reindexes_nothing() { assert_eq!(second.new, 0); assert_eq!(second.errors, 0); } + +/// v3 불변식 #1: `ingest_config_signature` 출력 문자열은 값 기반이라 struct +/// 경로 재편(미디어 ingest 통합) 후에도 v2 와 **바이트 동일**해야 한다. 깨지면 +/// 업그레이드 시 전체 재색인 발생. paddle-onnx image 분기 형식 골든. +#[test] +fn ingest_signature_image_paddle_byte_stable() { + let mut cfg = kebab_config::Config::defaults(); + cfg.ingest.image.ocr.enabled = true; + cfg.ingest.image.ocr.engine = "paddle-onnx".into(); + let sig = kebab_app::test_ingest_config_signature( + &cfg, + &kebab_core::MediaType::Image(kebab_core::ImageType::Png), + ); + // 골든: chunk:... |ocr:1:paddle-onnx: |cap:0 + assert!( + sig.starts_with("chunk:500:80:true:md-heading-v1"), + "chunk prefix drift: {sig}" + ); + assert!(sig.contains("|ocr:1:paddle-onnx:"), "ocr token drift: {sig}"); + assert!(sig.ends_with("|cap:0"), "cap token drift: {sig}"); +} diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index 177724e..422d010 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -34,7 +34,10 @@ pub mod paddle_onnx; pub use caption::{apply_caption, caption_image}; pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr}; -pub use paddle_onnx::{ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config}; +pub use paddle_onnx::{ + ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config, + engine_version_for_paths, +}; use anyhow::{Context, Result}; use kebab_core::{ diff --git a/crates/kebab-parse-image/src/paddle_onnx.rs b/crates/kebab-parse-image/src/paddle_onnx.rs index 6738db0..3fda464 100644 --- a/crates/kebab-parse-image/src/paddle_onnx.rs +++ b/crates/kebab-parse-image/src/paddle_onnx.rs @@ -474,6 +474,26 @@ pub fn engine_version_for_config(config: &kebab_config::Config) -> Result, + rec: Option<&str>, + dict: Option<&str>, +) -> Result { + let defaults = ModelPaths::from_default_dir(); + let paths = ModelPaths { + det: det.map(PathBuf::from).unwrap_or(defaults.det), + rec: rec.map(PathBuf::from).unwrap_or(defaults.rec), + dict: dict.map(PathBuf::from).unwrap_or(defaults.dict), + }; + compute_engine_version(&paths) +} + /// blake3 over det + rec + dict bytes → stable `engine_version`. fn compute_engine_version(paths: &ModelPaths) -> Result { let mut hasher = blake3::Hasher::new();