refactor(config): signature paddle 경로 미디어화 + 바이트 불변 골든
ocr_engine_version_for_sig 가 det/rec/dict 를 호출자(미디어별)로부터 받도록 인자화 — image 는 [ingest.image.ocr], pdf 는 [ingest.pdf.ocr]. v2 의 pdf↔image paddle 비대칭 제거. engine_version_for_paths 신설(kebab-parse-image). 출력 문자열은 값 기반이라 v2 와 바이트 동일(불변식 #1). test seam + 골든 추가. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -54,7 +54,7 @@ use kebab_core::{
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_image::{
|
||||
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
|
||||
apply_caption, apply_ocr, engine_version_for_config,
|
||||
apply_caption, apply_ocr, engine_version_for_paths,
|
||||
};
|
||||
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
@@ -3090,21 +3090,31 @@ static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
|
||||
std::sync::Mutex<std::collections::HashMap<String, String>>,
|
||||
> = std::sync::OnceLock::new();
|
||||
|
||||
/// T9: resolve the OCR `engine_version` string used inside the ingest config
|
||||
/// T9/v3: resolve the OCR `engine_version` string used inside the ingest config
|
||||
/// signature. ollama-vision is self-describing from `engine/model` (cheap, no
|
||||
/// I/O). paddle-onnx hashes the bundled/override model assets (memoized).
|
||||
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
|
||||
///
|
||||
/// v3: paddle 경로(det/rec/dict)는 **호출자가 미디어별로** 넘긴다 — image 는
|
||||
/// `[ingest.image.ocr]`, pdf 는 `[ingest.pdf.ocr]`. v2 의 "pdf 가 image paddle
|
||||
/// 을 빌려쓰던" 비대칭을 제거한다. 마이그레이션(T5)이 pdf 대칭 키를 image 값
|
||||
/// 으로 채우므로 미변환 v2 → v3 의 signature 는 바이트 동일하게 유지된다.
|
||||
fn ocr_engine_version_for_sig(
|
||||
engine: &str,
|
||||
model: &str,
|
||||
det: Option<&str>,
|
||||
rec: Option<&str>,
|
||||
dict: Option<&str>,
|
||||
) -> String {
|
||||
if engine != PADDLE_ONNX_ENGINE {
|
||||
// ollama-vision (and any non-paddle engine): the daemon exposes no
|
||||
// stable per-model revision, so engine/model is the identity.
|
||||
return format!("ollama/{model}");
|
||||
}
|
||||
let ocr = &config.ingest.image.ocr;
|
||||
let key = format!(
|
||||
"{}|{}|{}",
|
||||
ocr.det_model.as_deref().unwrap_or("<bundled>"),
|
||||
ocr.rec_model.as_deref().unwrap_or("<bundled>"),
|
||||
ocr.dict.as_deref().unwrap_or("<bundled>"),
|
||||
det.unwrap_or("<bundled>"),
|
||||
rec.unwrap_or("<bundled>"),
|
||||
dict.unwrap_or("<bundled>"),
|
||||
);
|
||||
let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
|
||||
if let Some(v) = memo.lock().unwrap().get(&key) {
|
||||
@@ -3114,7 +3124,7 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
|
||||
// ingest the engine was already built (fail-fast) so the assets are
|
||||
// present and this succeeds; the path-derived identity below is an
|
||||
// unreachable-in-practice guard that keeps the signature total.
|
||||
let version = engine_version_for_config(config).unwrap_or_else(|e| {
|
||||
let version = engine_version_for_paths(det, rec, dict).unwrap_or_else(|e| {
|
||||
tracing::warn!(
|
||||
target: "kebab-app::ingest",
|
||||
error = %e,
|
||||
@@ -3126,6 +3136,14 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
|
||||
version
|
||||
}
|
||||
|
||||
/// v3: signature 바이트 불변 골든을 위한 테스트 seam. `ingest_config_signature`
|
||||
/// 는 private 이라 통합 테스트에서 직접 못 부른다. 값 기반이라 struct 경로가
|
||||
/// 바뀌어도(미디어 ingest 통합) 출력 문자열은 v2 와 바이트 동일해야 한다.
|
||||
#[doc(hidden)]
|
||||
pub fn test_ingest_config_signature(c: &kebab_config::Config, m: &MediaType) -> String {
|
||||
ingest_config_signature(c, m)
|
||||
}
|
||||
|
||||
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
|
||||
// Common (every media type): chunking parameters that move chunk
|
||||
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
|
||||
@@ -3148,7 +3166,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
sig.push_str(&format!(
|
||||
"|ocr:1:{}:{}",
|
||||
ocr.engine,
|
||||
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
||||
ocr_engine_version_for_sig(
|
||||
&ocr.engine,
|
||||
&ocr.model,
|
||||
ocr.det_model.as_deref(),
|
||||
ocr.rec_model.as_deref(),
|
||||
ocr.dict.as_deref(),
|
||||
)
|
||||
));
|
||||
} else {
|
||||
sig.push_str("|ocr:0");
|
||||
@@ -3172,7 +3196,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
ocr.enabled,
|
||||
ocr.always_on,
|
||||
ocr.engine,
|
||||
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
||||
ocr_engine_version_for_sig(
|
||||
&ocr.engine,
|
||||
&ocr.model,
|
||||
ocr.det_model.as_deref(),
|
||||
ocr.rec_model.as_deref(),
|
||||
ocr.dict.as_deref(),
|
||||
)
|
||||
));
|
||||
} else {
|
||||
sig.push_str("|pdfocr:0");
|
||||
@@ -3739,7 +3769,7 @@ mod ingest_config_signature_tests {
|
||||
fn chunking_change_invalidates_all_types() {
|
||||
let base = Config::defaults();
|
||||
let mut bumped = base.clone();
|
||||
bumped.chunking.target_tokens += 100;
|
||||
bumped.ingest.chunking.target_tokens += 100;
|
||||
for m in [md(), img(), pdf(), code()] {
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &m),
|
||||
@@ -3749,14 +3779,14 @@ mod ingest_config_signature_tests {
|
||||
}
|
||||
|
||||
let mut overlap = base.clone();
|
||||
overlap.chunking.overlap_tokens += 10;
|
||||
overlap.ingest.chunking.overlap_tokens += 10;
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &md()),
|
||||
ingest_config_signature(&overlap, &md())
|
||||
);
|
||||
|
||||
let mut headings = base.clone();
|
||||
headings.chunking.respect_markdown_headings = !base.chunking.respect_markdown_headings;
|
||||
headings.ingest.chunking.respect_markdown_headings = !base.ingest.chunking.respect_markdown_headings;
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &md()),
|
||||
ingest_config_signature(&headings, &md())
|
||||
@@ -3768,9 +3798,9 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn image_ocr_toggle_invalidates_image_only() {
|
||||
let base = Config::defaults();
|
||||
assert!(!base.image.ocr.enabled, "default OCR is off");
|
||||
assert!(!base.ingest.image.ocr.enabled, "default OCR is off");
|
||||
let mut on = base.clone();
|
||||
on.image.ocr.enabled = true;
|
||||
on.ingest.image.ocr.enabled = true;
|
||||
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
@@ -3792,16 +3822,16 @@ mod ingest_config_signature_tests {
|
||||
fn image_ocr_model_matters_only_when_enabled() {
|
||||
let mut off_a = Config::defaults();
|
||||
let mut off_b = off_a.clone();
|
||||
off_b.image.ocr.model = "some-other-model".to_string();
|
||||
off_b.ingest.image.ocr.model = "some-other-model".to_string();
|
||||
assert_eq!(
|
||||
ingest_config_signature(&off_a, &img()),
|
||||
ingest_config_signature(&off_b, &img()),
|
||||
"OCR model is irrelevant while OCR is off"
|
||||
);
|
||||
|
||||
off_a.image.ocr.enabled = true;
|
||||
off_a.ingest.image.ocr.enabled = true;
|
||||
let mut on_b = off_a.clone();
|
||||
on_b.image.ocr.model = "some-other-model".to_string();
|
||||
on_b.ingest.image.ocr.model = "some-other-model".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&off_a, &img()),
|
||||
ingest_config_signature(&on_b, &img()),
|
||||
@@ -3814,14 +3844,14 @@ mod ingest_config_signature_tests {
|
||||
fn image_caption_toggle_and_prompt_invalidate_image() {
|
||||
let base = Config::defaults();
|
||||
let mut on = base.clone();
|
||||
on.image.caption.enabled = true;
|
||||
on.ingest.image.caption.enabled = true;
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
ingest_config_signature(&on, &img())
|
||||
);
|
||||
|
||||
let mut prompt = on.clone();
|
||||
prompt.image.caption.prompt_template_version = "caption-v9".to_string();
|
||||
prompt.ingest.image.caption.prompt_template_version = "caption-v9".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&on, &img()),
|
||||
ingest_config_signature(&prompt, &img()),
|
||||
@@ -3835,7 +3865,7 @@ mod ingest_config_signature_tests {
|
||||
fn pdf_ocr_toggle_invalidates_pdf_only() {
|
||||
let base = Config::defaults();
|
||||
let mut enabled = base.clone();
|
||||
enabled.pdf.ocr.enabled = true;
|
||||
enabled.ingest.pdf.ocr.enabled = true;
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &pdf()),
|
||||
ingest_config_signature(&enabled, &pdf()),
|
||||
@@ -3843,7 +3873,7 @@ mod ingest_config_signature_tests {
|
||||
);
|
||||
|
||||
let mut always = base.clone();
|
||||
always.pdf.ocr.always_on = true;
|
||||
always.ingest.pdf.ocr.always_on = true;
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &pdf()),
|
||||
ingest_config_signature(&always, &pdf()),
|
||||
@@ -3921,13 +3951,13 @@ mod ingest_config_signature_tests {
|
||||
// ui
|
||||
other.ui.theme = "light".to_string();
|
||||
// image runtime-only (non-output) knobs
|
||||
other.image.ocr.max_pixels += 100;
|
||||
other.image.ocr.languages.push("jpn".to_string());
|
||||
other.image.ocr.request_timeout_secs += 10;
|
||||
other.ingest.image.ocr.max_pixels += 100;
|
||||
other.ingest.image.ocr.languages.push("jpn".to_string());
|
||||
other.ingest.image.ocr.request_timeout_secs += 10;
|
||||
// pdf runtime-only knobs
|
||||
other.pdf.ocr.max_pixels += 100;
|
||||
other.pdf.ocr.request_timeout_secs += 10;
|
||||
other.pdf.ocr.languages.push("jpn".to_string());
|
||||
other.ingest.pdf.ocr.max_pixels += 100;
|
||||
other.ingest.pdf.ocr.request_timeout_secs += 10;
|
||||
other.ingest.pdf.ocr.languages.push("jpn".to_string());
|
||||
|
||||
for m in [md(), img(), pdf(), code()] {
|
||||
assert_eq!(
|
||||
@@ -3946,10 +3976,10 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn image_ocr_engine_switch_invalidates_image() {
|
||||
let mut ollama = Config::defaults();
|
||||
ollama.image.ocr.enabled = true;
|
||||
ollama.ingest.image.ocr.enabled = true;
|
||||
// same `model` string on both — only the engine differs
|
||||
let mut paddle = ollama.clone();
|
||||
paddle.image.ocr.engine = "paddle-onnx".to_string();
|
||||
paddle.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&ollama, &img()),
|
||||
ingest_config_signature(&paddle, &img()),
|
||||
@@ -3962,10 +3992,10 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn image_ocr_engine_version_change_invalidates_image() {
|
||||
let mut a = Config::defaults();
|
||||
a.image.ocr.enabled = true;
|
||||
a.image.ocr.model = "gemma4:e4b".to_string();
|
||||
a.ingest.image.ocr.enabled = true;
|
||||
a.ingest.image.ocr.model = "gemma4:e4b".to_string();
|
||||
let mut b = a.clone();
|
||||
b.image.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
b.ingest.image.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&a, &img()),
|
||||
ingest_config_signature(&b, &img()),
|
||||
@@ -3978,10 +4008,10 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn image_ocr_paddle_model_path_change_invalidates_image() {
|
||||
let mut base = Config::defaults();
|
||||
base.image.ocr.enabled = true;
|
||||
base.image.ocr.engine = "paddle-onnx".to_string();
|
||||
base.ingest.image.ocr.enabled = true;
|
||||
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
||||
let mut overridden = base.clone();
|
||||
overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
|
||||
overridden.ingest.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
|
||||
assert_ne!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
ingest_config_signature(&overridden, &img()),
|
||||
@@ -3994,11 +4024,11 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn paddle_image_signature_stable_for_unrelated_change() {
|
||||
let mut base = Config::defaults();
|
||||
base.image.ocr.enabled = true;
|
||||
base.image.ocr.engine = "paddle-onnx".to_string();
|
||||
base.ingest.image.ocr.enabled = true;
|
||||
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
||||
let mut other = base.clone();
|
||||
other.search.default_k += 3;
|
||||
other.image.ocr.max_pixels += 100; // runtime-only knob
|
||||
other.ingest.image.ocr.max_pixels += 100; // runtime-only knob
|
||||
assert_eq!(
|
||||
ingest_config_signature(&base, &img()),
|
||||
ingest_config_signature(&other, &img()),
|
||||
@@ -4010,9 +4040,9 @@ mod ingest_config_signature_tests {
|
||||
#[test]
|
||||
fn pdf_ocr_engine_switch_invalidates_pdf() {
|
||||
let mut ollama = Config::defaults();
|
||||
ollama.pdf.ocr.enabled = true;
|
||||
ollama.ingest.pdf.ocr.enabled = true;
|
||||
let mut paddle = ollama.clone();
|
||||
paddle.pdf.ocr.engine = "paddle-onnx".to_string();
|
||||
paddle.ingest.pdf.ocr.engine = "paddle-onnx".to_string();
|
||||
assert_ne!(
|
||||
ingest_config_signature(&ollama, &pdf()),
|
||||
ingest_config_signature(&paddle, &pdf()),
|
||||
|
||||
@@ -146,3 +146,24 @@ fn search_setting_change_reindexes_nothing() {
|
||||
assert_eq!(second.new, 0);
|
||||
assert_eq!(second.errors, 0);
|
||||
}
|
||||
|
||||
/// v3 불변식 #1: `ingest_config_signature` 출력 문자열은 값 기반이라 struct
|
||||
/// 경로 재편(미디어 ingest 통합) 후에도 v2 와 **바이트 동일**해야 한다. 깨지면
|
||||
/// 업그레이드 시 전체 재색인 발생. paddle-onnx image 분기 형식 골든.
|
||||
#[test]
|
||||
fn ingest_signature_image_paddle_byte_stable() {
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
cfg.ingest.image.ocr.enabled = true;
|
||||
cfg.ingest.image.ocr.engine = "paddle-onnx".into();
|
||||
let sig = kebab_app::test_ingest_config_signature(
|
||||
&cfg,
|
||||
&kebab_core::MediaType::Image(kebab_core::ImageType::Png),
|
||||
);
|
||||
// 골든: chunk:... |ocr:1:paddle-onnx:<engine_version> |cap:0
|
||||
assert!(
|
||||
sig.starts_with("chunk:500:80:true:md-heading-v1"),
|
||||
"chunk prefix drift: {sig}"
|
||||
);
|
||||
assert!(sig.contains("|ocr:1:paddle-onnx:"), "ocr token drift: {sig}");
|
||||
assert!(sig.ends_with("|cap:0"), "cap token drift: {sig}");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user