feat(config): config.toml v2→v3 스키마 재편 — 미디어 [ingest.*] 통합 + 무손실 자동 마이그레이션 #207

Merged
altair823 merged 12 commits from feat/config-schema-reorg into main 2026-06-04 14:36:44 +00:00
4 changed files with 116 additions and 42 deletions
Showing only changes of commit 3d45994693 - Show all commits

View File

@@ -54,7 +54,7 @@ use kebab_core::{
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
apply_caption, apply_ocr, engine_version_for_config,
apply_caption, apply_ocr, engine_version_for_paths,
};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -3090,21 +3090,31 @@ static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
std::sync::Mutex<std::collections::HashMap<String, String>>,
> = std::sync::OnceLock::new();
/// T9: resolve the OCR `engine_version` string used inside the ingest config
/// T9/v3: resolve the OCR `engine_version` string used inside the ingest config
/// signature. ollama-vision is self-describing from `engine/model` (cheap, no
/// I/O). paddle-onnx hashes the bundled/override model assets (memoized).
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
///
/// v3: paddle 경로(det/rec/dict)는 **호출자가 미디어별로** 넘긴다 — image 는
/// `[ingest.image.ocr]`, pdf 는 `[ingest.pdf.ocr]`. v2 의 "pdf 가 image paddle
/// 을 빌려쓰던" 비대칭을 제거한다. 마이그레이션(T5)이 pdf 대칭 키를 image 값
/// 으로 채우므로 미변환 v2 → v3 의 signature 는 바이트 동일하게 유지된다.
fn ocr_engine_version_for_sig(
engine: &str,
model: &str,
det: Option<&str>,
rec: Option<&str>,
dict: Option<&str>,
) -> String {
if engine != PADDLE_ONNX_ENGINE {
// ollama-vision (and any non-paddle engine): the daemon exposes no
// stable per-model revision, so engine/model is the identity.
return format!("ollama/{model}");
}
let ocr = &config.ingest.image.ocr;
let key = format!(
"{}|{}|{}",
ocr.det_model.as_deref().unwrap_or("<bundled>"),
ocr.rec_model.as_deref().unwrap_or("<bundled>"),
ocr.dict.as_deref().unwrap_or("<bundled>"),
det.unwrap_or("<bundled>"),
rec.unwrap_or("<bundled>"),
dict.unwrap_or("<bundled>"),
);
let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
if let Some(v) = memo.lock().unwrap().get(&key) {
@@ -3114,7 +3124,7 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
// ingest the engine was already built (fail-fast) so the assets are
// present and this succeeds; the path-derived identity below is an
// unreachable-in-practice guard that keeps the signature total.
let version = engine_version_for_config(config).unwrap_or_else(|e| {
let version = engine_version_for_paths(det, rec, dict).unwrap_or_else(|e| {
tracing::warn!(
target: "kebab-app::ingest",
error = %e,
@@ -3126,6 +3136,14 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
version
}
/// v3: signature 바이트 불변 골든을 위한 테스트 seam. `ingest_config_signature`
/// 는 private 이라 통합 테스트에서 직접 못 부른다. 값 기반이라 struct 경로가
/// 바뀌어도(미디어 ingest 통합) 출력 문자열은 v2 와 바이트 동일해야 한다.
#[doc(hidden)]
pub fn test_ingest_config_signature(c: &kebab_config::Config, m: &MediaType) -> String {
ingest_config_signature(c, m)
}
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
// Common (every media type): chunking parameters that move chunk
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
@@ -3148,7 +3166,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
sig.push_str(&format!(
"|ocr:1:{}:{}",
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
ocr_engine_version_for_sig(
&ocr.engine,
&ocr.model,
ocr.det_model.as_deref(),
ocr.rec_model.as_deref(),
ocr.dict.as_deref(),
)
));
} else {
sig.push_str("|ocr:0");
@@ -3172,7 +3196,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
ocr.enabled,
ocr.always_on,
ocr.engine,
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
ocr_engine_version_for_sig(
&ocr.engine,
&ocr.model,
ocr.det_model.as_deref(),
ocr.rec_model.as_deref(),
ocr.dict.as_deref(),
)
));
} else {
sig.push_str("|pdfocr:0");
@@ -3739,7 +3769,7 @@ mod ingest_config_signature_tests {
fn chunking_change_invalidates_all_types() {
let base = Config::defaults();
let mut bumped = base.clone();
bumped.chunking.target_tokens += 100;
bumped.ingest.chunking.target_tokens += 100;
for m in [md(), img(), pdf(), code()] {
assert_ne!(
ingest_config_signature(&base, &m),
@@ -3749,14 +3779,14 @@ mod ingest_config_signature_tests {
}
let mut overlap = base.clone();
overlap.chunking.overlap_tokens += 10;
overlap.ingest.chunking.overlap_tokens += 10;
assert_ne!(
ingest_config_signature(&base, &md()),
ingest_config_signature(&overlap, &md())
);
let mut headings = base.clone();
headings.chunking.respect_markdown_headings = !base.chunking.respect_markdown_headings;
headings.ingest.chunking.respect_markdown_headings = !base.ingest.chunking.respect_markdown_headings;
assert_ne!(
ingest_config_signature(&base, &md()),
ingest_config_signature(&headings, &md())
@@ -3768,9 +3798,9 @@ mod ingest_config_signature_tests {
#[test]
fn image_ocr_toggle_invalidates_image_only() {
let base = Config::defaults();
assert!(!base.image.ocr.enabled, "default OCR is off");
assert!(!base.ingest.image.ocr.enabled, "default OCR is off");
let mut on = base.clone();
on.image.ocr.enabled = true;
on.ingest.image.ocr.enabled = true;
assert_ne!(
ingest_config_signature(&base, &img()),
@@ -3792,16 +3822,16 @@ mod ingest_config_signature_tests {
fn image_ocr_model_matters_only_when_enabled() {
let mut off_a = Config::defaults();
let mut off_b = off_a.clone();
off_b.image.ocr.model = "some-other-model".to_string();
off_b.ingest.image.ocr.model = "some-other-model".to_string();
assert_eq!(
ingest_config_signature(&off_a, &img()),
ingest_config_signature(&off_b, &img()),
"OCR model is irrelevant while OCR is off"
);
off_a.image.ocr.enabled = true;
off_a.ingest.image.ocr.enabled = true;
let mut on_b = off_a.clone();
on_b.image.ocr.model = "some-other-model".to_string();
on_b.ingest.image.ocr.model = "some-other-model".to_string();
assert_ne!(
ingest_config_signature(&off_a, &img()),
ingest_config_signature(&on_b, &img()),
@@ -3814,14 +3844,14 @@ mod ingest_config_signature_tests {
fn image_caption_toggle_and_prompt_invalidate_image() {
let base = Config::defaults();
let mut on = base.clone();
on.image.caption.enabled = true;
on.ingest.image.caption.enabled = true;
assert_ne!(
ingest_config_signature(&base, &img()),
ingest_config_signature(&on, &img())
);
let mut prompt = on.clone();
prompt.image.caption.prompt_template_version = "caption-v9".to_string();
prompt.ingest.image.caption.prompt_template_version = "caption-v9".to_string();
assert_ne!(
ingest_config_signature(&on, &img()),
ingest_config_signature(&prompt, &img()),
@@ -3835,7 +3865,7 @@ mod ingest_config_signature_tests {
fn pdf_ocr_toggle_invalidates_pdf_only() {
let base = Config::defaults();
let mut enabled = base.clone();
enabled.pdf.ocr.enabled = true;
enabled.ingest.pdf.ocr.enabled = true;
assert_ne!(
ingest_config_signature(&base, &pdf()),
ingest_config_signature(&enabled, &pdf()),
@@ -3843,7 +3873,7 @@ mod ingest_config_signature_tests {
);
let mut always = base.clone();
always.pdf.ocr.always_on = true;
always.ingest.pdf.ocr.always_on = true;
assert_ne!(
ingest_config_signature(&base, &pdf()),
ingest_config_signature(&always, &pdf()),
@@ -3921,13 +3951,13 @@ mod ingest_config_signature_tests {
// ui
other.ui.theme = "light".to_string();
// image runtime-only (non-output) knobs
other.image.ocr.max_pixels += 100;
other.image.ocr.languages.push("jpn".to_string());
other.image.ocr.request_timeout_secs += 10;
other.ingest.image.ocr.max_pixels += 100;
other.ingest.image.ocr.languages.push("jpn".to_string());
other.ingest.image.ocr.request_timeout_secs += 10;
// pdf runtime-only knobs
other.pdf.ocr.max_pixels += 100;
other.pdf.ocr.request_timeout_secs += 10;
other.pdf.ocr.languages.push("jpn".to_string());
other.ingest.pdf.ocr.max_pixels += 100;
other.ingest.pdf.ocr.request_timeout_secs += 10;
other.ingest.pdf.ocr.languages.push("jpn".to_string());
for m in [md(), img(), pdf(), code()] {
assert_eq!(
@@ -3946,10 +3976,10 @@ mod ingest_config_signature_tests {
#[test]
fn image_ocr_engine_switch_invalidates_image() {
let mut ollama = Config::defaults();
ollama.image.ocr.enabled = true;
ollama.ingest.image.ocr.enabled = true;
// same `model` string on both — only the engine differs
let mut paddle = ollama.clone();
paddle.image.ocr.engine = "paddle-onnx".to_string();
paddle.ingest.image.ocr.engine = "paddle-onnx".to_string();
assert_ne!(
ingest_config_signature(&ollama, &img()),
ingest_config_signature(&paddle, &img()),
@@ -3962,10 +3992,10 @@ mod ingest_config_signature_tests {
#[test]
fn image_ocr_engine_version_change_invalidates_image() {
let mut a = Config::defaults();
a.image.ocr.enabled = true;
a.image.ocr.model = "gemma4:e4b".to_string();
a.ingest.image.ocr.enabled = true;
a.ingest.image.ocr.model = "gemma4:e4b".to_string();
let mut b = a.clone();
b.image.ocr.model = "qwen2.5vl:3b".to_string();
b.ingest.image.ocr.model = "qwen2.5vl:3b".to_string();
assert_ne!(
ingest_config_signature(&a, &img()),
ingest_config_signature(&b, &img()),
@@ -3978,10 +4008,10 @@ mod ingest_config_signature_tests {
#[test]
fn image_ocr_paddle_model_path_change_invalidates_image() {
let mut base = Config::defaults();
base.image.ocr.enabled = true;
base.image.ocr.engine = "paddle-onnx".to_string();
base.ingest.image.ocr.enabled = true;
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
let mut overridden = base.clone();
overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
overridden.ingest.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
assert_ne!(
ingest_config_signature(&base, &img()),
ingest_config_signature(&overridden, &img()),
@@ -3994,11 +4024,11 @@ mod ingest_config_signature_tests {
#[test]
fn paddle_image_signature_stable_for_unrelated_change() {
let mut base = Config::defaults();
base.image.ocr.enabled = true;
base.image.ocr.engine = "paddle-onnx".to_string();
base.ingest.image.ocr.enabled = true;
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
let mut other = base.clone();
other.search.default_k += 3;
other.image.ocr.max_pixels += 100; // runtime-only knob
other.ingest.image.ocr.max_pixels += 100; // runtime-only knob
assert_eq!(
ingest_config_signature(&base, &img()),
ingest_config_signature(&other, &img()),
@@ -4010,9 +4040,9 @@ mod ingest_config_signature_tests {
#[test]
fn pdf_ocr_engine_switch_invalidates_pdf() {
let mut ollama = Config::defaults();
ollama.pdf.ocr.enabled = true;
ollama.ingest.pdf.ocr.enabled = true;
let mut paddle = ollama.clone();
paddle.pdf.ocr.engine = "paddle-onnx".to_string();
paddle.ingest.pdf.ocr.engine = "paddle-onnx".to_string();
assert_ne!(
ingest_config_signature(&ollama, &pdf()),
ingest_config_signature(&paddle, &pdf()),

View File

@@ -146,3 +146,24 @@ fn search_setting_change_reindexes_nothing() {
assert_eq!(second.new, 0);
assert_eq!(second.errors, 0);
}
/// v3 불변식 #1: `ingest_config_signature` 출력 문자열은 값 기반이라 struct
/// 경로 재편(미디어 ingest 통합) 후에도 v2 와 **바이트 동일**해야 한다. 깨지면
/// 업그레이드 시 전체 재색인 발생. paddle-onnx image 분기 형식 골든.
#[test]
fn ingest_signature_image_paddle_byte_stable() {
let mut cfg = kebab_config::Config::defaults();
cfg.ingest.image.ocr.enabled = true;
cfg.ingest.image.ocr.engine = "paddle-onnx".into();
let sig = kebab_app::test_ingest_config_signature(
&cfg,
&kebab_core::MediaType::Image(kebab_core::ImageType::Png),
);
// 골든: chunk:... |ocr:1:paddle-onnx:<engine_version> |cap:0
assert!(
sig.starts_with("chunk:500:80:true:md-heading-v1"),
"chunk prefix drift: {sig}"
);
assert!(sig.contains("|ocr:1:paddle-onnx:"), "ocr token drift: {sig}");
assert!(sig.ends_with("|cap:0"), "cap token drift: {sig}");
}

View File

@@ -34,7 +34,10 @@ pub mod paddle_onnx;
pub use caption::{apply_caption, caption_image};
pub use ocr::{OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, apply_ocr};
pub use paddle_onnx::{ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config};
pub use paddle_onnx::{
ModelPaths, OnnxPaddleOcr, PADDLE_ONNX_ENGINE, engine_version_for_config,
engine_version_for_paths,
};
use anyhow::{Context, Result};
use kebab_core::{

View File

@@ -474,6 +474,26 @@ pub fn engine_version_for_config(config: &kebab_config::Config) -> Result<String
compute_engine_version(&ModelPaths::from_config(config))
}
/// v3: `engine_version` 을 명시적 (det,rec,dict) override 로부터 계산한다.
/// `ingest_config_signature` 의 미디어별 경로(image 는 `[ingest.image.ocr]`,
/// pdf 는 `[ingest.pdf.ocr]`)를 받아 쓰기 위함 — v2 의 "pdf 가 image paddle
/// 경로를 빌려쓰던" 비대칭 제거. `None` override 는 번들 모델로 fallback.
/// `engine_version_for_config` 과 동일하게 ~17 MB 를 읽으므로 호출자가
/// (det,rec,dict) triple 별로 memoize 해야 한다.
pub fn engine_version_for_paths(
det: Option<&str>,
rec: Option<&str>,
dict: Option<&str>,
) -> Result<String> {
let defaults = ModelPaths::from_default_dir();
let paths = ModelPaths {
det: det.map(PathBuf::from).unwrap_or(defaults.det),
rec: rec.map(PathBuf::from).unwrap_or(defaults.rec),
dict: dict.map(PathBuf::from).unwrap_or(defaults.dict),
};
compute_engine_version(&paths)
}
/// blake3 over det + rec + dict bytes → stable `engine_version`.
fn compute_engine_version(paths: &ModelPaths) -> Result<String> {
let mut hasher = blake3::Hasher::new();