|
|
|
|
@@ -54,7 +54,7 @@ use kebab_core::{
|
|
|
|
|
use kebab_llm_local::OllamaLanguageModel;
|
|
|
|
|
use kebab_parse_image::{
|
|
|
|
|
OLLAMA_VISION_ENGINE, OcrEngine, OllamaVisionOcr, OnnxPaddleOcr, PADDLE_ONNX_ENGINE,
|
|
|
|
|
apply_caption, apply_ocr, engine_version_for_config,
|
|
|
|
|
apply_caption, apply_ocr, engine_version_for_paths,
|
|
|
|
|
};
|
|
|
|
|
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
|
|
|
|
|
use kebab_source_fs::FsSourceConnector;
|
|
|
|
|
@@ -3090,21 +3090,31 @@ static PADDLE_OCR_VERSION_MEMO: std::sync::OnceLock<
|
|
|
|
|
std::sync::Mutex<std::collections::HashMap<String, String>>,
|
|
|
|
|
> = std::sync::OnceLock::new();
|
|
|
|
|
|
|
|
|
|
/// T9: resolve the OCR `engine_version` string used inside the ingest config
|
|
|
|
|
/// T9/v3: resolve the OCR `engine_version` string used inside the ingest config
|
|
|
|
|
/// signature. ollama-vision is self-describing from `engine/model` (cheap, no
|
|
|
|
|
/// I/O). paddle-onnx hashes the bundled/override model assets (memoized).
|
|
|
|
|
fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model: &str) -> String {
|
|
|
|
|
///
|
|
|
|
|
/// v3: paddle 경로(det/rec/dict)는 **호출자가 미디어별로** 넘긴다 — image 는
|
|
|
|
|
/// `[ingest.image.ocr]`, pdf 는 `[ingest.pdf.ocr]`. v2 의 "pdf 가 image paddle
|
|
|
|
|
/// 을 빌려쓰던" 비대칭을 제거한다. 마이그레이션(T5)이 pdf 대칭 키를 image 값
|
|
|
|
|
/// 으로 채우므로 미변환 v2 → v3 의 signature 는 바이트 동일하게 유지된다.
|
|
|
|
|
fn ocr_engine_version_for_sig(
|
|
|
|
|
engine: &str,
|
|
|
|
|
model: &str,
|
|
|
|
|
det: Option<&str>,
|
|
|
|
|
rec: Option<&str>,
|
|
|
|
|
dict: Option<&str>,
|
|
|
|
|
) -> String {
|
|
|
|
|
if engine != PADDLE_ONNX_ENGINE {
|
|
|
|
|
// ollama-vision (and any non-paddle engine): the daemon exposes no
|
|
|
|
|
// stable per-model revision, so engine/model is the identity.
|
|
|
|
|
return format!("ollama/{model}");
|
|
|
|
|
}
|
|
|
|
|
let ocr = &config.ingest.image.ocr;
|
|
|
|
|
let key = format!(
|
|
|
|
|
"{}|{}|{}",
|
|
|
|
|
ocr.det_model.as_deref().unwrap_or("<bundled>"),
|
|
|
|
|
ocr.rec_model.as_deref().unwrap_or("<bundled>"),
|
|
|
|
|
ocr.dict.as_deref().unwrap_or("<bundled>"),
|
|
|
|
|
det.unwrap_or("<bundled>"),
|
|
|
|
|
rec.unwrap_or("<bundled>"),
|
|
|
|
|
dict.unwrap_or("<bundled>"),
|
|
|
|
|
);
|
|
|
|
|
let memo = PADDLE_OCR_VERSION_MEMO.get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
|
|
|
|
|
if let Some(v) = memo.lock().unwrap().get(&key) {
|
|
|
|
|
@@ -3114,7 +3124,7 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
|
|
|
|
|
// ingest the engine was already built (fail-fast) so the assets are
|
|
|
|
|
// present and this succeeds; the path-derived identity below is an
|
|
|
|
|
// unreachable-in-practice guard that keeps the signature total.
|
|
|
|
|
let version = engine_version_for_config(config).unwrap_or_else(|e| {
|
|
|
|
|
let version = engine_version_for_paths(det, rec, dict).unwrap_or_else(|e| {
|
|
|
|
|
tracing::warn!(
|
|
|
|
|
target: "kebab-app::ingest",
|
|
|
|
|
error = %e,
|
|
|
|
|
@@ -3126,6 +3136,14 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
|
|
|
|
|
version
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// v3: signature 바이트 불변 골든을 위한 테스트 seam. `ingest_config_signature`
|
|
|
|
|
/// 는 private 이라 통합 테스트에서 직접 못 부른다. 값 기반이라 struct 경로가
|
|
|
|
|
/// 바뀌어도(미디어 ingest 통합) 출력 문자열은 v2 와 바이트 동일해야 한다.
|
|
|
|
|
#[doc(hidden)]
|
|
|
|
|
pub fn test_ingest_config_signature(c: &kebab_config::Config, m: &MediaType) -> String {
|
|
|
|
|
ingest_config_signature(c, m)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) -> String {
|
|
|
|
|
// Common (every media type): chunking parameters that move chunk
|
|
|
|
|
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
|
|
|
|
|
@@ -3148,7 +3166,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
|
|
|
|
sig.push_str(&format!(
|
|
|
|
|
"|ocr:1:{}:{}",
|
|
|
|
|
ocr.engine,
|
|
|
|
|
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
|
|
|
|
ocr_engine_version_for_sig(
|
|
|
|
|
&ocr.engine,
|
|
|
|
|
&ocr.model,
|
|
|
|
|
ocr.det_model.as_deref(),
|
|
|
|
|
ocr.rec_model.as_deref(),
|
|
|
|
|
ocr.dict.as_deref(),
|
|
|
|
|
)
|
|
|
|
|
));
|
|
|
|
|
} else {
|
|
|
|
|
sig.push_str("|ocr:0");
|
|
|
|
|
@@ -3172,7 +3196,13 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
|
|
|
|
ocr.enabled,
|
|
|
|
|
ocr.always_on,
|
|
|
|
|
ocr.engine,
|
|
|
|
|
ocr_engine_version_for_sig(config, &ocr.engine, &ocr.model)
|
|
|
|
|
ocr_engine_version_for_sig(
|
|
|
|
|
&ocr.engine,
|
|
|
|
|
&ocr.model,
|
|
|
|
|
ocr.det_model.as_deref(),
|
|
|
|
|
ocr.rec_model.as_deref(),
|
|
|
|
|
ocr.dict.as_deref(),
|
|
|
|
|
)
|
|
|
|
|
));
|
|
|
|
|
} else {
|
|
|
|
|
sig.push_str("|pdfocr:0");
|
|
|
|
|
@@ -3739,7 +3769,7 @@ mod ingest_config_signature_tests {
|
|
|
|
|
fn chunking_change_invalidates_all_types() {
|
|
|
|
|
let base = Config::defaults();
|
|
|
|
|
let mut bumped = base.clone();
|
|
|
|
|
bumped.chunking.target_tokens += 100;
|
|
|
|
|
bumped.ingest.chunking.target_tokens += 100;
|
|
|
|
|
for m in [md(), img(), pdf(), code()] {
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &m),
|
|
|
|
|
@@ -3749,14 +3779,14 @@ mod ingest_config_signature_tests {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut overlap = base.clone();
|
|
|
|
|
overlap.chunking.overlap_tokens += 10;
|
|
|
|
|
overlap.ingest.chunking.overlap_tokens += 10;
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &md()),
|
|
|
|
|
ingest_config_signature(&overlap, &md())
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let mut headings = base.clone();
|
|
|
|
|
headings.chunking.respect_markdown_headings = !base.chunking.respect_markdown_headings;
|
|
|
|
|
headings.ingest.chunking.respect_markdown_headings = !base.ingest.chunking.respect_markdown_headings;
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &md()),
|
|
|
|
|
ingest_config_signature(&headings, &md())
|
|
|
|
|
@@ -3768,9 +3798,9 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn image_ocr_toggle_invalidates_image_only() {
|
|
|
|
|
let base = Config::defaults();
|
|
|
|
|
assert!(!base.image.ocr.enabled, "default OCR is off");
|
|
|
|
|
assert!(!base.ingest.image.ocr.enabled, "default OCR is off");
|
|
|
|
|
let mut on = base.clone();
|
|
|
|
|
on.image.ocr.enabled = true;
|
|
|
|
|
on.ingest.image.ocr.enabled = true;
|
|
|
|
|
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &img()),
|
|
|
|
|
@@ -3792,16 +3822,16 @@ mod ingest_config_signature_tests {
|
|
|
|
|
fn image_ocr_model_matters_only_when_enabled() {
|
|
|
|
|
let mut off_a = Config::defaults();
|
|
|
|
|
let mut off_b = off_a.clone();
|
|
|
|
|
off_b.image.ocr.model = "some-other-model".to_string();
|
|
|
|
|
off_b.ingest.image.ocr.model = "some-other-model".to_string();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
ingest_config_signature(&off_a, &img()),
|
|
|
|
|
ingest_config_signature(&off_b, &img()),
|
|
|
|
|
"OCR model is irrelevant while OCR is off"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
off_a.image.ocr.enabled = true;
|
|
|
|
|
off_a.ingest.image.ocr.enabled = true;
|
|
|
|
|
let mut on_b = off_a.clone();
|
|
|
|
|
on_b.image.ocr.model = "some-other-model".to_string();
|
|
|
|
|
on_b.ingest.image.ocr.model = "some-other-model".to_string();
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&off_a, &img()),
|
|
|
|
|
ingest_config_signature(&on_b, &img()),
|
|
|
|
|
@@ -3814,14 +3844,14 @@ mod ingest_config_signature_tests {
|
|
|
|
|
fn image_caption_toggle_and_prompt_invalidate_image() {
|
|
|
|
|
let base = Config::defaults();
|
|
|
|
|
let mut on = base.clone();
|
|
|
|
|
on.image.caption.enabled = true;
|
|
|
|
|
on.ingest.image.caption.enabled = true;
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &img()),
|
|
|
|
|
ingest_config_signature(&on, &img())
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let mut prompt = on.clone();
|
|
|
|
|
prompt.image.caption.prompt_template_version = "caption-v9".to_string();
|
|
|
|
|
prompt.ingest.image.caption.prompt_template_version = "caption-v9".to_string();
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&on, &img()),
|
|
|
|
|
ingest_config_signature(&prompt, &img()),
|
|
|
|
|
@@ -3835,7 +3865,7 @@ mod ingest_config_signature_tests {
|
|
|
|
|
fn pdf_ocr_toggle_invalidates_pdf_only() {
|
|
|
|
|
let base = Config::defaults();
|
|
|
|
|
let mut enabled = base.clone();
|
|
|
|
|
enabled.pdf.ocr.enabled = true;
|
|
|
|
|
enabled.ingest.pdf.ocr.enabled = true;
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &pdf()),
|
|
|
|
|
ingest_config_signature(&enabled, &pdf()),
|
|
|
|
|
@@ -3843,7 +3873,7 @@ mod ingest_config_signature_tests {
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let mut always = base.clone();
|
|
|
|
|
always.pdf.ocr.always_on = true;
|
|
|
|
|
always.ingest.pdf.ocr.always_on = true;
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &pdf()),
|
|
|
|
|
ingest_config_signature(&always, &pdf()),
|
|
|
|
|
@@ -3921,13 +3951,13 @@ mod ingest_config_signature_tests {
|
|
|
|
|
// ui
|
|
|
|
|
other.ui.theme = "light".to_string();
|
|
|
|
|
// image runtime-only (non-output) knobs
|
|
|
|
|
other.image.ocr.max_pixels += 100;
|
|
|
|
|
other.image.ocr.languages.push("jpn".to_string());
|
|
|
|
|
other.image.ocr.request_timeout_secs += 10;
|
|
|
|
|
other.ingest.image.ocr.max_pixels += 100;
|
|
|
|
|
other.ingest.image.ocr.languages.push("jpn".to_string());
|
|
|
|
|
other.ingest.image.ocr.request_timeout_secs += 10;
|
|
|
|
|
// pdf runtime-only knobs
|
|
|
|
|
other.pdf.ocr.max_pixels += 100;
|
|
|
|
|
other.pdf.ocr.request_timeout_secs += 10;
|
|
|
|
|
other.pdf.ocr.languages.push("jpn".to_string());
|
|
|
|
|
other.ingest.pdf.ocr.max_pixels += 100;
|
|
|
|
|
other.ingest.pdf.ocr.request_timeout_secs += 10;
|
|
|
|
|
other.ingest.pdf.ocr.languages.push("jpn".to_string());
|
|
|
|
|
|
|
|
|
|
for m in [md(), img(), pdf(), code()] {
|
|
|
|
|
assert_eq!(
|
|
|
|
|
@@ -3946,10 +3976,10 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn image_ocr_engine_switch_invalidates_image() {
|
|
|
|
|
let mut ollama = Config::defaults();
|
|
|
|
|
ollama.image.ocr.enabled = true;
|
|
|
|
|
ollama.ingest.image.ocr.enabled = true;
|
|
|
|
|
// same `model` string on both — only the engine differs
|
|
|
|
|
let mut paddle = ollama.clone();
|
|
|
|
|
paddle.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
paddle.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&ollama, &img()),
|
|
|
|
|
ingest_config_signature(&paddle, &img()),
|
|
|
|
|
@@ -3962,10 +3992,10 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn image_ocr_engine_version_change_invalidates_image() {
|
|
|
|
|
let mut a = Config::defaults();
|
|
|
|
|
a.image.ocr.enabled = true;
|
|
|
|
|
a.image.ocr.model = "gemma4:e4b".to_string();
|
|
|
|
|
a.ingest.image.ocr.enabled = true;
|
|
|
|
|
a.ingest.image.ocr.model = "gemma4:e4b".to_string();
|
|
|
|
|
let mut b = a.clone();
|
|
|
|
|
b.image.ocr.model = "qwen2.5vl:3b".to_string();
|
|
|
|
|
b.ingest.image.ocr.model = "qwen2.5vl:3b".to_string();
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&a, &img()),
|
|
|
|
|
ingest_config_signature(&b, &img()),
|
|
|
|
|
@@ -3978,10 +4008,10 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn image_ocr_paddle_model_path_change_invalidates_image() {
|
|
|
|
|
let mut base = Config::defaults();
|
|
|
|
|
base.image.ocr.enabled = true;
|
|
|
|
|
base.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
base.ingest.image.ocr.enabled = true;
|
|
|
|
|
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
let mut overridden = base.clone();
|
|
|
|
|
overridden.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
|
|
|
|
|
overridden.ingest.image.ocr.det_model = Some("/some/other/det.onnx".to_string());
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&base, &img()),
|
|
|
|
|
ingest_config_signature(&overridden, &img()),
|
|
|
|
|
@@ -3994,11 +4024,11 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn paddle_image_signature_stable_for_unrelated_change() {
|
|
|
|
|
let mut base = Config::defaults();
|
|
|
|
|
base.image.ocr.enabled = true;
|
|
|
|
|
base.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
base.ingest.image.ocr.enabled = true;
|
|
|
|
|
base.ingest.image.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
let mut other = base.clone();
|
|
|
|
|
other.search.default_k += 3;
|
|
|
|
|
other.image.ocr.max_pixels += 100; // runtime-only knob
|
|
|
|
|
other.ingest.image.ocr.max_pixels += 100; // runtime-only knob
|
|
|
|
|
assert_eq!(
|
|
|
|
|
ingest_config_signature(&base, &img()),
|
|
|
|
|
ingest_config_signature(&other, &img()),
|
|
|
|
|
@@ -4010,9 +4040,9 @@ mod ingest_config_signature_tests {
|
|
|
|
|
#[test]
|
|
|
|
|
fn pdf_ocr_engine_switch_invalidates_pdf() {
|
|
|
|
|
let mut ollama = Config::defaults();
|
|
|
|
|
ollama.pdf.ocr.enabled = true;
|
|
|
|
|
ollama.ingest.pdf.ocr.enabled = true;
|
|
|
|
|
let mut paddle = ollama.clone();
|
|
|
|
|
paddle.pdf.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
paddle.ingest.pdf.ocr.engine = "paddle-onnx".to_string();
|
|
|
|
|
assert_ne!(
|
|
|
|
|
ingest_config_signature(&ollama, &pdf()),
|
|
|
|
|
ingest_config_signature(&paddle, &pdf()),
|
|
|
|
|
|