From 148c8b704068702d1790b884fd3f937753e06ad8 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 4 Jun 2026 12:37:09 +0000 Subject: [PATCH] =?UTF-8?q?refactor(config):=20v3=20=EB=A0=88=EC=9D=B4?= =?UTF-8?q?=EC=95=84=EC=9B=83=20=E2=80=94=20=EB=AF=B8=EB=94=94=EC=96=B4=20?= =?UTF-8?q?ingest=20=ED=86=B5=ED=95=A9=20+=20pdf=20paddle=20=EB=8C=80?= =?UTF-8?q?=EC=B9=AD=20+=20float=20=EC=A7=81=EB=A0=AC=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Config 의 indexing/chunking/image/pdf top-level 필드를 ingest: IngestCfg 하나로 통합. leaf 구조체는 불변, 부모 경로만 [ingest.*] 하위로 이동. PdfOcrCfg 에 paddle 대칭 6키(det/rec/dict/score_thresh/unclip_ratio/ max_boxes) 추가. ser_f32_clean 으로 f32 직렬화 정리(0.3000000119→0.3). apply_env RHS 를 self.ingest.* 로 갱신(env 키 문자열 LHS 불변). Co-Authored-By: Claude Opus 4.8 --- crates/kebab-config/src/lib.rs | 292 +++++++++++++++++---------- crates/kebab-config/src/migrate.rs | 10 +- crates/kebab-config/tests/pdf_ocr.rs | 34 ++-- 3 files changed, 211 insertions(+), 125 deletions(-) diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index d7ec7f6..7fd458c 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -12,6 +12,18 @@ mod paths; pub mod migrate; pub use paths::{expand_path, expand_path_with_base}; +/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다. +/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다. +/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고 +/// `kebab config migrate` 산출물이 사람이 읽기 좋게. +fn ser_f32_clean(v: &f32, s: S) -> Result +where + S: serde::Serializer, +{ + let clean: f64 = format!("{v}").parse().unwrap_or(f64::from(*v)); + s.serialize_f64(clean) +} + /// Signal: `Config::from_file` / `Config::load` failed due to missing path, /// I/O failure, TOML parse failure, or post-parse validation failure. /// @@ -39,32 +51,20 @@ pub struct Config { pub schema_version: u32, pub workspace: WorkspaceCfg, pub storage: StorageCfg, - pub indexing: IndexingCfg, - pub chunking: ChunkingCfg, pub models: ModelsCfg, + /// v3: 모든 미디어 형식 ingest 설정의 우산 — 병렬도(← 옛 `[indexing]`), + /// chunking, code, image, pdf 가 전부 `[ingest.*]` 하위로 통합됐다. + /// `#[serde(default)]` 로 두어 미변환 / 부분 config 도 로드된다(자동 + /// 변환은 `Config::from_file` 가 메모리에서 수행 — T6). + #[serde(default)] + pub ingest: IngestCfg, pub search: SearchCfg, pub rag: RagCfg, - /// Image-pipeline settings (P6: OCR, captioning). Tagged - /// `#[serde(default)]` so pre-P6 config files that predate the - /// `[image]` section still load — defaults disable OCR / caption - /// (they cost a model call per asset). - #[serde(default = "ImageCfg::defaults")] - pub image: ImageCfg, /// p9-fb-14: TUI palette + role-style mapping. `#[serde(default)]` /// so configs that predate this section still load (defaults to /// `dark`). #[serde(default = "UiCfg::defaults")] pub ui: UiCfg, - /// p10-1A-1: code ingest settings. `#[serde(default)]` so existing - /// config files without an `[ingest]` / `[ingest.code]` section - /// load cleanly with built-in defaults. - #[serde(default)] - pub ingest: IngestCfg, - /// v0.20.0 sub-item 1: PDF ingest pipeline settings. `#[serde(default)]` - /// so pre-v0.20 config files without a `[pdf]` section load with - /// built-in defaults (OCR disabled — opt-in for scanned PDF KB). - #[serde(default = "PdfCfg::defaults")] - pub pdf: PdfCfg, /// v0.20.x ingest log surface. `#[serde(default)]` so pre-v0.20 /// config files without a `[logging]` section load with built-in /// defaults (enabled=true, dir=~/.local/state/kebab/logs). @@ -104,13 +104,6 @@ pub struct StorageCfg { pub copy_threshold_mb: u64, } -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct IndexingCfg { - pub max_parallel_extractors: u32, - pub max_parallel_embeddings: u32, - pub watch_filesystem: bool, -} - #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct ChunkingCfg { pub target_tokens: usize, @@ -119,6 +112,17 @@ pub struct ChunkingCfg { pub chunker_version: String, } +impl ChunkingCfg { + pub fn defaults() -> Self { + Self { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: true, + chunker_version: "md-heading-v1".to_string(), + } + } +} + #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct ModelsCfg { pub embedding: EmbeddingModelCfg, @@ -186,6 +190,7 @@ pub struct LlmCfg { pub model: String, pub context_tokens: usize, pub endpoint: String, + #[serde(serialize_with = "ser_f32_clean")] pub temperature: f32, pub seed: u64, /// v0.17.0 post-dogfood: Hard ceiling on a single HTTP exchange to @@ -244,6 +249,7 @@ fn default_stale_threshold_days() -> u32 { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct RagCfg { pub prompt_template_version: String, + #[serde(serialize_with = "ser_f32_clean")] pub score_gate: f32, pub explain_default: bool, pub max_context_tokens: usize, @@ -293,7 +299,7 @@ pub struct RagCfg { /// /// Single-pass `ask` ignores this knob entirely — only multi-hop /// runs through the verification step (PR-9c-2 wires it). - #[serde(default = "default_nli_threshold")] + #[serde(default = "default_nli_threshold", serialize_with = "ser_f32_clean")] pub nli_threshold: f32, } @@ -397,11 +403,11 @@ pub struct OcrCfg { pub dict: Option, /// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean /// probability is below this are dropped. Default `0.3`. - #[serde(default = "default_ocr_score_thresh")] + #[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")] pub score_thresh: f32, /// Polygon unclip ratio applied to each detected box before crop. /// Larger = more padding around the text. Default `1.5`. - #[serde(default = "default_ocr_unclip_ratio")] + #[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")] pub unclip_ratio: f32, /// Hard cap on detected boxes per image (runaway guard). Extra boxes /// past this count are truncated with a warning. Default `1000`. @@ -583,7 +589,7 @@ pub struct PdfOcrCfg { /// Valid char ratio threshold (0.0..=1.0). Page with ratio below /// this is classified as scanned/mojibake → OCR fallback. Default /// `0.5`. - #[serde(default = "default_pdf_ocr_valid_ratio")] + #[serde(default = "default_pdf_ocr_valid_ratio", serialize_with = "ser_f32_clean")] pub valid_ratio_threshold: f32, /// Minimum char count per page below which page is auto-scanned. /// Default `20`. @@ -592,6 +598,30 @@ pub struct PdfOcrCfg { /// Single-page lang hint. Default `Some("kor")`. `None` = no hint. #[serde(default = "default_pdf_ocr_lang_hint")] pub lang_hint: Option, + + // ── paddle-onnx engine overrides (v3) ─────────────────────────────── + // Symmetric with `[ingest.image.ocr]`. v2 의 "pdf paddle 이 image 의 + // 모델 경로를 빌려쓰던" 비대칭을 제거 — pdf 자체 키로 옮긴다. 마이그레이션 + // (T5)이 image 값을 이 키로 복사해 signature 바이트 동일 유지. 전부 + // `#[serde(default)]` 이라 pre-v3 config 도 로드. + /// Override path to the detection ONNX model. `None` → bundled. + #[serde(default)] + pub det_model: Option, + /// Override path to the recognition ONNX model. `None` → bundled. + #[serde(default)] + pub rec_model: Option, + /// Override path to the character dictionary. `None` → bundled. + #[serde(default)] + pub dict: Option, + /// DBNet detection box score threshold (0.0..=1.0). Default `0.3`. + #[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")] + pub score_thresh: f32, + /// Polygon unclip ratio applied to each detected box. Default `1.5`. + #[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")] + pub unclip_ratio: f32, + /// Hard cap on detected boxes per page (runaway guard). Default `1000`. + #[serde(default = "default_ocr_max_boxes")] + pub max_boxes: usize, } impl PdfOcrCfg { @@ -608,6 +638,12 @@ impl PdfOcrCfg { valid_ratio_threshold: default_pdf_ocr_valid_ratio(), min_char_count: default_pdf_ocr_min_char_count(), lang_hint: default_pdf_ocr_lang_hint(), + det_model: None, + rec_model: None, + dict: None, + score_thresh: default_ocr_score_thresh(), + unclip_ratio: default_ocr_unclip_ratio(), + max_boxes: default_ocr_max_boxes(), } } } @@ -659,12 +695,47 @@ impl UiCfg { } } -/// p10-1A-1: top-level ingest configuration wrapper. Contains per-media-type -/// sub-sections; currently only `code` is defined. -#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] -#[serde(default)] +/// v3: 모든 미디어 형식 ingest 설정의 우산. 스칼라(병렬도)는 ← 옛 `[indexing]`, +/// 미디어별 하위 테이블(chunking/code/image/pdf)은 ← 옛 top-level 섹션. +/// 직렬화 순서 = 필드 순서: 스칼라(병렬도) 먼저, 하위 테이블 뒤 +/// (TOML 의 "bare key 는 sub-table header 앞" 규칙 준수). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct IngestCfg { + #[serde(default = "default_max_parallel_extractors")] + pub max_parallel_extractors: u32, + #[serde(default = "default_max_parallel_embeddings")] + pub max_parallel_embeddings: u32, + #[serde(default)] + pub watch_filesystem: bool, + #[serde(default = "ChunkingCfg::defaults")] + pub chunking: ChunkingCfg, + #[serde(default)] pub code: IngestCodeCfg, + #[serde(default = "ImageCfg::defaults")] + pub image: ImageCfg, + #[serde(default = "PdfCfg::defaults")] + pub pdf: PdfCfg, +} + +impl Default for IngestCfg { + fn default() -> Self { + Self { + max_parallel_extractors: default_max_parallel_extractors(), + max_parallel_embeddings: default_max_parallel_embeddings(), + watch_filesystem: false, + chunking: ChunkingCfg::defaults(), + code: IngestCodeCfg::default(), + image: ImageCfg::defaults(), + pdf: PdfCfg::defaults(), + } + } +} + +fn default_max_parallel_extractors() -> u32 { + 2 +} +fn default_max_parallel_embeddings() -> u32 { + 1 } /// p10-1A-1: settings for the code ingest pipeline. All fields have @@ -728,17 +799,6 @@ impl Config { runs_dir: "{data_dir}/runs".to_string(), copy_threshold_mb: 100, }, - indexing: IndexingCfg { - max_parallel_extractors: 2, - max_parallel_embeddings: 1, - watch_filesystem: false, - }, - chunking: ChunkingCfg { - target_tokens: 500, - overlap_tokens: 80, - respect_markdown_headings: true, - chunker_version: "md-heading-v1".to_string(), - }, models: ModelsCfg { embedding: EmbeddingModelCfg { provider: "fastembed".to_string(), @@ -765,6 +825,15 @@ impl Config { }, nli: NliCfg::defaults(), }, + ingest: IngestCfg { + max_parallel_extractors: 2, + max_parallel_embeddings: 1, + watch_filesystem: false, + chunking: ChunkingCfg::defaults(), + code: IngestCodeCfg::default(), + image: ImageCfg::defaults(), + pdf: PdfCfg::defaults(), + }, search: SearchCfg { default_k: 10, hybrid_fusion: "rrf".to_string(), @@ -783,10 +852,7 @@ impl Config { multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(), nli_threshold: default_nli_threshold(), }, - image: ImageCfg::defaults(), ui: UiCfg::defaults(), - ingest: IngestCfg::default(), - pdf: PdfCfg::defaults(), logging: LoggingCfg::default(), // p9-fb-05: defaults are not loaded from disk, so no // source_dir. Relative `workspace.root` (rare with @@ -963,33 +1029,33 @@ impl Config { // indexing "KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS" => { if let Ok(n) = v.parse::() { - self.indexing.max_parallel_extractors = n; + self.ingest.max_parallel_extractors = n; } } "KEBAB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => { if let Ok(n) = v.parse::() { - self.indexing.max_parallel_embeddings = n; + self.ingest.max_parallel_embeddings = n; } } "KEBAB_INDEXING_WATCH_FILESYSTEM" => { - self.indexing.watch_filesystem = parse_bool(v); + self.ingest.watch_filesystem = parse_bool(v); } // chunking "KEBAB_CHUNKING_TARGET_TOKENS" => { if let Ok(n) = v.parse::() { - self.chunking.target_tokens = n; + self.ingest.chunking.target_tokens = n; } } "KEBAB_CHUNKING_OVERLAP_TOKENS" => { if let Ok(n) = v.parse::() { - self.chunking.overlap_tokens = n; + self.ingest.chunking.overlap_tokens = n; } } "KEBAB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => { - self.chunking.respect_markdown_headings = parse_bool(v); + self.ingest.chunking.respect_markdown_headings = parse_bool(v); } - "KEBAB_CHUNKING_CHUNKER_VERSION" => self.chunking.chunker_version = v.clone(), + "KEBAB_CHUNKING_CHUNKER_VERSION" => self.ingest.chunking.chunker_version = v.clone(), // models.embedding "KEBAB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(), @@ -1122,18 +1188,18 @@ impl Config { // image.ocr "KEBAB_IMAGE_OCR_ENABLED" => { - self.image.ocr.enabled = parse_bool(v); + self.ingest.image.ocr.enabled = parse_bool(v); } - "KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(), - "KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(), + "KEBAB_IMAGE_OCR_ENGINE" => self.ingest.image.ocr.engine = v.clone(), + "KEBAB_IMAGE_OCR_MODEL" => self.ingest.image.ocr.model = v.clone(), "KEBAB_IMAGE_OCR_ENDPOINT" => { // Empty env value is treated the same as "fall back // to models.llm.endpoint" — i.e. set None. - self.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; + self.ingest.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_IMAGE_OCR_LANGUAGES" => { // Comma-separated list, e.g. "eng,kor". - self.image.ocr.languages = v + self.ingest.image.ocr.languages = v .split(',') .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) @@ -1141,66 +1207,66 @@ impl Config { } "KEBAB_IMAGE_OCR_MAX_PIXELS" => { if let Ok(n) = v.parse::() { - self.image.ocr.max_pixels = n; + self.ingest.image.ocr.max_pixels = n; } } "KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS" => { if let Ok(n) = v.parse::() { - self.image.ocr.request_timeout_secs = n; + self.ingest.image.ocr.request_timeout_secs = n; } } // paddle-onnx engine overrides (v0.27.0). Empty string → None // (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR). "KEBAB_IMAGE_OCR_DET_MODEL" => { - self.image.ocr.det_model = + self.ingest.image.ocr.det_model = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_IMAGE_OCR_REC_MODEL" => { - self.image.ocr.rec_model = + self.ingest.image.ocr.rec_model = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_IMAGE_OCR_DICT" => { - self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) }; + self.ingest.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_IMAGE_OCR_SCORE_THRESH" => { if let Ok(f) = v.parse::() { - self.image.ocr.score_thresh = f; + self.ingest.image.ocr.score_thresh = f; } } "KEBAB_IMAGE_OCR_UNCLIP_RATIO" => { if let Ok(f) = v.parse::() { - self.image.ocr.unclip_ratio = f; + self.ingest.image.ocr.unclip_ratio = f; } } "KEBAB_IMAGE_OCR_MAX_BOXES" => { if let Ok(n) = v.parse::() { - self.image.ocr.max_boxes = n; + self.ingest.image.ocr.max_boxes = n; } } // image.caption (P6-3) "KEBAB_IMAGE_CAPTION_ENABLED" => { - self.image.caption.enabled = parse_bool(v); + self.ingest.image.caption.enabled = parse_bool(v); } "KEBAB_IMAGE_CAPTION_MAX_PIXELS" => { if let Ok(n) = v.parse::() { - self.image.caption.max_pixels = n; + self.ingest.image.caption.max_pixels = n; } } "KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => { - self.image.caption.prompt_template_version = v.clone(); + self.ingest.image.caption.prompt_template_version = v.clone(); } // pdf.ocr (v0.20.0 sub-item 1) - "KEBAB_PDF_OCR_ENABLED" => self.pdf.ocr.enabled = parse_bool(v), - "KEBAB_PDF_OCR_ALWAYS_ON" => self.pdf.ocr.always_on = parse_bool(v), - "KEBAB_PDF_OCR_ENGINE" => self.pdf.ocr.engine = v.clone(), - "KEBAB_PDF_OCR_MODEL" => self.pdf.ocr.model = v.clone(), + "KEBAB_PDF_OCR_ENABLED" => self.ingest.pdf.ocr.enabled = parse_bool(v), + "KEBAB_PDF_OCR_ALWAYS_ON" => self.ingest.pdf.ocr.always_on = parse_bool(v), + "KEBAB_PDF_OCR_ENGINE" => self.ingest.pdf.ocr.engine = v.clone(), + "KEBAB_PDF_OCR_MODEL" => self.ingest.pdf.ocr.model = v.clone(), "KEBAB_PDF_OCR_ENDPOINT" => { - self.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; + self.ingest.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_PDF_OCR_LANGUAGES" => { - self.pdf.ocr.languages = v + self.ingest.pdf.ocr.languages = v .split(',') .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) @@ -1208,26 +1274,26 @@ impl Config { } "KEBAB_PDF_OCR_MAX_PIXELS" => { if let Ok(n) = v.parse::() { - self.pdf.ocr.max_pixels = n; + self.ingest.pdf.ocr.max_pixels = n; } } "KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => { if let Ok(n) = v.parse::() { - self.pdf.ocr.request_timeout_secs = n; + self.ingest.pdf.ocr.request_timeout_secs = n; } } "KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => { if let Ok(n) = v.parse::() { - self.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0); + self.ingest.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0); } } "KEBAB_PDF_OCR_MIN_CHAR_COUNT" => { if let Ok(n) = v.parse::() { - self.pdf.ocr.min_char_count = n; + self.ingest.pdf.ocr.min_char_count = n; } } "KEBAB_PDF_OCR_LANG_HINT" => { - self.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) }; + self.ingest.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) }; } // Unknown KEBAB_* keys are silently ignored — see @@ -1413,11 +1479,27 @@ theme = "dark" assert_eq!(c, back); } + #[test] + fn v3_layout_nests_media_under_ingest() { + let c = Config::defaults(); + // 새 경로가 컴파일·접근 가능해야 한다. + assert_eq!(c.ingest.max_parallel_extractors, 2); + assert_eq!(c.ingest.chunking.target_tokens, 500); + assert_eq!(c.ingest.code.max_file_bytes, 262_144); + assert_eq!(c.ingest.image.ocr.engine, "ollama-vision"); + assert_eq!(c.ingest.image.caption.max_pixels, 768); + assert_eq!(c.ingest.pdf.ocr.model, "qwen2.5vl:3b"); + // pdf paddle 대칭 키 존재 + 기본값. + assert_eq!(c.ingest.pdf.ocr.score_thresh, 0.3); + assert_eq!(c.ingest.pdf.ocr.max_boxes, 1000); + assert!(c.ingest.pdf.ocr.det_model.is_none()); + } + #[test] fn defaults_match_design_64_score_gate() { let c = Config::defaults(); assert_eq!(c.rag.score_gate, 0.30); - assert_eq!(c.chunking.target_tokens, 500); + assert_eq!(c.ingest.chunking.target_tokens, 500); assert_eq!(c.models.embedding.model, "multilingual-e5-large"); assert_eq!(c.models.embedding.dimensions, 1024); assert_eq!(c.search.rrf_k, 60); @@ -1462,7 +1544,7 @@ theme = "dark" "777".to_string(), ); let c = Config::defaults().apply_env(&env); - assert_eq!(c.chunking.target_tokens, 777); + assert_eq!(c.ingest.chunking.target_tokens, 777); } #[test] @@ -1517,24 +1599,24 @@ theme = "dark" "true".to_string(), ); let c = Config::defaults().apply_env(&env); - assert!(c.indexing.watch_filesystem); + assert!(c.ingest.watch_filesystem); } #[test] fn image_ocr_defaults_disabled_with_ollama_vision() { let c = Config::defaults(); - assert!(!c.image.ocr.enabled); - assert_eq!(c.image.ocr.engine, "ollama-vision"); - assert_eq!(c.image.ocr.model, "gemma4:e4b"); - assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]); - assert_eq!(c.image.ocr.max_pixels, 1600); + assert!(!c.ingest.image.ocr.enabled); + assert_eq!(c.ingest.image.ocr.engine, "ollama-vision"); + assert_eq!(c.ingest.image.ocr.model, "gemma4:e4b"); + assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor"]); + assert_eq!(c.ingest.image.ocr.max_pixels, 1600); } /// v0.17.2 post-dogfood: matches the legacy hard-coded 300s cap so /// existing configs that omit the new field keep behaving identically. #[test] fn default_ocr_request_timeout_secs_is_300() { - assert_eq!(Config::defaults().image.ocr.request_timeout_secs, 300); + assert_eq!(Config::defaults().ingest.image.ocr.request_timeout_secs, 300); } #[test] @@ -1545,7 +1627,7 @@ theme = "dark" "900".to_string(), ); let c = Config::defaults().apply_env(&env); - assert_eq!(c.image.ocr.request_timeout_secs, 900); + assert_eq!(c.ingest.image.ocr.request_timeout_secs, 900); } /// post-v0.17.1 dogfood: a config file written before the OCR @@ -1555,7 +1637,7 @@ theme = "dark" #[test] fn legacy_config_without_ocr_request_timeout_secs_uses_default() { let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config"); - assert_eq!(c.image.ocr.request_timeout_secs, 300); + assert_eq!(c.ingest.image.ocr.request_timeout_secs, 300); } // ── p9-fb-41: multi-hop RAG knobs ──────────────────────────────────── @@ -1707,14 +1789,14 @@ theme = "dark" ); env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string()); let c = Config::defaults().apply_env(&env); - assert!(c.image.ocr.enabled); - assert_eq!(c.image.ocr.model, "gemma4:31b"); + assert!(c.ingest.image.ocr.enabled); + assert_eq!(c.ingest.image.ocr.model, "gemma4:31b"); assert_eq!( - c.image.ocr.endpoint.as_deref(), + c.ingest.image.ocr.endpoint.as_deref(), Some("http://192.168.0.47:11434") ); - assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]); - assert_eq!(c.image.ocr.max_pixels, 2048); + assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor", "jpn"]); + assert_eq!(c.ingest.image.ocr.max_pixels, 2048); } /// Pre-P6 config files don't have an `[image]` section. The @@ -1723,9 +1805,9 @@ theme = "dark" #[test] fn image_caption_defaults_disabled() { let c = Config::defaults(); - assert!(!c.image.caption.enabled); - assert_eq!(c.image.caption.max_pixels, 768); - assert_eq!(c.image.caption.prompt_template_version, "caption-v1"); + assert!(!c.ingest.image.caption.enabled); + assert_eq!(c.ingest.image.caption.max_pixels, 768); + assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v1"); } #[test] @@ -1744,9 +1826,9 @@ theme = "dark" "caption-v2".to_string(), ); let c = Config::defaults().apply_env(&env); - assert!(c.image.caption.enabled); - assert_eq!(c.image.caption.max_pixels, 1024); - assert_eq!(c.image.caption.prompt_template_version, "caption-v2"); + assert!(c.ingest.image.caption.enabled); + assert_eq!(c.ingest.image.caption.max_pixels, 1024); + assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v2"); } /// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None` @@ -1757,7 +1839,7 @@ theme = "dark" let mut env = HashMap::new(); env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new()); let c = Config::defaults().apply_env(&env); - assert_eq!(c.image.ocr.endpoint, None); + assert_eq!(c.ingest.image.ocr.endpoint, None); } #[test] @@ -1820,7 +1902,7 @@ explain_default = false max_context_tokens = 8000 "#; let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse"); - assert_eq!(c.image, ImageCfg::defaults()); + assert_eq!(c.ingest.image, ImageCfg::defaults()); } /// p9-fb-25: legacy config with `workspace.include = [...]` must diff --git a/crates/kebab-config/src/migrate.rs b/crates/kebab-config/src/migrate.rs index d58f4c7..d41669a 100644 --- a/crates/kebab-config/src/migrate.rs +++ b/crates/kebab-config/src/migrate.rs @@ -254,12 +254,16 @@ mod tests { fn annotated_default_has_all_sections_and_parses_back_to_defaults() { let doc = annotated_default_document(); let text = doc.to_string(); - // PdfCfg/ImageCfg/ModelsCfg/IngestCfg 는 스칼라 필드가 없어 bare - // `[pdf]` 등은 안 나오고 `[pdf.ocr]` 같은 하위 테이블만 직렬화된다. + // v3: 미디어 형식 섹션이 전부 `[ingest.*]` 하위로 통합됐다. IngestCfg + // 는 스칼라(병렬도) 필드가 있어 bare `[ingest]` + 하위 테이블이 함께 + // 직렬화된다. for section in [ "[workspace]", + "[ingest]", + "[ingest.chunking]", "[ingest.code]", - "[pdf.ocr]", + "[ingest.image.ocr]", + "[ingest.pdf.ocr]", "[logging]", "[ui]", ] { diff --git a/crates/kebab-config/tests/pdf_ocr.rs b/crates/kebab-config/tests/pdf_ocr.rs index b1e8cc4..fa142a4 100644 --- a/crates/kebab-config/tests/pdf_ocr.rs +++ b/crates/kebab-config/tests/pdf_ocr.rs @@ -47,20 +47,20 @@ lang_hint = "kor" #[test] fn pdf_ocr_defaults_off_with_qwen_3b() { let cfg = Config::defaults(); - assert!(!cfg.pdf.ocr.enabled); - assert!(!cfg.pdf.ocr.always_on); - assert_eq!(cfg.pdf.ocr.engine, "ollama-vision"); - assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:3b"); - assert!(cfg.pdf.ocr.endpoint.is_none()); + assert!(!cfg.ingest.pdf.ocr.enabled); + assert!(!cfg.ingest.pdf.ocr.always_on); + assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision"); + assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:3b"); + assert!(cfg.ingest.pdf.ocr.endpoint.is_none()); assert_eq!( - cfg.pdf.ocr.languages, + cfg.ingest.pdf.ocr.languages, vec!["eng".to_string(), "kor".to_string()] ); - assert_eq!(cfg.pdf.ocr.max_pixels, 2048); - assert_eq!(cfg.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28) - assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6); - assert_eq!(cfg.pdf.ocr.min_char_count, 20); - assert_eq!(cfg.pdf.ocr.lang_hint.as_deref(), Some("kor")); + assert_eq!(cfg.ingest.pdf.ocr.max_pixels, 2048); + assert_eq!(cfg.ingest.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28) + assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6); + assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20); + assert_eq!(cfg.ingest.pdf.ocr.lang_hint.as_deref(), Some("kor")); } // Test 3: env var override — 4 keys 의 typical override case. @@ -80,12 +80,12 @@ fn pdf_ocr_env_overrides() { let cfg = Config::defaults().apply_env(&env); - assert!(cfg.pdf.ocr.enabled); - assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:7b"); - assert!(cfg.pdf.ocr.always_on); - assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6); + assert!(cfg.ingest.pdf.ocr.enabled); + assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:7b"); + assert!(cfg.ingest.pdf.ocr.always_on); + assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6); // 다른 env var 가 default 보존 - assert_eq!(cfg.pdf.ocr.engine, "ollama-vision"); - assert_eq!(cfg.pdf.ocr.min_char_count, 20); + assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision"); + assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20); }