feat(config): config.toml v2→v3 스키마 재편 — 미디어 [ingest.*] 통합 + 무손실 자동 마이그레이션 #207

Merged
altair823 merged 12 commits from feat/config-schema-reorg into main 2026-06-04 14:36:44 +00:00
3 changed files with 211 additions and 125 deletions
Showing only changes of commit 148c8b7040 - Show all commits

View File

@@ -12,6 +12,18 @@ mod paths;
pub mod migrate;
pub use paths::{expand_path, expand_path_with_base};
/// Serializes an `f32` as the `f64` obtained by re-parsing its shortest
/// round-trip `Display` text.
///
/// This makes `0.3_f32` come out as `0.3` instead of leaking the widened
/// value `0.30000001192092896`. The intent is to avoid breaking the lossless
/// comparison of the toml_edit relocation during migration, and to keep the
/// output of `kebab config migrate` pleasant for humans to read.
///
/// If the `Display` text cannot be parsed back, the plain widening
/// `f64::from(*v)` is serialized instead.
fn ser_f32_clean<S>(v: &f32, s: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    let shortest = v.to_string();
    let widened = match shortest.parse::<f64>() {
        Ok(parsed) => parsed,
        Err(_) => f64::from(*v),
    };
    s.serialize_f64(widened)
}
/// Signal: `Config::from_file` / `Config::load` failed due to missing path,
/// I/O failure, TOML parse failure, or post-parse validation failure.
///
@@ -39,32 +51,20 @@ pub struct Config {
pub schema_version: u32,
pub workspace: WorkspaceCfg,
pub storage: StorageCfg,
pub indexing: IndexingCfg,
pub chunking: ChunkingCfg,
pub models: ModelsCfg,
/// v3: 모든 미디어 형식 ingest 설정의 우산 — 병렬도(← 옛 `[indexing]`),
/// chunking, code, image, pdf 가 전부 `[ingest.*]` 하위로 통합됐다.
/// `#[serde(default)]` 로 두어 미변환 / 부분 config 도 로드된다(자동
/// 변환은 `Config::from_file` 가 메모리에서 수행 — T6).
#[serde(default)]
pub ingest: IngestCfg,
pub search: SearchCfg,
pub rag: RagCfg,
/// Image-pipeline settings (P6: OCR, captioning). Tagged
/// `#[serde(default)]` so pre-P6 config files that predate the
/// `[image]` section still load — defaults disable OCR / caption
/// (they cost a model call per asset).
#[serde(default = "ImageCfg::defaults")]
pub image: ImageCfg,
/// p9-fb-14: TUI palette + role-style mapping. `#[serde(default)]`
/// so configs that predate this section still load (defaults to
/// `dark`).
#[serde(default = "UiCfg::defaults")]
pub ui: UiCfg,
/// p10-1A-1: code ingest settings. `#[serde(default)]` so existing
/// config files without an `[ingest]` / `[ingest.code]` section
/// load cleanly with built-in defaults.
#[serde(default)]
pub ingest: IngestCfg,
/// v0.20.0 sub-item 1: PDF ingest pipeline settings. `#[serde(default)]`
/// so pre-v0.20 config files without a `[pdf]` section load with
/// built-in defaults (OCR disabled — opt-in for scanned PDF KB).
#[serde(default = "PdfCfg::defaults")]
pub pdf: PdfCfg,
/// v0.20.x ingest log surface. `#[serde(default)]` so pre-v0.20
/// config files without a `[logging]` section load with built-in
/// defaults (enabled=true, dir=~/.local/state/kebab/logs).
@@ -104,13 +104,6 @@ pub struct StorageCfg {
pub copy_threshold_mb: u64,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IndexingCfg {
pub max_parallel_extractors: u32,
pub max_parallel_embeddings: u32,
pub watch_filesystem: bool,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ChunkingCfg {
pub target_tokens: usize,
@@ -119,6 +112,17 @@ pub struct ChunkingCfg {
pub chunker_version: String,
}
impl ChunkingCfg {
pub fn defaults() -> Self {
Self {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: true,
chunker_version: "md-heading-v1".to_string(),
}
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ModelsCfg {
pub embedding: EmbeddingModelCfg,
@@ -186,6 +190,7 @@ pub struct LlmCfg {
pub model: String,
pub context_tokens: usize,
pub endpoint: String,
#[serde(serialize_with = "ser_f32_clean")]
pub temperature: f32,
pub seed: u64,
/// v0.17.0 post-dogfood: Hard ceiling on a single HTTP exchange to
@@ -244,6 +249,7 @@ fn default_stale_threshold_days() -> u32 {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RagCfg {
pub prompt_template_version: String,
#[serde(serialize_with = "ser_f32_clean")]
pub score_gate: f32,
pub explain_default: bool,
pub max_context_tokens: usize,
@@ -293,7 +299,7 @@ pub struct RagCfg {
///
/// Single-pass `ask` ignores this knob entirely — only multi-hop
/// runs through the verification step (PR-9c-2 wires it).
#[serde(default = "default_nli_threshold")]
#[serde(default = "default_nli_threshold", serialize_with = "ser_f32_clean")]
pub nli_threshold: f32,
}
@@ -397,11 +403,11 @@ pub struct OcrCfg {
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
/// probability is below this are dropped. Default `0.3`.
#[serde(default = "default_ocr_score_thresh")]
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box before crop.
/// Larger = more padding around the text. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio")]
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
/// past this count are truncated with a warning. Default `1000`.
@@ -583,7 +589,7 @@ pub struct PdfOcrCfg {
/// Valid char ratio threshold (0.0..=1.0). Page with ratio below
/// this is classified as scanned/mojibake → OCR fallback. Default
/// `0.5`.
#[serde(default = "default_pdf_ocr_valid_ratio")]
#[serde(default = "default_pdf_ocr_valid_ratio", serialize_with = "ser_f32_clean")]
pub valid_ratio_threshold: f32,
/// Minimum char count per page below which page is auto-scanned.
/// Default `20`.
@@ -592,6 +598,30 @@ pub struct PdfOcrCfg {
/// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
#[serde(default = "default_pdf_ocr_lang_hint")]
pub lang_hint: Option<String>,
// ── paddle-onnx engine overrides (v3) ───────────────────────────────
// Symmetric with `[ingest.image.ocr]`. v2 의 "pdf paddle 이 image 의
// 모델 경로를 빌려쓰던" 비대칭을 제거 — pdf 자체 키로 옮긴다. 마이그레이션
// (T5)이 image 값을 이 키로 복사해 signature 바이트 동일 유지. 전부
// `#[serde(default)]` 이라 pre-v3 config 도 로드.
/// Override path to the detection ONNX model. `None` → bundled.
#[serde(default)]
pub det_model: Option<String>,
/// Override path to the recognition ONNX model. `None` → bundled.
#[serde(default)]
pub rec_model: Option<String>,
/// Override path to the character dictionary. `None` → bundled.
#[serde(default)]
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Default `0.3`.
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per page (runaway guard). Default `1000`.
#[serde(default = "default_ocr_max_boxes")]
pub max_boxes: usize,
}
impl PdfOcrCfg {
@@ -608,6 +638,12 @@ impl PdfOcrCfg {
valid_ratio_threshold: default_pdf_ocr_valid_ratio(),
min_char_count: default_pdf_ocr_min_char_count(),
lang_hint: default_pdf_ocr_lang_hint(),
det_model: None,
rec_model: None,
dict: None,
score_thresh: default_ocr_score_thresh(),
unclip_ratio: default_ocr_unclip_ratio(),
max_boxes: default_ocr_max_boxes(),
}
}
}
@@ -659,12 +695,47 @@ impl UiCfg {
}
}
/// p10-1A-1: top-level ingest configuration wrapper. Contains per-media-type
/// sub-sections; currently only `code` is defined.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
#[serde(default)]
/// v3: umbrella for the ingest settings of every media type. The scalar
/// (parallelism) fields come from the old `[indexing]` section; the per-media
/// sub-tables (chunking/code/image/pdf) come from the old top-level sections.
/// Serialization order = field order: scalars (parallelism) first, sub-tables
/// after (honoring TOML's "bare keys go before sub-table headers" rule).
/// Do not reorder the fields without keeping that rule in mind.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestCfg {
/// A missing key falls back to `default_max_parallel_extractors()`.
#[serde(default = "default_max_parallel_extractors")]
pub max_parallel_extractors: u32,
/// A missing key falls back to `default_max_parallel_embeddings()`.
#[serde(default = "default_max_parallel_embeddings")]
pub max_parallel_embeddings: u32,
/// A missing key deserializes to `false` (`bool::default()`).
#[serde(default)]
pub watch_filesystem: bool,
/// A missing table falls back to `ChunkingCfg::defaults()`.
#[serde(default = "ChunkingCfg::defaults")]
pub chunking: ChunkingCfg,
/// A missing table falls back to `IngestCodeCfg::default()`.
#[serde(default)]
pub code: IngestCodeCfg,
/// A missing table falls back to `ImageCfg::defaults()`.
#[serde(default = "ImageCfg::defaults")]
pub image: ImageCfg,
/// A missing table falls back to `PdfCfg::defaults()`.
#[serde(default = "PdfCfg::defaults")]
pub pdf: PdfCfg,
}
impl Default for IngestCfg {
fn default() -> Self {
Self {
max_parallel_extractors: default_max_parallel_extractors(),
max_parallel_embeddings: default_max_parallel_embeddings(),
watch_filesystem: false,
chunking: ChunkingCfg::defaults(),
code: IngestCodeCfg::default(),
image: ImageCfg::defaults(),
pdf: PdfCfg::defaults(),
}
}
}
/// Serde fallback for `IngestCfg::max_parallel_extractors` when the key is
/// absent from the config file.
fn default_max_parallel_extractors() -> u32 {
    const FALLBACK: u32 = 2;
    FALLBACK
}
/// Serde fallback for `IngestCfg::max_parallel_embeddings` when the key is
/// absent from the config file.
fn default_max_parallel_embeddings() -> u32 {
    const FALLBACK: u32 = 1;
    FALLBACK
}
/// p10-1A-1: settings for the code ingest pipeline. All fields have
@@ -728,17 +799,6 @@ impl Config {
runs_dir: "{data_dir}/runs".to_string(),
copy_threshold_mb: 100,
},
indexing: IndexingCfg {
max_parallel_extractors: 2,
max_parallel_embeddings: 1,
watch_filesystem: false,
},
chunking: ChunkingCfg {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: true,
chunker_version: "md-heading-v1".to_string(),
},
models: ModelsCfg {
embedding: EmbeddingModelCfg {
provider: "fastembed".to_string(),
@@ -765,6 +825,15 @@ impl Config {
},
nli: NliCfg::defaults(),
},
ingest: IngestCfg {
max_parallel_extractors: 2,
max_parallel_embeddings: 1,
watch_filesystem: false,
chunking: ChunkingCfg::defaults(),
code: IngestCodeCfg::default(),
image: ImageCfg::defaults(),
pdf: PdfCfg::defaults(),
},
search: SearchCfg {
default_k: 10,
hybrid_fusion: "rrf".to_string(),
@@ -783,10 +852,7 @@ impl Config {
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
nli_threshold: default_nli_threshold(),
},
image: ImageCfg::defaults(),
ui: UiCfg::defaults(),
ingest: IngestCfg::default(),
pdf: PdfCfg::defaults(),
logging: LoggingCfg::default(),
// p9-fb-05: defaults are not loaded from disk, so no
// source_dir. Relative `workspace.root` (rare with
@@ -963,33 +1029,33 @@ impl Config {
// indexing
"KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS" => {
if let Ok(n) = v.parse::<u32>() {
self.indexing.max_parallel_extractors = n;
self.ingest.max_parallel_extractors = n;
}
}
"KEBAB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => {
if let Ok(n) = v.parse::<u32>() {
self.indexing.max_parallel_embeddings = n;
self.ingest.max_parallel_embeddings = n;
}
}
"KEBAB_INDEXING_WATCH_FILESYSTEM" => {
self.indexing.watch_filesystem = parse_bool(v);
self.ingest.watch_filesystem = parse_bool(v);
}
// chunking
"KEBAB_CHUNKING_TARGET_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.chunking.target_tokens = n;
self.ingest.chunking.target_tokens = n;
}
}
"KEBAB_CHUNKING_OVERLAP_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.chunking.overlap_tokens = n;
self.ingest.chunking.overlap_tokens = n;
}
}
"KEBAB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => {
self.chunking.respect_markdown_headings = parse_bool(v);
self.ingest.chunking.respect_markdown_headings = parse_bool(v);
}
"KEBAB_CHUNKING_CHUNKER_VERSION" => self.chunking.chunker_version = v.clone(),
"KEBAB_CHUNKING_CHUNKER_VERSION" => self.ingest.chunking.chunker_version = v.clone(),
// models.embedding
"KEBAB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(),
@@ -1122,18 +1188,18 @@ impl Config {
// image.ocr
"KEBAB_IMAGE_OCR_ENABLED" => {
self.image.ocr.enabled = parse_bool(v);
self.ingest.image.ocr.enabled = parse_bool(v);
}
"KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(),
"KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(),
"KEBAB_IMAGE_OCR_ENGINE" => self.ingest.image.ocr.engine = v.clone(),
"KEBAB_IMAGE_OCR_MODEL" => self.ingest.image.ocr.model = v.clone(),
"KEBAB_IMAGE_OCR_ENDPOINT" => {
// Empty env value is treated the same as "fall back
// to models.llm.endpoint" — i.e. set None.
self.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
self.ingest.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_LANGUAGES" => {
// Comma-separated list, e.g. "eng,kor".
self.image.ocr.languages = v
self.ingest.image.ocr.languages = v
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
@@ -1141,66 +1207,66 @@ impl Config {
}
"KEBAB_IMAGE_OCR_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.image.ocr.max_pixels = n;
self.ingest.image.ocr.max_pixels = n;
}
}
"KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS" => {
if let Ok(n) = v.parse::<u64>() {
self.image.ocr.request_timeout_secs = n;
self.ingest.image.ocr.request_timeout_secs = n;
}
}
// paddle-onnx engine overrides (v0.27.0). Empty string → None
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
"KEBAB_IMAGE_OCR_DET_MODEL" => {
self.image.ocr.det_model =
self.ingest.image.ocr.det_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_REC_MODEL" => {
self.image.ocr.rec_model =
self.ingest.image.ocr.rec_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_DICT" => {
self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
self.ingest.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.score_thresh = f;
self.ingest.image.ocr.score_thresh = f;
}
}
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
if let Ok(f) = v.parse::<f32>() {
self.image.ocr.unclip_ratio = f;
self.ingest.image.ocr.unclip_ratio = f;
}
}
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
if let Ok(n) = v.parse::<usize>() {
self.image.ocr.max_boxes = n;
self.ingest.image.ocr.max_boxes = n;
}
}
// image.caption (P6-3)
"KEBAB_IMAGE_CAPTION_ENABLED" => {
self.image.caption.enabled = parse_bool(v);
self.ingest.image.caption.enabled = parse_bool(v);
}
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.image.caption.max_pixels = n;
self.ingest.image.caption.max_pixels = n;
}
}
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
self.image.caption.prompt_template_version = v.clone();
self.ingest.image.caption.prompt_template_version = v.clone();
}
// pdf.ocr (v0.20.0 sub-item 1)
"KEBAB_PDF_OCR_ENABLED" => self.pdf.ocr.enabled = parse_bool(v),
"KEBAB_PDF_OCR_ALWAYS_ON" => self.pdf.ocr.always_on = parse_bool(v),
"KEBAB_PDF_OCR_ENGINE" => self.pdf.ocr.engine = v.clone(),
"KEBAB_PDF_OCR_MODEL" => self.pdf.ocr.model = v.clone(),
"KEBAB_PDF_OCR_ENABLED" => self.ingest.pdf.ocr.enabled = parse_bool(v),
"KEBAB_PDF_OCR_ALWAYS_ON" => self.ingest.pdf.ocr.always_on = parse_bool(v),
"KEBAB_PDF_OCR_ENGINE" => self.ingest.pdf.ocr.engine = v.clone(),
"KEBAB_PDF_OCR_MODEL" => self.ingest.pdf.ocr.model = v.clone(),
"KEBAB_PDF_OCR_ENDPOINT" => {
self.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
self.ingest.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_PDF_OCR_LANGUAGES" => {
self.pdf.ocr.languages = v
self.ingest.pdf.ocr.languages = v
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
@@ -1208,26 +1274,26 @@ impl Config {
}
"KEBAB_PDF_OCR_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.pdf.ocr.max_pixels = n;
self.ingest.pdf.ocr.max_pixels = n;
}
}
"KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => {
if let Ok(n) = v.parse::<u64>() {
self.pdf.ocr.request_timeout_secs = n;
self.ingest.pdf.ocr.request_timeout_secs = n;
}
}
"KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => {
if let Ok(n) = v.parse::<f32>() {
self.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
self.ingest.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
}
}
"KEBAB_PDF_OCR_MIN_CHAR_COUNT" => {
if let Ok(n) = v.parse::<u32>() {
self.pdf.ocr.min_char_count = n;
self.ingest.pdf.ocr.min_char_count = n;
}
}
"KEBAB_PDF_OCR_LANG_HINT" => {
self.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
self.ingest.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
}
// Unknown KEBAB_* keys are silently ignored — see
@@ -1413,11 +1479,27 @@ theme = "dark"
assert_eq!(c, back);
}
#[test]
fn v3_layout_nests_media_under_ingest() {
    let cfg = Config::defaults();
    let ingest = &cfg.ingest;
    // The new paths must compile and be reachable.
    assert_eq!(ingest.max_parallel_extractors, 2);
    assert_eq!(ingest.chunking.target_tokens, 500);
    assert_eq!(ingest.code.max_file_bytes, 262_144);
    assert_eq!(ingest.image.ocr.engine, "ollama-vision");
    assert_eq!(ingest.image.caption.max_pixels, 768);

    let pdf_ocr = &ingest.pdf.ocr;
    assert_eq!(pdf_ocr.model, "qwen2.5vl:3b");
    // The pdf paddle keys (symmetric with image) exist and carry defaults.
    assert_eq!(pdf_ocr.score_thresh, 0.3);
    assert_eq!(pdf_ocr.max_boxes, 1000);
    assert!(pdf_ocr.det_model.is_none());
}
#[test]
fn defaults_match_design_64_score_gate() {
let c = Config::defaults();
assert_eq!(c.rag.score_gate, 0.30);
assert_eq!(c.chunking.target_tokens, 500);
assert_eq!(c.ingest.chunking.target_tokens, 500);
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
assert_eq!(c.models.embedding.dimensions, 1024);
assert_eq!(c.search.rrf_k, 60);
@@ -1462,7 +1544,7 @@ theme = "dark"
"777".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.chunking.target_tokens, 777);
assert_eq!(c.ingest.chunking.target_tokens, 777);
}
#[test]
@@ -1517,24 +1599,24 @@ theme = "dark"
"true".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert!(c.indexing.watch_filesystem);
assert!(c.ingest.watch_filesystem);
}
#[test]
fn image_ocr_defaults_disabled_with_ollama_vision() {
let c = Config::defaults();
assert!(!c.image.ocr.enabled);
assert_eq!(c.image.ocr.engine, "ollama-vision");
assert_eq!(c.image.ocr.model, "gemma4:e4b");
assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]);
assert_eq!(c.image.ocr.max_pixels, 1600);
assert!(!c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.image.ocr.engine, "ollama-vision");
assert_eq!(c.ingest.image.ocr.model, "gemma4:e4b");
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor"]);
assert_eq!(c.ingest.image.ocr.max_pixels, 1600);
}
/// v0.17.2 post-dogfood: matches the legacy hard-coded 300s cap so
/// existing configs that omit the new field keep behaving identically.
#[test]
fn default_ocr_request_timeout_secs_is_300() {
assert_eq!(Config::defaults().image.ocr.request_timeout_secs, 300);
assert_eq!(Config::defaults().ingest.image.ocr.request_timeout_secs, 300);
}
#[test]
@@ -1545,7 +1627,7 @@ theme = "dark"
"900".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.image.ocr.request_timeout_secs, 900);
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 900);
}
/// post-v0.17.1 dogfood: a config file written before the OCR
@@ -1555,7 +1637,7 @@ theme = "dark"
#[test]
fn legacy_config_without_ocr_request_timeout_secs_uses_default() {
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
assert_eq!(c.image.ocr.request_timeout_secs, 300);
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 300);
}
// ── p9-fb-41: multi-hop RAG knobs ────────────────────────────────────
@@ -1707,14 +1789,14 @@ theme = "dark"
);
env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
let c = Config::defaults().apply_env(&env);
assert!(c.image.ocr.enabled);
assert_eq!(c.image.ocr.model, "gemma4:31b");
assert!(c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.image.ocr.model, "gemma4:31b");
assert_eq!(
c.image.ocr.endpoint.as_deref(),
c.ingest.image.ocr.endpoint.as_deref(),
Some("http://192.168.0.47:11434")
);
assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]);
assert_eq!(c.image.ocr.max_pixels, 2048);
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor", "jpn"]);
assert_eq!(c.ingest.image.ocr.max_pixels, 2048);
}
/// Pre-P6 config files don't have an `[image]` section. The
@@ -1723,9 +1805,9 @@ theme = "dark"
#[test]
fn image_caption_defaults_disabled() {
let c = Config::defaults();
assert!(!c.image.caption.enabled);
assert_eq!(c.image.caption.max_pixels, 768);
assert_eq!(c.image.caption.prompt_template_version, "caption-v1");
assert!(!c.ingest.image.caption.enabled);
assert_eq!(c.ingest.image.caption.max_pixels, 768);
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v1");
}
#[test]
@@ -1744,9 +1826,9 @@ theme = "dark"
"caption-v2".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert!(c.image.caption.enabled);
assert_eq!(c.image.caption.max_pixels, 1024);
assert_eq!(c.image.caption.prompt_template_version, "caption-v2");
assert!(c.ingest.image.caption.enabled);
assert_eq!(c.ingest.image.caption.max_pixels, 1024);
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v2");
}
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
@@ -1757,7 +1839,7 @@ theme = "dark"
let mut env = HashMap::new();
env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new());
let c = Config::defaults().apply_env(&env);
assert_eq!(c.image.ocr.endpoint, None);
assert_eq!(c.ingest.image.ocr.endpoint, None);
}
#[test]
@@ -1820,7 +1902,7 @@ explain_default = false
max_context_tokens = 8000
"#;
let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
assert_eq!(c.image, ImageCfg::defaults());
assert_eq!(c.ingest.image, ImageCfg::defaults());
}
/// p9-fb-25: legacy config with `workspace.include = [...]` must

View File

@@ -254,12 +254,16 @@ mod tests {
fn annotated_default_has_all_sections_and_parses_back_to_defaults() {
let doc = annotated_default_document();
let text = doc.to_string();
// PdfCfg/ImageCfg/ModelsCfg/IngestCfg 는 스칼라 필드가 없어 bare
// `[pdf]` 등은 안 나오고 `[pdf.ocr]` 같은 하위 테이블만 직렬화된다.
// v3: 미디어 형식 섹션이 전부 `[ingest.*]` 하위로 통합됐다. IngestCfg
// 는 스칼라(병렬도) 필드가 있어 bare `[ingest]` + 하위 테이블이 함께
// 직렬화된다.
for section in [
"[workspace]",
"[ingest]",
"[ingest.chunking]",
"[ingest.code]",
"[pdf.ocr]",
"[ingest.image.ocr]",
"[ingest.pdf.ocr]",
"[logging]",
"[ui]",
] {

View File

@@ -47,20 +47,20 @@ lang_hint = "kor"
#[test]
fn pdf_ocr_defaults_off_with_qwen_3b() {
let cfg = Config::defaults();
assert!(!cfg.pdf.ocr.enabled);
assert!(!cfg.pdf.ocr.always_on);
assert_eq!(cfg.pdf.ocr.engine, "ollama-vision");
assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:3b");
assert!(cfg.pdf.ocr.endpoint.is_none());
assert!(!cfg.ingest.pdf.ocr.enabled);
assert!(!cfg.ingest.pdf.ocr.always_on);
assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision");
assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:3b");
assert!(cfg.ingest.pdf.ocr.endpoint.is_none());
assert_eq!(
cfg.pdf.ocr.languages,
cfg.ingest.pdf.ocr.languages,
vec!["eng".to_string(), "kor".to_string()]
);
assert_eq!(cfg.pdf.ocr.max_pixels, 2048);
assert_eq!(cfg.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28)
assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6);
assert_eq!(cfg.pdf.ocr.min_char_count, 20);
assert_eq!(cfg.pdf.ocr.lang_hint.as_deref(), Some("kor"));
assert_eq!(cfg.ingest.pdf.ocr.max_pixels, 2048);
assert_eq!(cfg.ingest.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28)
assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6);
assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20);
assert_eq!(cfg.ingest.pdf.ocr.lang_hint.as_deref(), Some("kor"));
}
// Test 3: env var override — 4 keys 의 typical override case.
@@ -80,12 +80,12 @@ fn pdf_ocr_env_overrides() {
let cfg = Config::defaults().apply_env(&env);
assert!(cfg.pdf.ocr.enabled);
assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:7b");
assert!(cfg.pdf.ocr.always_on);
assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6);
assert!(cfg.ingest.pdf.ocr.enabled);
assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:7b");
assert!(cfg.ingest.pdf.ocr.always_on);
assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6);
// 다른 env var 가 default 보존
assert_eq!(cfg.pdf.ocr.engine, "ollama-vision");
assert_eq!(cfg.pdf.ocr.min_char_count, 20);
assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision");
assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20);
}