refactor(config): v3 레이아웃 — 미디어 ingest 통합 + pdf paddle 대칭 + float 직렬화
Config 의 indexing/chunking/image/pdf top-level 필드를 ingest: IngestCfg 하나로 통합. leaf 구조체는 불변, 부모 경로만 [ingest.*] 하위로 이동. PdfOcrCfg 에 paddle 대칭 6키(det/rec/dict/score_thresh/unclip_ratio/max_boxes) 추가. ser_f32_clean 으로 f32 직렬화 정리(0.3000000119→0.3). apply_env RHS 를 self.ingest.* 로 갱신(env 키 문자열 LHS 불변). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,18 @@ mod paths;
|
|||||||
pub mod migrate;
|
pub mod migrate;
|
||||||
pub use paths::{expand_path, expand_path_with_base};
|
pub use paths::{expand_path, expand_path_with_base};
|
||||||
|
|
||||||
|
/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다.
|
||||||
|
/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다.
|
||||||
|
/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고
|
||||||
|
/// `kebab config migrate` 산출물이 사람이 읽기 좋게.
|
||||||
|
fn ser_f32_clean<S>(v: &f32, s: S) -> Result<S::Ok, S::Error>
|
||||||
|
where
|
||||||
|
S: serde::Serializer,
|
||||||
|
{
|
||||||
|
let clean: f64 = format!("{v}").parse().unwrap_or(f64::from(*v));
|
||||||
|
s.serialize_f64(clean)
|
||||||
|
}
|
||||||
|
|
||||||
/// Signal: `Config::from_file` / `Config::load` failed due to missing path,
|
/// Signal: `Config::from_file` / `Config::load` failed due to missing path,
|
||||||
/// I/O failure, TOML parse failure, or post-parse validation failure.
|
/// I/O failure, TOML parse failure, or post-parse validation failure.
|
||||||
///
|
///
|
||||||
@@ -39,32 +51,20 @@ pub struct Config {
|
|||||||
pub schema_version: u32,
|
pub schema_version: u32,
|
||||||
pub workspace: WorkspaceCfg,
|
pub workspace: WorkspaceCfg,
|
||||||
pub storage: StorageCfg,
|
pub storage: StorageCfg,
|
||||||
pub indexing: IndexingCfg,
|
|
||||||
pub chunking: ChunkingCfg,
|
|
||||||
pub models: ModelsCfg,
|
pub models: ModelsCfg,
|
||||||
|
/// v3: 모든 미디어 형식 ingest 설정의 우산 — 병렬도(← 옛 `[indexing]`),
|
||||||
|
/// chunking, code, image, pdf 가 전부 `[ingest.*]` 하위로 통합됐다.
|
||||||
|
/// `#[serde(default)]` 로 두어 미변환 / 부분 config 도 로드된다(자동
|
||||||
|
/// 변환은 `Config::from_file` 가 메모리에서 수행 — T6).
|
||||||
|
#[serde(default)]
|
||||||
|
pub ingest: IngestCfg,
|
||||||
pub search: SearchCfg,
|
pub search: SearchCfg,
|
||||||
pub rag: RagCfg,
|
pub rag: RagCfg,
|
||||||
/// Image-pipeline settings (P6: OCR, captioning). Tagged
|
|
||||||
/// `#[serde(default)]` so pre-P6 config files that predate the
|
|
||||||
/// `[image]` section still load — defaults disable OCR / caption
|
|
||||||
/// (they cost a model call per asset).
|
|
||||||
#[serde(default = "ImageCfg::defaults")]
|
|
||||||
pub image: ImageCfg,
|
|
||||||
/// p9-fb-14: TUI palette + role-style mapping. `#[serde(default)]`
|
/// p9-fb-14: TUI palette + role-style mapping. `#[serde(default)]`
|
||||||
/// so configs that predate this section still load (defaults to
|
/// so configs that predate this section still load (defaults to
|
||||||
/// `dark`).
|
/// `dark`).
|
||||||
#[serde(default = "UiCfg::defaults")]
|
#[serde(default = "UiCfg::defaults")]
|
||||||
pub ui: UiCfg,
|
pub ui: UiCfg,
|
||||||
/// p10-1A-1: code ingest settings. `#[serde(default)]` so existing
|
|
||||||
/// config files without an `[ingest]` / `[ingest.code]` section
|
|
||||||
/// load cleanly with built-in defaults.
|
|
||||||
#[serde(default)]
|
|
||||||
pub ingest: IngestCfg,
|
|
||||||
/// v0.20.0 sub-item 1: PDF ingest pipeline settings. `#[serde(default)]`
|
|
||||||
/// so pre-v0.20 config files without a `[pdf]` section load with
|
|
||||||
/// built-in defaults (OCR disabled — opt-in for scanned PDF KB).
|
|
||||||
#[serde(default = "PdfCfg::defaults")]
|
|
||||||
pub pdf: PdfCfg,
|
|
||||||
/// v0.20.x ingest log surface. `#[serde(default)]` so pre-v0.20
|
/// v0.20.x ingest log surface. `#[serde(default)]` so pre-v0.20
|
||||||
/// config files without a `[logging]` section load with built-in
|
/// config files without a `[logging]` section load with built-in
|
||||||
/// defaults (enabled=true, dir=~/.local/state/kebab/logs).
|
/// defaults (enabled=true, dir=~/.local/state/kebab/logs).
|
||||||
@@ -104,13 +104,6 @@ pub struct StorageCfg {
|
|||||||
pub copy_threshold_mb: u64,
|
pub copy_threshold_mb: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
||||||
pub struct IndexingCfg {
|
|
||||||
pub max_parallel_extractors: u32,
|
|
||||||
pub max_parallel_embeddings: u32,
|
|
||||||
pub watch_filesystem: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct ChunkingCfg {
|
pub struct ChunkingCfg {
|
||||||
pub target_tokens: usize,
|
pub target_tokens: usize,
|
||||||
@@ -119,6 +112,17 @@ pub struct ChunkingCfg {
|
|||||||
pub chunker_version: String,
|
pub chunker_version: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl ChunkingCfg {
|
||||||
|
pub fn defaults() -> Self {
|
||||||
|
Self {
|
||||||
|
target_tokens: 500,
|
||||||
|
overlap_tokens: 80,
|
||||||
|
respect_markdown_headings: true,
|
||||||
|
chunker_version: "md-heading-v1".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct ModelsCfg {
|
pub struct ModelsCfg {
|
||||||
pub embedding: EmbeddingModelCfg,
|
pub embedding: EmbeddingModelCfg,
|
||||||
@@ -186,6 +190,7 @@ pub struct LlmCfg {
|
|||||||
pub model: String,
|
pub model: String,
|
||||||
pub context_tokens: usize,
|
pub context_tokens: usize,
|
||||||
pub endpoint: String,
|
pub endpoint: String,
|
||||||
|
#[serde(serialize_with = "ser_f32_clean")]
|
||||||
pub temperature: f32,
|
pub temperature: f32,
|
||||||
pub seed: u64,
|
pub seed: u64,
|
||||||
/// v0.17.0 post-dogfood: Hard ceiling on a single HTTP exchange to
|
/// v0.17.0 post-dogfood: Hard ceiling on a single HTTP exchange to
|
||||||
@@ -244,6 +249,7 @@ fn default_stale_threshold_days() -> u32 {
|
|||||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct RagCfg {
|
pub struct RagCfg {
|
||||||
pub prompt_template_version: String,
|
pub prompt_template_version: String,
|
||||||
|
#[serde(serialize_with = "ser_f32_clean")]
|
||||||
pub score_gate: f32,
|
pub score_gate: f32,
|
||||||
pub explain_default: bool,
|
pub explain_default: bool,
|
||||||
pub max_context_tokens: usize,
|
pub max_context_tokens: usize,
|
||||||
@@ -293,7 +299,7 @@ pub struct RagCfg {
|
|||||||
///
|
///
|
||||||
/// Single-pass `ask` ignores this knob entirely — only multi-hop
|
/// Single-pass `ask` ignores this knob entirely — only multi-hop
|
||||||
/// runs through the verification step (PR-9c-2 wires it).
|
/// runs through the verification step (PR-9c-2 wires it).
|
||||||
#[serde(default = "default_nli_threshold")]
|
#[serde(default = "default_nli_threshold", serialize_with = "ser_f32_clean")]
|
||||||
pub nli_threshold: f32,
|
pub nli_threshold: f32,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -397,11 +403,11 @@ pub struct OcrCfg {
|
|||||||
pub dict: Option<String>,
|
pub dict: Option<String>,
|
||||||
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
|
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
|
||||||
/// probability is below this are dropped. Default `0.3`.
|
/// probability is below this are dropped. Default `0.3`.
|
||||||
#[serde(default = "default_ocr_score_thresh")]
|
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
|
||||||
pub score_thresh: f32,
|
pub score_thresh: f32,
|
||||||
/// Polygon unclip ratio applied to each detected box before crop.
|
/// Polygon unclip ratio applied to each detected box before crop.
|
||||||
/// Larger = more padding around the text. Default `1.5`.
|
/// Larger = more padding around the text. Default `1.5`.
|
||||||
#[serde(default = "default_ocr_unclip_ratio")]
|
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
|
||||||
pub unclip_ratio: f32,
|
pub unclip_ratio: f32,
|
||||||
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
|
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
|
||||||
/// past this count are truncated with a warning. Default `1000`.
|
/// past this count are truncated with a warning. Default `1000`.
|
||||||
@@ -583,7 +589,7 @@ pub struct PdfOcrCfg {
|
|||||||
/// Valid char ratio threshold (0.0..=1.0). Page with ratio below
|
/// Valid char ratio threshold (0.0..=1.0). Page with ratio below
|
||||||
/// this is classified as scanned/mojibake → OCR fallback. Default
|
/// this is classified as scanned/mojibake → OCR fallback. Default
|
||||||
/// `0.5`.
|
/// `0.5`.
|
||||||
#[serde(default = "default_pdf_ocr_valid_ratio")]
|
#[serde(default = "default_pdf_ocr_valid_ratio", serialize_with = "ser_f32_clean")]
|
||||||
pub valid_ratio_threshold: f32,
|
pub valid_ratio_threshold: f32,
|
||||||
/// Minimum char count per page below which page is auto-scanned.
|
/// Minimum char count per page below which page is auto-scanned.
|
||||||
/// Default `20`.
|
/// Default `20`.
|
||||||
@@ -592,6 +598,30 @@ pub struct PdfOcrCfg {
|
|||||||
/// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
|
/// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
|
||||||
#[serde(default = "default_pdf_ocr_lang_hint")]
|
#[serde(default = "default_pdf_ocr_lang_hint")]
|
||||||
pub lang_hint: Option<String>,
|
pub lang_hint: Option<String>,
|
||||||
|
|
||||||
|
// ── paddle-onnx engine overrides (v3) ───────────────────────────────
|
||||||
|
// Symmetric with `[ingest.image.ocr]`. v2 의 "pdf paddle 이 image 의
|
||||||
|
// 모델 경로를 빌려쓰던" 비대칭을 제거 — pdf 자체 키로 옮긴다. 마이그레이션
|
||||||
|
// (T5)이 image 값을 이 키로 복사해 signature 바이트 동일 유지. 전부
|
||||||
|
// `#[serde(default)]` 이라 pre-v3 config 도 로드.
|
||||||
|
/// Override path to the detection ONNX model. `None` → bundled.
|
||||||
|
#[serde(default)]
|
||||||
|
pub det_model: Option<String>,
|
||||||
|
/// Override path to the recognition ONNX model. `None` → bundled.
|
||||||
|
#[serde(default)]
|
||||||
|
pub rec_model: Option<String>,
|
||||||
|
/// Override path to the character dictionary. `None` → bundled.
|
||||||
|
#[serde(default)]
|
||||||
|
pub dict: Option<String>,
|
||||||
|
/// DBNet detection box score threshold (0.0..=1.0). Default `0.3`.
|
||||||
|
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
|
||||||
|
pub score_thresh: f32,
|
||||||
|
/// Polygon unclip ratio applied to each detected box. Default `1.5`.
|
||||||
|
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
|
||||||
|
pub unclip_ratio: f32,
|
||||||
|
/// Hard cap on detected boxes per page (runaway guard). Default `1000`.
|
||||||
|
#[serde(default = "default_ocr_max_boxes")]
|
||||||
|
pub max_boxes: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PdfOcrCfg {
|
impl PdfOcrCfg {
|
||||||
@@ -608,6 +638,12 @@ impl PdfOcrCfg {
|
|||||||
valid_ratio_threshold: default_pdf_ocr_valid_ratio(),
|
valid_ratio_threshold: default_pdf_ocr_valid_ratio(),
|
||||||
min_char_count: default_pdf_ocr_min_char_count(),
|
min_char_count: default_pdf_ocr_min_char_count(),
|
||||||
lang_hint: default_pdf_ocr_lang_hint(),
|
lang_hint: default_pdf_ocr_lang_hint(),
|
||||||
|
det_model: None,
|
||||||
|
rec_model: None,
|
||||||
|
dict: None,
|
||||||
|
score_thresh: default_ocr_score_thresh(),
|
||||||
|
unclip_ratio: default_ocr_unclip_ratio(),
|
||||||
|
max_boxes: default_ocr_max_boxes(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -659,12 +695,47 @@ impl UiCfg {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// p10-1A-1: top-level ingest configuration wrapper. Contains per-media-type
|
/// v3: 모든 미디어 형식 ingest 설정의 우산. 스칼라(병렬도)는 ← 옛 `[indexing]`,
|
||||||
/// sub-sections; currently only `code` is defined.
|
/// 미디어별 하위 테이블(chunking/code/image/pdf)은 ← 옛 top-level 섹션.
|
||||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
/// 직렬화 순서 = 필드 순서: 스칼라(병렬도) 먼저, 하위 테이블 뒤
|
||||||
#[serde(default)]
|
/// (TOML 의 "bare key 는 sub-table header 앞" 규칙 준수).
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct IngestCfg {
|
pub struct IngestCfg {
|
||||||
|
#[serde(default = "default_max_parallel_extractors")]
|
||||||
|
pub max_parallel_extractors: u32,
|
||||||
|
#[serde(default = "default_max_parallel_embeddings")]
|
||||||
|
pub max_parallel_embeddings: u32,
|
||||||
|
#[serde(default)]
|
||||||
|
pub watch_filesystem: bool,
|
||||||
|
#[serde(default = "ChunkingCfg::defaults")]
|
||||||
|
pub chunking: ChunkingCfg,
|
||||||
|
#[serde(default)]
|
||||||
pub code: IngestCodeCfg,
|
pub code: IngestCodeCfg,
|
||||||
|
#[serde(default = "ImageCfg::defaults")]
|
||||||
|
pub image: ImageCfg,
|
||||||
|
#[serde(default = "PdfCfg::defaults")]
|
||||||
|
pub pdf: PdfCfg,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for IngestCfg {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
max_parallel_extractors: default_max_parallel_extractors(),
|
||||||
|
max_parallel_embeddings: default_max_parallel_embeddings(),
|
||||||
|
watch_filesystem: false,
|
||||||
|
chunking: ChunkingCfg::defaults(),
|
||||||
|
code: IngestCodeCfg::default(),
|
||||||
|
image: ImageCfg::defaults(),
|
||||||
|
pdf: PdfCfg::defaults(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde default for `IngestCfg::max_parallel_extractors`.
fn default_max_parallel_extractors() -> u32 {
    const DEFAULT_EXTRACTORS: u32 = 2;
    DEFAULT_EXTRACTORS
}
|
||||||
|
/// Serde default for `IngestCfg::max_parallel_embeddings`.
fn default_max_parallel_embeddings() -> u32 {
    const DEFAULT_EMBEDDINGS: u32 = 1;
    DEFAULT_EMBEDDINGS
}
|
||||||
|
|
||||||
/// p10-1A-1: settings for the code ingest pipeline. All fields have
|
/// p10-1A-1: settings for the code ingest pipeline. All fields have
|
||||||
@@ -728,17 +799,6 @@ impl Config {
|
|||||||
runs_dir: "{data_dir}/runs".to_string(),
|
runs_dir: "{data_dir}/runs".to_string(),
|
||||||
copy_threshold_mb: 100,
|
copy_threshold_mb: 100,
|
||||||
},
|
},
|
||||||
indexing: IndexingCfg {
|
|
||||||
max_parallel_extractors: 2,
|
|
||||||
max_parallel_embeddings: 1,
|
|
||||||
watch_filesystem: false,
|
|
||||||
},
|
|
||||||
chunking: ChunkingCfg {
|
|
||||||
target_tokens: 500,
|
|
||||||
overlap_tokens: 80,
|
|
||||||
respect_markdown_headings: true,
|
|
||||||
chunker_version: "md-heading-v1".to_string(),
|
|
||||||
},
|
|
||||||
models: ModelsCfg {
|
models: ModelsCfg {
|
||||||
embedding: EmbeddingModelCfg {
|
embedding: EmbeddingModelCfg {
|
||||||
provider: "fastembed".to_string(),
|
provider: "fastembed".to_string(),
|
||||||
@@ -765,6 +825,15 @@ impl Config {
|
|||||||
},
|
},
|
||||||
nli: NliCfg::defaults(),
|
nli: NliCfg::defaults(),
|
||||||
},
|
},
|
||||||
|
ingest: IngestCfg {
|
||||||
|
max_parallel_extractors: 2,
|
||||||
|
max_parallel_embeddings: 1,
|
||||||
|
watch_filesystem: false,
|
||||||
|
chunking: ChunkingCfg::defaults(),
|
||||||
|
code: IngestCodeCfg::default(),
|
||||||
|
image: ImageCfg::defaults(),
|
||||||
|
pdf: PdfCfg::defaults(),
|
||||||
|
},
|
||||||
search: SearchCfg {
|
search: SearchCfg {
|
||||||
default_k: 10,
|
default_k: 10,
|
||||||
hybrid_fusion: "rrf".to_string(),
|
hybrid_fusion: "rrf".to_string(),
|
||||||
@@ -783,10 +852,7 @@ impl Config {
|
|||||||
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
|
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
|
||||||
nli_threshold: default_nli_threshold(),
|
nli_threshold: default_nli_threshold(),
|
||||||
},
|
},
|
||||||
image: ImageCfg::defaults(),
|
|
||||||
ui: UiCfg::defaults(),
|
ui: UiCfg::defaults(),
|
||||||
ingest: IngestCfg::default(),
|
|
||||||
pdf: PdfCfg::defaults(),
|
|
||||||
logging: LoggingCfg::default(),
|
logging: LoggingCfg::default(),
|
||||||
// p9-fb-05: defaults are not loaded from disk, so no
|
// p9-fb-05: defaults are not loaded from disk, so no
|
||||||
// source_dir. Relative `workspace.root` (rare with
|
// source_dir. Relative `workspace.root` (rare with
|
||||||
@@ -963,33 +1029,33 @@ impl Config {
|
|||||||
// indexing
|
// indexing
|
||||||
"KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS" => {
|
"KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.indexing.max_parallel_extractors = n;
|
self.ingest.max_parallel_extractors = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => {
|
"KEBAB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.indexing.max_parallel_embeddings = n;
|
self.ingest.max_parallel_embeddings = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_INDEXING_WATCH_FILESYSTEM" => {
|
"KEBAB_INDEXING_WATCH_FILESYSTEM" => {
|
||||||
self.indexing.watch_filesystem = parse_bool(v);
|
self.ingest.watch_filesystem = parse_bool(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
// chunking
|
// chunking
|
||||||
"KEBAB_CHUNKING_TARGET_TOKENS" => {
|
"KEBAB_CHUNKING_TARGET_TOKENS" => {
|
||||||
if let Ok(n) = v.parse::<usize>() {
|
if let Ok(n) = v.parse::<usize>() {
|
||||||
self.chunking.target_tokens = n;
|
self.ingest.chunking.target_tokens = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_CHUNKING_OVERLAP_TOKENS" => {
|
"KEBAB_CHUNKING_OVERLAP_TOKENS" => {
|
||||||
if let Ok(n) = v.parse::<usize>() {
|
if let Ok(n) = v.parse::<usize>() {
|
||||||
self.chunking.overlap_tokens = n;
|
self.ingest.chunking.overlap_tokens = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => {
|
"KEBAB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => {
|
||||||
self.chunking.respect_markdown_headings = parse_bool(v);
|
self.ingest.chunking.respect_markdown_headings = parse_bool(v);
|
||||||
}
|
}
|
||||||
"KEBAB_CHUNKING_CHUNKER_VERSION" => self.chunking.chunker_version = v.clone(),
|
"KEBAB_CHUNKING_CHUNKER_VERSION" => self.ingest.chunking.chunker_version = v.clone(),
|
||||||
|
|
||||||
// models.embedding
|
// models.embedding
|
||||||
"KEBAB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(),
|
"KEBAB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(),
|
||||||
@@ -1122,18 +1188,18 @@ impl Config {
|
|||||||
|
|
||||||
// image.ocr
|
// image.ocr
|
||||||
"KEBAB_IMAGE_OCR_ENABLED" => {
|
"KEBAB_IMAGE_OCR_ENABLED" => {
|
||||||
self.image.ocr.enabled = parse_bool(v);
|
self.ingest.image.ocr.enabled = parse_bool(v);
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_ENGINE" => self.image.ocr.engine = v.clone(),
|
"KEBAB_IMAGE_OCR_ENGINE" => self.ingest.image.ocr.engine = v.clone(),
|
||||||
"KEBAB_IMAGE_OCR_MODEL" => self.image.ocr.model = v.clone(),
|
"KEBAB_IMAGE_OCR_MODEL" => self.ingest.image.ocr.model = v.clone(),
|
||||||
"KEBAB_IMAGE_OCR_ENDPOINT" => {
|
"KEBAB_IMAGE_OCR_ENDPOINT" => {
|
||||||
// Empty env value is treated the same as "fall back
|
// Empty env value is treated the same as "fall back
|
||||||
// to models.llm.endpoint" — i.e. set None.
|
// to models.llm.endpoint" — i.e. set None.
|
||||||
self.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
|
self.ingest.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_LANGUAGES" => {
|
"KEBAB_IMAGE_OCR_LANGUAGES" => {
|
||||||
// Comma-separated list, e.g. "eng,kor".
|
// Comma-separated list, e.g. "eng,kor".
|
||||||
self.image.ocr.languages = v
|
self.ingest.image.ocr.languages = v
|
||||||
.split(',')
|
.split(',')
|
||||||
.map(|s| s.trim().to_string())
|
.map(|s| s.trim().to_string())
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
@@ -1141,66 +1207,66 @@ impl Config {
|
|||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_MAX_PIXELS" => {
|
"KEBAB_IMAGE_OCR_MAX_PIXELS" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.image.ocr.max_pixels = n;
|
self.ingest.image.ocr.max_pixels = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS" => {
|
"KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS" => {
|
||||||
if let Ok(n) = v.parse::<u64>() {
|
if let Ok(n) = v.parse::<u64>() {
|
||||||
self.image.ocr.request_timeout_secs = n;
|
self.ingest.image.ocr.request_timeout_secs = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// paddle-onnx engine overrides (v0.27.0). Empty string → None
|
// paddle-onnx engine overrides (v0.27.0). Empty string → None
|
||||||
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
|
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
|
||||||
"KEBAB_IMAGE_OCR_DET_MODEL" => {
|
"KEBAB_IMAGE_OCR_DET_MODEL" => {
|
||||||
self.image.ocr.det_model =
|
self.ingest.image.ocr.det_model =
|
||||||
if v.is_empty() { None } else { Some(v.clone()) };
|
if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_REC_MODEL" => {
|
"KEBAB_IMAGE_OCR_REC_MODEL" => {
|
||||||
self.image.ocr.rec_model =
|
self.ingest.image.ocr.rec_model =
|
||||||
if v.is_empty() { None } else { Some(v.clone()) };
|
if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_DICT" => {
|
"KEBAB_IMAGE_OCR_DICT" => {
|
||||||
self.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
|
self.ingest.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
|
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
|
||||||
if let Ok(f) = v.parse::<f32>() {
|
if let Ok(f) = v.parse::<f32>() {
|
||||||
self.image.ocr.score_thresh = f;
|
self.ingest.image.ocr.score_thresh = f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
|
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
|
||||||
if let Ok(f) = v.parse::<f32>() {
|
if let Ok(f) = v.parse::<f32>() {
|
||||||
self.image.ocr.unclip_ratio = f;
|
self.ingest.image.ocr.unclip_ratio = f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
|
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
|
||||||
if let Ok(n) = v.parse::<usize>() {
|
if let Ok(n) = v.parse::<usize>() {
|
||||||
self.image.ocr.max_boxes = n;
|
self.ingest.image.ocr.max_boxes = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// image.caption (P6-3)
|
// image.caption (P6-3)
|
||||||
"KEBAB_IMAGE_CAPTION_ENABLED" => {
|
"KEBAB_IMAGE_CAPTION_ENABLED" => {
|
||||||
self.image.caption.enabled = parse_bool(v);
|
self.ingest.image.caption.enabled = parse_bool(v);
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
|
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.image.caption.max_pixels = n;
|
self.ingest.image.caption.max_pixels = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
|
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
|
||||||
self.image.caption.prompt_template_version = v.clone();
|
self.ingest.image.caption.prompt_template_version = v.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
// pdf.ocr (v0.20.0 sub-item 1)
|
// pdf.ocr (v0.20.0 sub-item 1)
|
||||||
"KEBAB_PDF_OCR_ENABLED" => self.pdf.ocr.enabled = parse_bool(v),
|
"KEBAB_PDF_OCR_ENABLED" => self.ingest.pdf.ocr.enabled = parse_bool(v),
|
||||||
"KEBAB_PDF_OCR_ALWAYS_ON" => self.pdf.ocr.always_on = parse_bool(v),
|
"KEBAB_PDF_OCR_ALWAYS_ON" => self.ingest.pdf.ocr.always_on = parse_bool(v),
|
||||||
"KEBAB_PDF_OCR_ENGINE" => self.pdf.ocr.engine = v.clone(),
|
"KEBAB_PDF_OCR_ENGINE" => self.ingest.pdf.ocr.engine = v.clone(),
|
||||||
"KEBAB_PDF_OCR_MODEL" => self.pdf.ocr.model = v.clone(),
|
"KEBAB_PDF_OCR_MODEL" => self.ingest.pdf.ocr.model = v.clone(),
|
||||||
"KEBAB_PDF_OCR_ENDPOINT" => {
|
"KEBAB_PDF_OCR_ENDPOINT" => {
|
||||||
self.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
|
self.ingest.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_LANGUAGES" => {
|
"KEBAB_PDF_OCR_LANGUAGES" => {
|
||||||
self.pdf.ocr.languages = v
|
self.ingest.pdf.ocr.languages = v
|
||||||
.split(',')
|
.split(',')
|
||||||
.map(|s| s.trim().to_string())
|
.map(|s| s.trim().to_string())
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
@@ -1208,26 +1274,26 @@ impl Config {
|
|||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_MAX_PIXELS" => {
|
"KEBAB_PDF_OCR_MAX_PIXELS" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.pdf.ocr.max_pixels = n;
|
self.ingest.pdf.ocr.max_pixels = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => {
|
"KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => {
|
||||||
if let Ok(n) = v.parse::<u64>() {
|
if let Ok(n) = v.parse::<u64>() {
|
||||||
self.pdf.ocr.request_timeout_secs = n;
|
self.ingest.pdf.ocr.request_timeout_secs = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => {
|
"KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => {
|
||||||
if let Ok(n) = v.parse::<f32>() {
|
if let Ok(n) = v.parse::<f32>() {
|
||||||
self.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
|
self.ingest.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_MIN_CHAR_COUNT" => {
|
"KEBAB_PDF_OCR_MIN_CHAR_COUNT" => {
|
||||||
if let Ok(n) = v.parse::<u32>() {
|
if let Ok(n) = v.parse::<u32>() {
|
||||||
self.pdf.ocr.min_char_count = n;
|
self.ingest.pdf.ocr.min_char_count = n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"KEBAB_PDF_OCR_LANG_HINT" => {
|
"KEBAB_PDF_OCR_LANG_HINT" => {
|
||||||
self.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
|
self.ingest.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unknown KEBAB_* keys are silently ignored — see
|
// Unknown KEBAB_* keys are silently ignored — see
|
||||||
@@ -1413,11 +1479,27 @@ theme = "dark"
|
|||||||
assert_eq!(c, back);
|
assert_eq!(c, back);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks that the v3 layout exposes every media section under
/// `Config::ingest` and that the built-in defaults are unchanged.
#[test]
fn v3_layout_nests_media_under_ingest() {
    let cfg = Config::defaults();
    let ingest = &cfg.ingest;

    // The new nested paths must compile and be reachable.
    assert_eq!(ingest.max_parallel_extractors, 2);
    assert_eq!(ingest.chunking.target_tokens, 500);
    assert_eq!(ingest.code.max_file_bytes, 262_144);
    assert_eq!(ingest.image.ocr.engine, "ollama-vision");
    assert_eq!(ingest.image.caption.max_pixels, 768);

    let pdf_ocr = &ingest.pdf.ocr;
    assert_eq!(pdf_ocr.model, "qwen2.5vl:3b");

    // The paddle keys mirrored onto the pdf OCR config exist and carry
    // their default values.
    assert_eq!(pdf_ocr.score_thresh, 0.3);
    assert_eq!(pdf_ocr.max_boxes, 1000);
    assert!(pdf_ocr.det_model.is_none());
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn defaults_match_design_64_score_gate() {
|
fn defaults_match_design_64_score_gate() {
|
||||||
let c = Config::defaults();
|
let c = Config::defaults();
|
||||||
assert_eq!(c.rag.score_gate, 0.30);
|
assert_eq!(c.rag.score_gate, 0.30);
|
||||||
assert_eq!(c.chunking.target_tokens, 500);
|
assert_eq!(c.ingest.chunking.target_tokens, 500);
|
||||||
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
|
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
|
||||||
assert_eq!(c.models.embedding.dimensions, 1024);
|
assert_eq!(c.models.embedding.dimensions, 1024);
|
||||||
assert_eq!(c.search.rrf_k, 60);
|
assert_eq!(c.search.rrf_k, 60);
|
||||||
@@ -1462,7 +1544,7 @@ theme = "dark"
|
|||||||
"777".to_string(),
|
"777".to_string(),
|
||||||
);
|
);
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert_eq!(c.chunking.target_tokens, 777);
|
assert_eq!(c.ingest.chunking.target_tokens, 777);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1517,24 +1599,24 @@ theme = "dark"
|
|||||||
"true".to_string(),
|
"true".to_string(),
|
||||||
);
|
);
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert!(c.indexing.watch_filesystem);
|
assert!(c.ingest.watch_filesystem);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn image_ocr_defaults_disabled_with_ollama_vision() {
|
fn image_ocr_defaults_disabled_with_ollama_vision() {
|
||||||
let c = Config::defaults();
|
let c = Config::defaults();
|
||||||
assert!(!c.image.ocr.enabled);
|
assert!(!c.ingest.image.ocr.enabled);
|
||||||
assert_eq!(c.image.ocr.engine, "ollama-vision");
|
assert_eq!(c.ingest.image.ocr.engine, "ollama-vision");
|
||||||
assert_eq!(c.image.ocr.model, "gemma4:e4b");
|
assert_eq!(c.ingest.image.ocr.model, "gemma4:e4b");
|
||||||
assert_eq!(c.image.ocr.languages, vec!["eng", "kor"]);
|
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor"]);
|
||||||
assert_eq!(c.image.ocr.max_pixels, 1600);
|
assert_eq!(c.ingest.image.ocr.max_pixels, 1600);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// v0.17.2 post-dogfood: matches the legacy hard-coded 300s cap so
|
/// v0.17.2 post-dogfood: matches the legacy hard-coded 300s cap so
|
||||||
/// existing configs that omit the new field keep behaving identically.
|
/// existing configs that omit the new field keep behaving identically.
|
||||||
#[test]
|
#[test]
|
||||||
fn default_ocr_request_timeout_secs_is_300() {
|
fn default_ocr_request_timeout_secs_is_300() {
|
||||||
assert_eq!(Config::defaults().image.ocr.request_timeout_secs, 300);
|
assert_eq!(Config::defaults().ingest.image.ocr.request_timeout_secs, 300);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1545,7 +1627,7 @@ theme = "dark"
|
|||||||
"900".to_string(),
|
"900".to_string(),
|
||||||
);
|
);
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert_eq!(c.image.ocr.request_timeout_secs, 900);
|
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 900);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// post-v0.17.1 dogfood: a config file written before the OCR
|
/// post-v0.17.1 dogfood: a config file written before the OCR
|
||||||
@@ -1555,7 +1637,7 @@ theme = "dark"
|
|||||||
#[test]
|
#[test]
|
||||||
fn legacy_config_without_ocr_request_timeout_secs_uses_default() {
|
fn legacy_config_without_ocr_request_timeout_secs_uses_default() {
|
||||||
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
|
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
|
||||||
assert_eq!(c.image.ocr.request_timeout_secs, 300);
|
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 300);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── p9-fb-41: multi-hop RAG knobs ────────────────────────────────────
|
// ── p9-fb-41: multi-hop RAG knobs ────────────────────────────────────
|
||||||
@@ -1707,14 +1789,14 @@ theme = "dark"
|
|||||||
);
|
);
|
||||||
env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
|
env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert!(c.image.ocr.enabled);
|
assert!(c.ingest.image.ocr.enabled);
|
||||||
assert_eq!(c.image.ocr.model, "gemma4:31b");
|
assert_eq!(c.ingest.image.ocr.model, "gemma4:31b");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
c.image.ocr.endpoint.as_deref(),
|
c.ingest.image.ocr.endpoint.as_deref(),
|
||||||
Some("http://192.168.0.47:11434")
|
Some("http://192.168.0.47:11434")
|
||||||
);
|
);
|
||||||
assert_eq!(c.image.ocr.languages, vec!["eng", "kor", "jpn"]);
|
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor", "jpn"]);
|
||||||
assert_eq!(c.image.ocr.max_pixels, 2048);
|
assert_eq!(c.ingest.image.ocr.max_pixels, 2048);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Pre-P6 config files don't have an `[image]` section. The
|
/// Pre-P6 config files don't have an `[image]` section. The
|
||||||
@@ -1723,9 +1805,9 @@ theme = "dark"
|
|||||||
#[test]
|
#[test]
|
||||||
fn image_caption_defaults_disabled() {
|
fn image_caption_defaults_disabled() {
|
||||||
let c = Config::defaults();
|
let c = Config::defaults();
|
||||||
assert!(!c.image.caption.enabled);
|
assert!(!c.ingest.image.caption.enabled);
|
||||||
assert_eq!(c.image.caption.max_pixels, 768);
|
assert_eq!(c.ingest.image.caption.max_pixels, 768);
|
||||||
assert_eq!(c.image.caption.prompt_template_version, "caption-v1");
|
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v1");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1744,9 +1826,9 @@ theme = "dark"
|
|||||||
"caption-v2".to_string(),
|
"caption-v2".to_string(),
|
||||||
);
|
);
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert!(c.image.caption.enabled);
|
assert!(c.ingest.image.caption.enabled);
|
||||||
assert_eq!(c.image.caption.max_pixels, 1024);
|
assert_eq!(c.ingest.image.caption.max_pixels, 1024);
|
||||||
assert_eq!(c.image.caption.prompt_template_version, "caption-v2");
|
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v2");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
|
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
|
||||||
@@ -1757,7 +1839,7 @@ theme = "dark"
|
|||||||
let mut env = HashMap::new();
|
let mut env = HashMap::new();
|
||||||
env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new());
|
env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new());
|
||||||
let c = Config::defaults().apply_env(&env);
|
let c = Config::defaults().apply_env(&env);
|
||||||
assert_eq!(c.image.ocr.endpoint, None);
|
assert_eq!(c.ingest.image.ocr.endpoint, None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1820,7 +1902,7 @@ explain_default = false
|
|||||||
max_context_tokens = 8000
|
max_context_tokens = 8000
|
||||||
"#;
|
"#;
|
||||||
let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
|
let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
|
||||||
assert_eq!(c.image, ImageCfg::defaults());
|
assert_eq!(c.ingest.image, ImageCfg::defaults());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// p9-fb-25: legacy config with `workspace.include = [...]` must
|
/// p9-fb-25: legacy config with `workspace.include = [...]` must
|
||||||
|
|||||||
@@ -254,12 +254,16 @@ mod tests {
|
|||||||
fn annotated_default_has_all_sections_and_parses_back_to_defaults() {
|
fn annotated_default_has_all_sections_and_parses_back_to_defaults() {
|
||||||
let doc = annotated_default_document();
|
let doc = annotated_default_document();
|
||||||
let text = doc.to_string();
|
let text = doc.to_string();
|
||||||
// PdfCfg/ImageCfg/ModelsCfg/IngestCfg 는 스칼라 필드가 없어 bare
|
// v3: 미디어 형식 섹션이 전부 `[ingest.*]` 하위로 통합됐다. IngestCfg
|
||||||
// `[pdf]` 등은 안 나오고 `[pdf.ocr]` 같은 하위 테이블만 직렬화된다.
|
// 는 스칼라(병렬도) 필드가 있어 bare `[ingest]` + 하위 테이블이 함께
|
||||||
|
// 직렬화된다.
|
||||||
for section in [
|
for section in [
|
||||||
"[workspace]",
|
"[workspace]",
|
||||||
|
"[ingest]",
|
||||||
|
"[ingest.chunking]",
|
||||||
"[ingest.code]",
|
"[ingest.code]",
|
||||||
"[pdf.ocr]",
|
"[ingest.image.ocr]",
|
||||||
|
"[ingest.pdf.ocr]",
|
||||||
"[logging]",
|
"[logging]",
|
||||||
"[ui]",
|
"[ui]",
|
||||||
] {
|
] {
|
||||||
|
|||||||
@@ -47,20 +47,20 @@ lang_hint = "kor"
|
|||||||
#[test]
|
#[test]
|
||||||
fn pdf_ocr_defaults_off_with_qwen_3b() {
|
fn pdf_ocr_defaults_off_with_qwen_3b() {
|
||||||
let cfg = Config::defaults();
|
let cfg = Config::defaults();
|
||||||
assert!(!cfg.pdf.ocr.enabled);
|
assert!(!cfg.ingest.pdf.ocr.enabled);
|
||||||
assert!(!cfg.pdf.ocr.always_on);
|
assert!(!cfg.ingest.pdf.ocr.always_on);
|
||||||
assert_eq!(cfg.pdf.ocr.engine, "ollama-vision");
|
assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision");
|
||||||
assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:3b");
|
assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:3b");
|
||||||
assert!(cfg.pdf.ocr.endpoint.is_none());
|
assert!(cfg.ingest.pdf.ocr.endpoint.is_none());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
cfg.pdf.ocr.languages,
|
cfg.ingest.pdf.ocr.languages,
|
||||||
vec!["eng".to_string(), "kor".to_string()]
|
vec!["eng".to_string(), "kor".to_string()]
|
||||||
);
|
);
|
||||||
assert_eq!(cfg.pdf.ocr.max_pixels, 2048);
|
assert_eq!(cfg.ingest.pdf.ocr.max_pixels, 2048);
|
||||||
assert_eq!(cfg.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28)
|
assert_eq!(cfg.ingest.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28)
|
||||||
assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6);
|
assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6);
|
||||||
assert_eq!(cfg.pdf.ocr.min_char_count, 20);
|
assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20);
|
||||||
assert_eq!(cfg.pdf.ocr.lang_hint.as_deref(), Some("kor"));
|
assert_eq!(cfg.ingest.pdf.ocr.lang_hint.as_deref(), Some("kor"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test 3: env var override — 4 keys 의 typical override case.
|
// Test 3: env var override — 4 keys 의 typical override case.
|
||||||
@@ -80,12 +80,12 @@ fn pdf_ocr_env_overrides() {
|
|||||||
|
|
||||||
let cfg = Config::defaults().apply_env(&env);
|
let cfg = Config::defaults().apply_env(&env);
|
||||||
|
|
||||||
assert!(cfg.pdf.ocr.enabled);
|
assert!(cfg.ingest.pdf.ocr.enabled);
|
||||||
assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:7b");
|
assert_eq!(cfg.ingest.pdf.ocr.model, "qwen2.5vl:7b");
|
||||||
assert!(cfg.pdf.ocr.always_on);
|
assert!(cfg.ingest.pdf.ocr.always_on);
|
||||||
assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6);
|
assert!((cfg.ingest.pdf.ocr.valid_ratio_threshold - 0.75).abs() < 1e-6);
|
||||||
|
|
||||||
// 다른 env var 가 default 보존
|
// 다른 env var 가 default 보존
|
||||||
assert_eq!(cfg.pdf.ocr.engine, "ollama-vision");
|
assert_eq!(cfg.ingest.pdf.ocr.engine, "ollama-vision");
|
||||||
assert_eq!(cfg.pdf.ocr.min_char_count, 20);
|
assert_eq!(cfg.ingest.pdf.ocr.min_char_count, 20);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user