Files
kebab/crates/kebab-config/src/lib.rs
altair823 15e6918cef feat(config): env 이름 보존 RHS 갱신 + pdf paddle 신규 env 6키
apply_env whitelist 의 키 문자열(LHS) 전부 불변, 대입 대상만 self.ingest.*
(불변식 #2). KEBAB_PDF_OCR_{DET_MODEL,REC_MODEL,DICT,SCORE_THRESH,
UNCLIP_RATIO,MAX_BOXES} 신규(image.ocr paddle 패턴 대칭).
게이트: clippy --workspace --all-targets 0, kebab-config/app/eval 테스트 green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 12:56:25 +00:00

2253 lines
88 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! `kb-config` — `Config` schema and XDG path resolution (§6).
//!
//! Layer order (`Config::load`): defaults → file → env (`KEBAB_<SECTION>_<KEY>`).
//! CLI overrides land later, applied by `kb-cli` after `Config::load`.
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
mod paths;
pub mod migrate;
pub use paths::{expand_path, expand_path_with_base};
/// f32 의 shortest round-trip(Display)을 f64 로 재파싱해 직렬화한다.
/// `0.3_f32` 가 `0.30000001192092896` 으로 새지 않고 `0.3` 으로 출력되게 한다.
/// 마이그레이션 시 toml_edit relocation 의 무손실 비교를 깨지 않도록, 그리고
/// `kebab config migrate` 산출물이 사람이 읽기 좋게.
fn ser_f32_clean<S>(v: &f32, s: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let clean: f64 = format!("{v}").parse().unwrap_or(f64::from(*v));
s.serialize_f64(clean)
}
/// Signal: `Config::from_file` / `Config::load` failed due to missing path,
/// I/O failure, TOML parse failure, or post-parse validation failure.
///
/// Wrapped into `anyhow::Error` at the API boundary so callers that need
/// structured details (e.g. kebab-cli's `error_classify`) can
/// `downcast_ref::<ConfigInvalid>()` for the wire record.
#[derive(Debug, thiserror::Error)]
#[error("config invalid at {path}: {cause}")]
pub struct ConfigInvalid {
pub path: PathBuf,
pub cause: String,
}
/// p20-bugfix3 Bug #10: explicit `--config <path>` was missing → silent
/// fallback to defaults instead of fail-fast. `kebab-app::error_wire::classify`
/// downcasts → `code: "config_not_found"` ErrorV1.
#[derive(Debug, thiserror::Error)]
#[error("config file does not exist: {path}")]
pub struct ConfigNotFound {
pub path: PathBuf,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Config {
pub schema_version: u32,
pub workspace: WorkspaceCfg,
pub storage: StorageCfg,
pub models: ModelsCfg,
/// v3: 모든 미디어 형식 ingest 설정의 우산 — 병렬도(← 옛 `[indexing]`),
/// chunking, code, image, pdf 가 전부 `[ingest.*]` 하위로 통합됐다.
/// `#[serde(default)]` 로 두어 미변환 / 부분 config 도 로드된다(자동
/// 변환은 `Config::from_file` 가 메모리에서 수행 — T6).
#[serde(default)]
pub ingest: IngestCfg,
pub search: SearchCfg,
pub rag: RagCfg,
/// p9-fb-14: TUI palette + role-style mapping. `#[serde(default)]`
/// so configs that predate this section still load (defaults to
/// `dark`).
#[serde(default = "UiCfg::defaults")]
pub ui: UiCfg,
/// v0.20.x ingest log surface. `#[serde(default)]` so pre-v0.20
/// config files without a `[logging]` section load with built-in
/// defaults (enabled=true, dir=~/.local/state/kebab/logs).
#[serde(default)]
pub logging: LoggingCfg,
/// p9-fb-05: directory of the on-disk config file this `Config`
/// was loaded from, if any. Populated by `Config::from_file` /
/// `Config::load` — never serialized (`#[serde(skip)]`). Used by
/// `expand_path_with_base` to resolve relative `workspace.root`
/// against the config file's location instead of the user's
/// `cwd` (so `--config /tmp/cfg.toml` + `root = "kb"` reads
/// `/tmp/kb` no matter where the user invoked from).
///
/// `pub(crate)` so external callers can't break the
/// "stamped only by from_file/load" invariant by hand. Use
/// [`Config::with_source_dir`] for tests / programmatic
/// construction that need a specific `source_dir`.
#[serde(skip)]
pub(crate) source_dir: Option<PathBuf>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct WorkspaceCfg {
pub root: String,
pub exclude: Vec<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct StorageCfg {
pub data_dir: String,
pub sqlite: String,
pub vector_dir: String,
pub asset_dir: String,
pub artifact_dir: String,
pub model_dir: String,
pub runs_dir: String,
pub copy_threshold_mb: u64,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ChunkingCfg {
pub target_tokens: usize,
pub overlap_tokens: usize,
pub respect_markdown_headings: bool,
pub chunker_version: String,
}
impl ChunkingCfg {
pub fn defaults() -> Self {
Self {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: true,
chunker_version: "md-heading-v1".to_string(),
}
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ModelsCfg {
pub embedding: EmbeddingModelCfg,
pub llm: LlmCfg,
/// p9-fb-41 PR-9c-1: NLI verifier model + provider knob.
/// `#[serde(default)]` so pre-v0.18 config files that predate the
/// `[models.nli]` section still load with built-in defaults
/// (`Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` / `onnx`).
/// The verifier itself is gated by `[rag].nli_threshold` — even
/// with a model configured here, threshold `0.0` (the default)
/// skips the verification step entirely.
#[serde(default = "NliCfg::defaults")]
pub nli: NliCfg,
}
/// p9-fb-41 PR-9c-1: NLI verifier configuration. The model id flows to
/// `OnnxNliVerifier::new` via `kebab-nli` (PR-9c-2 wiring); the provider
/// is reserved for future verifier swap-in (currently only `"onnx"` is
/// recognized — anything else falls back to the same path).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct NliCfg {
pub model: String,
pub provider: String,
}
impl NliCfg {
pub fn defaults() -> Self {
Self {
model: "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7".to_string(),
provider: "onnx".to_string(),
}
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct EmbeddingModelCfg {
/// `fastembed` (default, onnxruntime), `candle` (pure-Rust, NUMA-safe),
/// or `ollama` (remote HTTP embedding endpoint). `none` disables
/// embeddings (lexical-only). Unknown values error at embedder
/// construction.
pub provider: String,
pub model: String,
pub version: String,
pub dimensions: usize,
pub batch_size: usize,
/// Cap on the CPU worker threads the `candle` provider spins up
/// (sizes the global rayon pool; env `KEBAB_EMBED_THREADS` overrides).
/// `0` = auto (rayon default = #cores). Lever to sidestep the
/// onnxruntime 48-thread NUMA double-free; ignored by the `fastembed`
/// provider. Defaulted on load so pre-0.22 config files still parse.
#[serde(default)]
pub num_threads: u32,
/// HTTP endpoint for the `ollama` embedding provider (e.g.
/// `"http://127.0.0.1:11434"`). `None` (or a missing key in TOML) means
/// "fall back to `models.llm.endpoint`" — same convention as the OCR /
/// vision endpoints. Ignored by the `fastembed` / `candle` providers.
/// Defaulted on load so pre-0.26 config files still parse.
#[serde(default)]
pub endpoint: Option<String>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct LlmCfg {
pub provider: String,
pub model: String,
pub context_tokens: usize,
pub endpoint: String,
#[serde(serialize_with = "ser_f32_clean")]
pub temperature: f32,
pub seed: u64,
/// v0.17.0 post-dogfood: Hard ceiling on a single HTTP exchange to
/// the LLM endpoint (Ollama, etc.). Cold-loading an 8B+ model on
/// CPU-only hosts can spend 60-90s on model load + several minutes
/// on a first inference, blowing past the old hard-coded 300s cap
/// and surfacing as `error: kb-rag: llm.generate_stream` to the
/// user. Config-driven so 16-GB / CPU-only deployments using small
/// (≤4B) models can keep the original 300s and large-model dogfood
/// can dial it up (e.g. 1200s) without rebuilding.
///
/// **Edge case — `0` is NOT a disable sentinel.**
/// `reqwest::ClientBuilder::timeout(Duration::from_secs(0))` sets a
/// 0-second read timeout, so every request fails *immediately* with
/// `error: kb-rag: ollama timeout`. To approximate "no cap", use a
/// large finite value (e.g. `u64::MAX` ≈ 5.8 × 10¹¹ years, or
/// just a generous number like `86400`).
#[serde(default = "default_llm_request_timeout_secs")]
pub request_timeout_secs: u64,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchCfg {
pub default_k: usize,
pub hybrid_fusion: String,
pub rrf_k: u32,
pub snippet_chars: usize,
/// p9-fb-19: in-memory LRU cache capacity for `App::search`.
/// One entry ≈ 5 KB → default 256 caps memory at ~1.3 MB. Set
/// to `0` to disable the cache entirely. Stale entries
/// (corpus_revision mismatch) are evicted on next access.
#[serde(default = "default_cache_capacity")]
pub cache_capacity: usize,
/// p9-fb-32: hits and citations whose source doc was last
/// re-processed more than this many days ago are marked
/// `stale: true` in wire / TUI / CLI surfaces. `0` disables.
#[serde(default = "default_stale_threshold_days")]
pub stale_threshold_days: u32,
}
fn default_cache_capacity() -> usize {
256
}
/// v0.17.0 post-dogfood: matches the legacy hard-coded ceiling so
/// existing configs that omit the field keep behaving identically.
/// Overridable per config / `KEBAB_MODELS_LLM_REQUEST_TIMEOUT_SECS`.
fn default_llm_request_timeout_secs() -> u64 {
300
}
fn default_stale_threshold_days() -> u32 {
30
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RagCfg {
pub prompt_template_version: String,
#[serde(serialize_with = "ser_f32_clean")]
pub score_gate: f32,
pub explain_default: bool,
pub max_context_tokens: usize,
/// p9-fb-41: hard ceiling on the number of multi-hop iterations
/// (decompose iter + decide iters). When the LLM keeps returning
/// `continue` past this depth the pipeline cuts to `synthesize`
/// with `HopRecord.forced_stop = true`. Default `3` — enough for
/// most cross-doc reasoning, low enough to bound LLM cost.
#[serde(default = "default_multi_hop_max_depth")]
pub multi_hop_max_depth: u32,
/// p9-fb-41: cap on how many sub-queries the LLM may emit in a
/// single decompose / decide call. This is the *prompt-side
/// soft hint* — the value the pipeline injects into the
/// decompose / decide prompts so the LLM knows what to aim for.
/// kebab-rag enforces a separate compile-time hard ceiling
/// (`MULTI_HOP_MAX_SUB_QUERIES_HARD_CAP`, currently 10) as a
/// safety net against misbehaving models — if you raise this
/// knob above the hard cap, bump the const in the same PR.
/// Default `5`.
#[serde(default = "default_multi_hop_max_sub_queries_per_iter")]
pub multi_hop_max_sub_queries_per_iter: u32,
/// p9-fb-41: hard ceiling on the deduped chunk pool. When the
/// accumulated pool would exceed this many chunks the pipeline
/// stops accepting new retrieval results and forces synthesize
/// with `forced_stop = true`.
///
/// Default `15` — tuned down from the original 30 in the v0.18
/// pre-cut dogfood (`tasks/HOTFIXES.md` 2026-05-25 fb-41 entry,
/// "post-PR-7 dogfood retest + PR-8 partial mitigation" sub-section).
/// With 30 chunks the synthesize prompt was large enough for
/// gemma3:4b to lose the citation rule + drift into unrelated
/// chunks; 15 keeps the prompt tight while still allowing 3-iter
/// cross-doc reasoning over ~5 chunks per iter.
#[serde(default = "default_multi_hop_max_pool_chunks")]
pub multi_hop_max_pool_chunks: u32,
/// p9-fb-41 PR-9c-1: minimum NLI entailment score required for the
/// multi-hop synthesize answer to be returned as `grounded=true`
/// (spec §2.6 single gate). When the post-synthesize NLI verifier
/// returns `NliScores::faithfulness() < nli_threshold` the
/// pipeline refuses with `RefusalReason::NliVerificationFailed`.
///
/// Default `0.0` = verification disabled — no NLI call, multi-hop
/// matches its PR-3b behavior exactly. Set to e.g. `0.5` to
/// activate the gate. Knob lives on `[rag]` (the gate is a RAG
/// policy, not a model property); the model itself comes from
/// `[models.nli].model`.
///
/// Single-pass `ask` ignores this knob entirely — only multi-hop
/// runs through the verification step (PR-9c-2 wires it).
#[serde(default = "default_nli_threshold", serialize_with = "ser_f32_clean")]
pub nli_threshold: f32,
}
fn default_multi_hop_max_depth() -> u32 {
3
}
fn default_multi_hop_max_sub_queries_per_iter() -> u32 {
5
}
fn default_multi_hop_max_pool_chunks() -> u32 {
15
}
/// p9-fb-41 PR-9c-1: NLI gate disabled by default per spec §2.6
/// (verification opt-in — users explicitly raise the threshold once
/// they're ready to trade refusal-rate for groundedness).
fn default_nli_threshold() -> f32 {
0.0
}
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ImageCfg {
#[serde(default = "OcrCfg::defaults")]
pub ocr: OcrCfg,
#[serde(default = "CaptionCfg::defaults")]
pub caption: CaptionCfg,
}
impl ImageCfg {
pub fn defaults() -> Self {
Self {
ocr: OcrCfg::defaults(),
caption: CaptionCfg::defaults(),
}
}
}
/// OCR settings (P6-2). v1 ships a single Ollama-vision adapter; the
/// `OcrEngine` trait in `kebab-parse-image` keeps the door open for
/// Tesseract / Apple Vision / PaddleOCR engines as feature-gated
/// alternatives in P+. See `tasks/HOTFIXES.md` (2026-05-02) for the
/// rationale on dropping the original Tesseract default.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct OcrCfg {
/// Run OCR on every image during ingest. Default `false` because
/// OCR adds one model call per asset.
pub enabled: bool,
/// Engine identifier. v1 only ships `"ollama-vision"`.
pub engine: String,
/// Model id passed to the engine (e.g. `"gemma4:e4b"` for
/// Ollama-vision).
pub model: String,
/// HTTP endpoint for the OCR engine. `None` (or a missing key in
/// TOML) means "fall back to `models.llm.endpoint`" — convenient
/// when the same Ollama host serves both LLM and vision.
#[serde(default)]
pub endpoint: Option<String>,
/// BCP-47 language hints (e.g. `["eng", "kor"]`). The adapter
/// renders them into the prompt; the LLM honours them probabilistically.
pub languages: Vec<String>,
/// Cap the long edge of the image (in pixels) before sending. Larger
/// images bloat prompt cost. Default `1600`.
pub max_pixels: u32,
/// v0.17.2 post-dogfood: Hard ceiling on a single HTTP exchange to
/// the OCR endpoint. Sister knob to [`LlmCfg::request_timeout_secs`]
/// — kept separate because OCR latency is typically shorter than
/// chat-LLM cold start, and large vision models on CPU-only hosts
/// occasionally need a different budget. See HOTFIXES 2026-05-25
/// for the rationale.
///
/// **Edge case — `0` is NOT a disable sentinel.** Same semantics as
/// [`LlmCfg::request_timeout_secs`]: `Duration::from_secs(0)` means
/// "every request fails immediately" (reqwest 0.12.x — the read
/// timeout is applied as a 0-second deadline), not "no timeout".
/// To approximate "no cap", use a large finite value (e.g.
/// `u64::MAX` ≈ 5.8 × 10¹¹ years, or just a generous number like
/// `86400`).
#[serde(default = "default_ocr_request_timeout_secs")]
pub request_timeout_secs: u64,
// ── paddle-onnx engine overrides (v0.27.0) ──────────────────────────
// Only consulted when `engine == "paddle-onnx"`; the ollama-vision
// engine ignores them. All `#[serde(default)]` so pre-v0.27 config
// files load unchanged.
/// Override path to the detection ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/ppocrv5_mobile_det.onnx` (or the directory
/// named by `KEBAB_IMAGE_OCR_MODEL_DIR`).
#[serde(default)]
pub det_model: Option<String>,
/// Override path to the recognition ONNX model. `None` → bundled
/// `assets/paddleocr-onnx/korean_ppocrv5_mobile_rec.onnx`.
#[serde(default)]
pub rec_model: Option<String>,
/// Override path to the character dictionary. `None` → bundled
/// `assets/paddleocr-onnx/korean_dict.txt`.
#[serde(default)]
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Boxes whose mean
/// probability is below this are dropped. Default `0.3`.
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box before crop.
/// Larger = more padding around the text. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per image (runaway guard). Extra boxes
/// past this count are truncated with a warning. Default `1000`.
#[serde(default = "default_ocr_max_boxes")]
pub max_boxes: usize,
}
impl OcrCfg {
pub fn defaults() -> Self {
Self {
enabled: false,
engine: "ollama-vision".to_string(),
model: "gemma4:e4b".to_string(),
endpoint: None,
languages: vec!["eng".to_string(), "kor".to_string()],
max_pixels: 1600,
request_timeout_secs: default_ocr_request_timeout_secs(),
det_model: None,
rec_model: None,
dict: None,
score_thresh: default_ocr_score_thresh(),
unclip_ratio: default_ocr_unclip_ratio(),
max_boxes: default_ocr_max_boxes(),
}
}
}
/// paddle-onnx DBNet box score threshold default. See [`OcrCfg::score_thresh`].
fn default_ocr_score_thresh() -> f32 {
0.3
}
/// paddle-onnx unclip ratio default. See [`OcrCfg::unclip_ratio`].
fn default_ocr_unclip_ratio() -> f32 {
1.5
}
/// paddle-onnx box-count cap default. See [`OcrCfg::max_boxes`].
fn default_ocr_max_boxes() -> usize {
1000
}
/// v0.17.2 post-dogfood: matches the legacy hard-coded ceiling so
/// existing configs that omit the field keep behaving identically.
/// Overridable per config / `KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS`.
fn default_ocr_request_timeout_secs() -> u64 {
300
}
/// Caption settings (P6-3). Caption uses the same Ollama-vision /
/// `LanguageModel` pipeline as the rest of the workspace; the trait
/// abstraction is the part the spec demands. `enabled` defaults to
/// `false` because captioning costs one model call per asset and the
/// output is model-generated (low trust).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CaptionCfg {
/// Run captioning on every image during ingest. Default `false`.
pub enabled: bool,
/// Cap the long edge of the image (in pixels) before sending. The
/// spec recommends an aggressive 768×768 cap because larger
/// vision-LM inputs translate directly into prompt cost. Default
/// `768`.
pub max_pixels: u32,
/// Caption prompt template version pinned into wire output via
/// `ModelCaption.model_version`. Bump when the prompt changes so
/// downstream eval can detect regressions.
pub prompt_template_version: String,
}
impl CaptionCfg {
pub fn defaults() -> Self {
Self {
enabled: false,
max_pixels: 768,
prompt_template_version: "caption-v1".to_string(),
}
}
}
/// Settings for the PDF ingest pipeline (P7 + v0.20.0 sub-item 1).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PdfCfg {
#[serde(default = "PdfOcrCfg::defaults")]
pub ocr: PdfOcrCfg,
}
impl PdfCfg {
pub fn defaults() -> Self {
Self {
ocr: PdfOcrCfg::defaults(),
}
}
}
impl Default for PdfCfg {
fn default() -> Self {
Self::defaults()
}
}
/// v0.20.x ingest log surface: structured ndjson log written per ingest run.
/// `#[serde(default)]` on Config.logging ensures pre-v0.20 config files load cleanly.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct LoggingCfg {
/// Write structured ndjson log for each ingest run. Default `true`.
/// Set `false` to suppress log file creation entirely (AC-6).
#[serde(default = "default_ingest_log_enabled")]
pub ingest_log_enabled: bool,
/// Directory for per-run log files. Default `{state_dir}/logs`.
/// `{state_dir}` expands to the XDG state dir (e.g. `~/.local/state/kebab`).
#[serde(default = "default_ingest_log_dir")]
pub ingest_log_dir: PathBuf,
/// v0.20.x r2 Enhancement 4: keep the most recent N ingest log files.
/// Older files (beyond this count) are deleted at ingest start.
/// Default 100. AC-9: #[serde(default)] ensures backward compat.
#[serde(default = "default_keep_recent_runs")]
pub keep_recent_runs: u32,
/// v0.20.x r2 Enhancement 4: delete log files older than N days.
/// Also applied to `pdf_ocr_events` SQLite rows. Default 30.
#[serde(default = "default_retention_days")]
pub retention_days: u32,
}
fn default_ingest_log_enabled() -> bool {
true
}
fn default_ingest_log_dir() -> PathBuf {
PathBuf::from("{state_dir}/logs")
}
fn default_keep_recent_runs() -> u32 {
100
}
fn default_retention_days() -> u32 {
30
}
impl Default for LoggingCfg {
fn default() -> Self {
Self {
ingest_log_enabled: default_ingest_log_enabled(),
ingest_log_dir: default_ingest_log_dir(),
keep_recent_runs: default_keep_recent_runs(),
retention_days: default_retention_days(),
}
}
}
/// v0.20.0 sub-item 1: scanned PDF OCR via Ollama vision LLM. Default
/// disabled — opt-in because OCR adds ~45-100s per scanned page on CPU
/// (qwen2.5vl:3b, remote). Enable for book / paper scan KB.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PdfOcrCfg {
/// Run OCR on scanned PDF pages. Default `false` (opt-in).
pub enabled: bool,
/// `false` (default) — text-detect first + vision fallback on
/// scanned pages only. `true` — vision LLM 호출 on every page
/// (vector PDF 의 dual-text confidence boost — doubles chunk count).
pub always_on: bool,
/// Engine identifier: `"ollama-vision"` or `"paddle-onnx"`. When set to
/// `"paddle-onnx"`, model paths and tuning knobs are read from
/// `[image.ocr]`, not `[pdf.ocr]` — PaddleOCR has no PDF-specific tuning.
pub engine: String,
/// Vision model id. Default `"qwen2.5vl:3b"` per PoC (§3.5 family
/// asymmetry vs image OCR's gemma4:e4b is acknowledged).
pub model: String,
/// HTTP endpoint. `None` → fall back to `models.llm.endpoint`.
#[serde(default)]
pub endpoint: Option<String>,
/// BCP-47 language hints rendered into prompt.
pub languages: Vec<String>,
/// Long-edge cap (px). Larger images bloat prompt cost.
pub max_pixels: u32,
/// HTTP request timeout (sec). Same `0` = "fail immediately"
/// semantics as `image.ocr.request_timeout_secs` (NOT a disable
/// sentinel — see image.ocr docs).
#[serde(default = "default_pdf_ocr_request_timeout_secs")]
pub request_timeout_secs: u64,
/// Valid char ratio threshold (0.0..=1.0). Page with ratio below
/// this is classified as scanned/mojibake → OCR fallback. Default
/// `0.5`.
#[serde(default = "default_pdf_ocr_valid_ratio", serialize_with = "ser_f32_clean")]
pub valid_ratio_threshold: f32,
/// Minimum char count per page below which page is auto-scanned.
/// Default `20`.
#[serde(default = "default_pdf_ocr_min_char_count")]
pub min_char_count: u32,
/// Single-page lang hint. Default `Some("kor")`. `None` = no hint.
#[serde(default = "default_pdf_ocr_lang_hint")]
pub lang_hint: Option<String>,
// ── paddle-onnx engine overrides (v3) ───────────────────────────────
// Symmetric with `[ingest.image.ocr]`. v2 의 "pdf paddle 이 image 의
// 모델 경로를 빌려쓰던" 비대칭을 제거 — pdf 자체 키로 옮긴다. 마이그레이션
// (T5)이 image 값을 이 키로 복사해 signature 바이트 동일 유지. 전부
// `#[serde(default)]` 이라 pre-v3 config 도 로드.
/// Override path to the detection ONNX model. `None` → bundled.
#[serde(default)]
pub det_model: Option<String>,
/// Override path to the recognition ONNX model. `None` → bundled.
#[serde(default)]
pub rec_model: Option<String>,
/// Override path to the character dictionary. `None` → bundled.
#[serde(default)]
pub dict: Option<String>,
/// DBNet detection box score threshold (0.0..=1.0). Default `0.3`.
#[serde(default = "default_ocr_score_thresh", serialize_with = "ser_f32_clean")]
pub score_thresh: f32,
/// Polygon unclip ratio applied to each detected box. Default `1.5`.
#[serde(default = "default_ocr_unclip_ratio", serialize_with = "ser_f32_clean")]
pub unclip_ratio: f32,
/// Hard cap on detected boxes per page (runaway guard). Default `1000`.
#[serde(default = "default_ocr_max_boxes")]
pub max_boxes: usize,
}
impl PdfOcrCfg {
pub fn defaults() -> Self {
Self {
enabled: false,
always_on: false,
engine: "ollama-vision".to_string(),
model: "qwen2.5vl:3b".to_string(),
endpoint: None,
languages: vec!["eng".to_string(), "kor".to_string()],
max_pixels: 2048,
request_timeout_secs: default_pdf_ocr_request_timeout_secs(),
valid_ratio_threshold: default_pdf_ocr_valid_ratio(),
min_char_count: default_pdf_ocr_min_char_count(),
lang_hint: default_pdf_ocr_lang_hint(),
det_model: None,
rec_model: None,
dict: None,
score_thresh: default_ocr_score_thresh(),
unclip_ratio: default_ocr_unclip_ratio(),
max_boxes: default_ocr_max_boxes(),
}
}
}
/// PDF OCR per-page request timeout 의 기본값.
/// 6-32s 가 정상 throughput; 180s 초과는 Ollama 다운 / 매우 dense·고해상도 page 의 신호.
/// `config.toml` 의 `[pdf.ocr] request_timeout_secs = N` 로 override.
///
/// HOTFIXES 2026-05-27 (Bug #11): metro-korea.pdf dogfood 에서 page 8/13 모두
/// 기존 600s default 까지 완전 timeout (`chars: 0, skipped: true` × 20분 cost) →
/// 우선 60s 로 하향. parent spec §1000 / §1628 OQ-1 ("CPU 환경 105s 의 5x 여유") 가
/// 가정한 "page 당 평균 105s" 보다 실측 cloud GPU Ollama 가 6-32s 로 훨씬 빠름.
///
/// HOTFIXES 2026-05-28 (Bug #11 follow-up): 60s 는 dense Korean page (특히
/// metro-korea.pdf page 8/9/13) 의 OCR 을 강제 timeout 시켜 본문 indexed 손실.
/// **conservative starting point 180s 로 재조정** + dogfood evidence 기반 sweet spot
/// 점진적 축소 정책. user 가 `[pdf.ocr] request_timeout_secs = N` 으로 직접 tune.
fn default_pdf_ocr_request_timeout_secs() -> u64 {
180
}
fn default_pdf_ocr_valid_ratio() -> f32 {
0.5
}
fn default_pdf_ocr_min_char_count() -> u32 {
20
}
fn default_pdf_ocr_lang_hint() -> Option<String> {
Some("kor".to_string())
}
/// p9-fb-14: TUI-only configuration. Currently a single `theme`
/// selector (`"dark"` / `"light"`); future fields (custom role
/// overrides, mode-machine cursor shapes, …) extend the same
/// section so the CLI doesn't grow a per-feature `[ui.*]` table.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct UiCfg {
/// Palette name. Recognized: `"dark"` (default), `"light"`.
/// Unknown values fall back to `"dark"` at construction time
/// — config never errors on a typo, the TUI just keeps the
/// default theme so the user has a working shell.
pub theme: String,
}
impl UiCfg {
pub fn defaults() -> Self {
Self {
theme: "dark".to_string(),
}
}
}
/// v3: 모든 미디어 형식 ingest 설정의 우산. 스칼라(병렬도)는 ← 옛 `[indexing]`,
/// 미디어별 하위 테이블(chunking/code/image/pdf)은 ← 옛 top-level 섹션.
/// 직렬화 순서 = 필드 순서: 스칼라(병렬도) 먼저, 하위 테이블 뒤
/// (TOML 의 "bare key 는 sub-table header 앞" 규칙 준수).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestCfg {
#[serde(default = "default_max_parallel_extractors")]
pub max_parallel_extractors: u32,
#[serde(default = "default_max_parallel_embeddings")]
pub max_parallel_embeddings: u32,
#[serde(default)]
pub watch_filesystem: bool,
#[serde(default = "ChunkingCfg::defaults")]
pub chunking: ChunkingCfg,
#[serde(default)]
pub code: IngestCodeCfg,
#[serde(default = "ImageCfg::defaults")]
pub image: ImageCfg,
#[serde(default = "PdfCfg::defaults")]
pub pdf: PdfCfg,
}
impl Default for IngestCfg {
fn default() -> Self {
Self {
max_parallel_extractors: default_max_parallel_extractors(),
max_parallel_embeddings: default_max_parallel_embeddings(),
watch_filesystem: false,
chunking: ChunkingCfg::defaults(),
code: IngestCodeCfg::default(),
image: ImageCfg::defaults(),
pdf: PdfCfg::defaults(),
}
}
}
fn default_max_parallel_extractors() -> u32 {
2
}
fn default_max_parallel_embeddings() -> u32 {
1
}
/// p10-1A-1: settings for the code ingest pipeline. All fields have
/// reasonable defaults so the user need not set anything in `config.toml`
/// to get working code ingest.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(default)]
pub struct IngestCodeCfg {
/// Generated header sniff. Reads first ~512 bytes, checks 7 markers.
pub skip_generated_header: bool,
/// Max byte size per file. Bigger files skipped.
pub max_file_bytes: u64,
/// Max line count per file. Bigger files skipped (byte cap checked first).
pub max_file_lines: u32,
/// User extra skip globs (gitignore syntax). Applied on top of built-in
/// + `.gitignore` + `.kebabignore`.
pub extra_skip_globs: Vec<String>,
/// AST chunk size cap. Functions/classes longer than this fall back to
/// paragraph-based split (1A-2 and later).
pub ast_chunk_max_lines: u32,
/// Tier 3 fallback chunker: lines per chunk.
pub fallback_lines_per_chunk: u32,
/// Tier 3 fallback chunker: line overlap between adjacent chunks.
pub fallback_lines_overlap: u32,
}
impl Default for IngestCodeCfg {
fn default() -> Self {
Self {
skip_generated_header: true,
max_file_bytes: 262_144,
max_file_lines: 5_000,
extra_skip_globs: vec![],
ast_chunk_max_lines: 200,
fallback_lines_per_chunk: 80,
fallback_lines_overlap: 20,
}
}
}
impl Config {
/// Defaults per design §6.4.
pub fn defaults() -> Self {
Self {
schema_version: crate::migrate::CURRENT_SCHEMA_VERSION,
workspace: WorkspaceCfg {
root: "~/KnowledgeBase".to_string(),
exclude: vec![
".git/**".to_string(),
"node_modules/**".to_string(),
".obsidian/**".to_string(),
],
},
storage: StorageCfg {
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kebab".to_string(),
sqlite: "{data_dir}/kebab.sqlite".to_string(),
vector_dir: "{data_dir}/lancedb".to_string(),
asset_dir: "{data_dir}/assets".to_string(),
artifact_dir: "{data_dir}/artifacts".to_string(),
model_dir: "{data_dir}/models".to_string(),
runs_dir: "{data_dir}/runs".to_string(),
copy_threshold_mb: 100,
},
models: ModelsCfg {
embedding: EmbeddingModelCfg {
provider: "fastembed".to_string(),
model: "multilingual-e5-large".to_string(),
version: "v1".to_string(),
dimensions: 1024,
batch_size: 64,
num_threads: 0,
endpoint: None,
},
llm: LlmCfg {
provider: "ollama".to_string(),
// gemma4 계열 통일 — OCR (P6-2) + caption (P6-3)
// 어댑터가 같은 family 사용. 사용자가 더 큰
// variant (gemma4:26b 등) 원하면 자기 config.toml
// 에서 override. CPU-only / ≤16 GB RAM 환경이면
// gemma3:4b 같은 ≤4B Q4 모델 권장 (README 참조).
model: "gemma4:e4b".to_string(),
context_tokens: 32768,
endpoint: "http://127.0.0.1:11434".to_string(),
temperature: 0.0,
seed: 0,
request_timeout_secs: default_llm_request_timeout_secs(),
},
nli: NliCfg::defaults(),
},
ingest: IngestCfg {
max_parallel_extractors: 2,
max_parallel_embeddings: 1,
watch_filesystem: false,
chunking: ChunkingCfg::defaults(),
code: IngestCodeCfg::default(),
image: ImageCfg::defaults(),
pdf: PdfCfg::defaults(),
},
search: SearchCfg {
default_k: 10,
hybrid_fusion: "rrf".to_string(),
rrf_k: 60,
snippet_chars: 220,
cache_capacity: default_cache_capacity(),
stale_threshold_days: 30,
},
rag: RagCfg {
prompt_template_version: "rag-v3".to_string(),
score_gate: 0.30,
explain_default: false,
max_context_tokens: 8000,
multi_hop_max_depth: default_multi_hop_max_depth(),
multi_hop_max_sub_queries_per_iter: default_multi_hop_max_sub_queries_per_iter(),
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
nli_threshold: default_nli_threshold(),
},
ui: UiCfg::defaults(),
logging: LoggingCfg::default(),
// p9-fb-05: defaults are not loaded from disk, so no
// source_dir. Relative `workspace.root` (rare with
// defaults) falls back to caller `cwd` via the
// `unwrap_or_else(...)` in `expand_path_with_base`
// sites — see kebab-app's resolve_workspace_root.
source_dir: None,
}
}
/// p9-fb-05: read-only accessor for the source-file directory
/// (where `from_file` / `load` stamped it). Returns `None` for
/// `Config::defaults()` and other in-memory constructions.
pub fn source_dir(&self) -> Option<&Path> {
self.source_dir.as_deref()
}
/// p9-fb-05: builder for tests / programmatic callers that need
/// to pin `source_dir` without going through `from_file`. Returns
/// `self` so it chains: `Config::defaults().with_source_dir(p)`.
pub fn with_source_dir(mut self, dir: PathBuf) -> Self {
self.source_dir = Some(dir);
self
}
/// p9-fb-05: resolve `workspace.root` to an absolute `PathBuf`.
/// Order:
/// 1. tilde / env / `${VAR}` substitutions per [`expand_path`].
/// 2. if still relative, join onto `source_dir` (config file's
/// directory) when known, else `cwd`.
///
/// Tilde / absolute / `${VAR}`-rooted inputs ignore `source_dir`.
/// `Config::defaults()` (which has no `source_dir`) effectively
/// uses `cwd` for relative inputs — which is the surprising
/// case spec p9-fb-05 calls out as a foot-gun, but it can only
/// arise when the user is using defaults AND has a relative
/// root, which is rare (defaults ship `~/KnowledgeBase`).
pub fn resolve_workspace_root(&self) -> PathBuf {
let base = self.source_dir.clone().unwrap_or_else(|| {
std::env::current_dir().unwrap_or_else(|e| {
// chroot / deleted-cwd / permission failure: log so a
// user with an environment problem doesn't silently
// wonder why their workspace.root resolved to "./root"
// (which then fails at `create_dir_all` time with a
// less obvious error).
tracing::warn!(
target: "kebab-config",
error = %e,
"current_dir() failed; falling back to '.' for workspace.root resolution"
);
PathBuf::from(".")
})
});
paths::expand_path_with_base(&self.workspace.root, "", &base)
}
/// Read config from disk and merge env overrides on top of it. If the
/// file is missing, defaults are used (so `kb doctor` runs with no
/// prior `kb init`).
pub fn load(path: Option<&Path>) -> anyhow::Result<Self> {
let from_disk = match path {
Some(p) if p.exists() => Self::from_file(p)?,
Some(p) => {
return Err(anyhow::Error::new(ConfigNotFound {
path: p.to_path_buf(),
}));
}
None => {
let p = Self::xdg_config_path();
if p.exists() {
Self::from_file(&p)?
} else {
// macOS migration: if the new XDG path is absent but the
// old ~/Library/Application Support/kebab/config.toml exists,
// copy it to the new location so the user doesn't lose settings.
if let Some(legacy) = Self::macos_legacy_config_path() {
if legacy.exists() && !p.exists() {
if let Some(parent) = p.parent() {
let _ = std::fs::create_dir_all(parent);
}
if std::fs::copy(&legacy, &p).is_ok() {
eprintln!(
"kebab: migrated config {}{}",
legacy.display(),
p.display()
);
return Self::from_file(&p)
.map(|c| c.apply_env(&std::env::vars().collect()));
}
}
}
Self::defaults()
}
}
};
let env: HashMap<String, String> = std::env::vars().collect();
Ok(from_disk.apply_env(&env))
}
/// Parse a config from `path`. p9-fb-05: also stamps
/// `source_dir = path.parent()` so relative `workspace.root`
/// values resolve against the config file's directory rather
/// than the user's `cwd`.
pub fn from_file(path: &Path) -> anyhow::Result<Self> {
let text = std::fs::read_to_string(path).map_err(|e| {
anyhow::Error::new(ConfigInvalid {
path: path.to_path_buf(),
cause: format!("read_failed: {e}"),
})
})?;
// p9-fb-25: probe for the legacy `workspace.include` key — if
// present, emit a one-shot deprecation warning. Detection uses
// raw `toml::Value` lookup; the warning fires via a process-
// level OnceLock so a long-running TUI / CLI run doesn't spam
// the log on every Config::load.
if let Ok(value) = toml::from_str::<toml::Value>(&text) {
if value
.get("workspace")
.and_then(|v| v.get("include"))
.is_some()
{
static DEPRECATION_FIRED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
DEPRECATION_FIRED.get_or_init(|| {
tracing::warn!(
target: "kebab-config",
config = %path.display(),
"deprecated config: `workspace.include` 필드는 더 이상 사용되지 않습니다 (p9-fb-25, v0.2.1+). 처리 가능한 형식 (md / png / jpg / pdf) 은 extractor 가 자동 결정. config 에서 이 필드를 제거해도 안전 — 더 이상 enforce 안 됨."
);
});
}
}
// v3: 파일의 schema_version 이 CURRENT 보다 낮으면 메모리에서 변환한다
// (디스크 미변경 — 파일 갱신은 `kebab config migrate`). 미변환 v2 파일도
// 설정 유실 없이 로드(불변식 #3). non-additive relocation(v2→v3) 은
// serde default forward-compat 로는 커버 안 되므로 반드시 거쳐야 한다.
let parse_text = {
let from = toml::from_str::<toml::Value>(&text)
.ok()
.and_then(|v| v.get("schema_version").and_then(toml::Value::as_integer))
.unwrap_or(1) as u32;
if from < crate::migrate::CURRENT_SCHEMA_VERSION {
static MIGRATE_WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
MIGRATE_WARNED.get_or_init(|| {
tracing::warn!(
target: "kebab-config",
config = %path.display(),
from,
to = crate::migrate::CURRENT_SCHEMA_VERSION,
"config 가 옛 스키마입니다 — 이번 실행은 메모리에서 변환됨. 파일 갱신: `kebab config migrate`."
);
});
crate::migrate::migrate_document(&text).new_text
} else {
text.clone()
}
};
let mut cfg: Self = toml::from_str(&parse_text).map_err(|e| {
anyhow::Error::new(ConfigInvalid {
path: path.to_path_buf(),
cause: format!("parse_failed: {e}"),
})
})?;
cfg.source_dir = path.parent().map(Path::to_path_buf);
Ok(cfg)
}
/// Apply `KEBAB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
///
/// The mapping is an explicit grep-friendly whitelist — one match arm
/// per leaf key in `Config`. Booleans accept `1` / `true` / `yes`
/// (case-insensitive) for true and anything else for false. Numeric
/// keys silently keep their prior value if the env value fails to
/// parse, so a malformed `KEBAB_*` cannot crash startup.
pub fn apply_env(mut self, env: &HashMap<String, String>) -> Self {
for (k, v) in env {
if !k.starts_with("KEBAB_") {
continue;
}
match k.as_str() {
// workspace
"KEBAB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
// storage
"KEBAB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(),
"KEBAB_STORAGE_SQLITE" => self.storage.sqlite = v.clone(),
"KEBAB_STORAGE_VECTOR_DIR" => self.storage.vector_dir = v.clone(),
"KEBAB_STORAGE_ASSET_DIR" => self.storage.asset_dir = v.clone(),
"KEBAB_STORAGE_ARTIFACT_DIR" => self.storage.artifact_dir = v.clone(),
"KEBAB_STORAGE_MODEL_DIR" => self.storage.model_dir = v.clone(),
"KEBAB_STORAGE_RUNS_DIR" => self.storage.runs_dir = v.clone(),
"KEBAB_STORAGE_COPY_THRESHOLD_MB" => {
if let Ok(n) = v.parse::<u64>() {
self.storage.copy_threshold_mb = n;
}
}
// indexing
"KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.max_parallel_extractors = n;
}
}
"KEBAB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.max_parallel_embeddings = n;
}
}
"KEBAB_INDEXING_WATCH_FILESYSTEM" => {
self.ingest.watch_filesystem = parse_bool(v);
}
// chunking
"KEBAB_CHUNKING_TARGET_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.ingest.chunking.target_tokens = n;
}
}
"KEBAB_CHUNKING_OVERLAP_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.ingest.chunking.overlap_tokens = n;
}
}
"KEBAB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => {
self.ingest.chunking.respect_markdown_headings = parse_bool(v);
}
"KEBAB_CHUNKING_CHUNKER_VERSION" => self.ingest.chunking.chunker_version = v.clone(),
// models.embedding
"KEBAB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(),
"KEBAB_MODELS_EMBEDDING_MODEL" => self.models.embedding.model = v.clone(),
"KEBAB_MODELS_EMBEDDING_VERSION" => self.models.embedding.version = v.clone(),
"KEBAB_MODELS_EMBEDDING_DIMENSIONS" => {
if let Ok(n) = v.parse::<usize>() {
self.models.embedding.dimensions = n;
}
}
"KEBAB_MODELS_EMBEDDING_BATCH_SIZE" => {
if let Ok(n) = v.parse::<usize>() {
self.models.embedding.batch_size = n;
}
}
"KEBAB_MODELS_EMBEDDING_NUM_THREADS" => {
if let Ok(n) = v.parse::<u32>() {
self.models.embedding.num_threads = n;
}
}
"KEBAB_MODELS_EMBEDDING_ENDPOINT" => {
// Empty value → None (= fall back to models.llm.endpoint),
// mirroring the OCR endpoint override semantics.
self.models.embedding.endpoint =
if v.is_empty() { None } else { Some(v.clone()) };
}
// models.llm
"KEBAB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(),
"KEBAB_MODELS_LLM_MODEL" => self.models.llm.model = v.clone(),
"KEBAB_MODELS_LLM_CONTEXT_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.models.llm.context_tokens = n;
}
}
"KEBAB_MODELS_LLM_ENDPOINT" => self.models.llm.endpoint = v.clone(),
"KEBAB_MODELS_LLM_TEMPERATURE" => {
if let Ok(f) = v.parse::<f32>() {
self.models.llm.temperature = f;
}
}
"KEBAB_MODELS_LLM_SEED" => {
if let Ok(n) = v.parse::<u64>() {
self.models.llm.seed = n;
}
}
"KEBAB_MODELS_LLM_REQUEST_TIMEOUT_SECS" => {
if let Ok(n) = v.parse::<u64>() {
self.models.llm.request_timeout_secs = n;
}
}
// models.nli (p9-fb-41 PR-9c-1)
"KEBAB_MODELS_NLI_MODEL" => self.models.nli.model = v.clone(),
"KEBAB_MODELS_NLI_PROVIDER" => self.models.nli.provider = v.clone(),
// search
"KEBAB_SEARCH_DEFAULT_K" => {
if let Ok(n) = v.parse::<usize>() {
self.search.default_k = n;
}
}
"KEBAB_SEARCH_HYBRID_FUSION" => self.search.hybrid_fusion = v.clone(),
"KEBAB_SEARCH_RRF_K" => {
if let Ok(n) = v.parse::<u32>() {
self.search.rrf_k = n;
}
}
"KEBAB_SEARCH_SNIPPET_CHARS" => {
if let Ok(n) = v.parse::<usize>() {
self.search.snippet_chars = n;
}
}
"KEBAB_SEARCH_STALE_THRESHOLD_DAYS" => {
if let Ok(n) = v.parse::<u32>() {
self.search.stale_threshold_days = n;
}
}
// rag
"KEBAB_RAG_PROMPT_TEMPLATE_VERSION" => {
self.rag.prompt_template_version = v.clone();
}
"KEBAB_RAG_SCORE_GATE" => {
if let Ok(f) = v.parse::<f32>() {
self.rag.score_gate = f;
}
}
"KEBAB_RAG_EXPLAIN_DEFAULT" => {
self.rag.explain_default = parse_bool(v);
}
"KEBAB_RAG_MAX_CONTEXT_TOKENS" => {
if let Ok(n) = v.parse::<usize>() {
self.rag.max_context_tokens = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_DEPTH" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_depth = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_SUB_QUERIES_PER_ITER" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_sub_queries_per_iter = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_POOL_CHUNKS" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_pool_chunks = n;
}
}
// p9-fb-41 PR-9c-1: NLI gate threshold. Parse failure
// emits a `tracing::warn!` (not silent like the other
// numeric env overrides) because this knob gates the
// NLI verification entirely — a malformed env value
// would silently disable a security-flavored gate the
// user thought they enabled, which is the failure mode
// most worth surfacing. The default (`0.0`) survives
// on parse failure so behaviour stays well-defined.
"KEBAB_RAG_NLI_THRESHOLD" => match v.parse::<f32>() {
Ok(f) => self.rag.nli_threshold = f,
Err(e) => tracing::warn!(
target: "kebab-config",
env_key = "KEBAB_RAG_NLI_THRESHOLD",
env_value = %v,
error = %e,
"invalid KEBAB_RAG_NLI_THRESHOLD; keeping prior value (0.0 = NLI gate disabled)"
),
},
// image.ocr
"KEBAB_IMAGE_OCR_ENABLED" => {
self.ingest.image.ocr.enabled = parse_bool(v);
}
"KEBAB_IMAGE_OCR_ENGINE" => self.ingest.image.ocr.engine = v.clone(),
"KEBAB_IMAGE_OCR_MODEL" => self.ingest.image.ocr.model = v.clone(),
"KEBAB_IMAGE_OCR_ENDPOINT" => {
// Empty env value is treated the same as "fall back
// to models.llm.endpoint" — i.e. set None.
self.ingest.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_LANGUAGES" => {
// Comma-separated list, e.g. "eng,kor".
self.ingest.image.ocr.languages = v
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect();
}
"KEBAB_IMAGE_OCR_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.image.ocr.max_pixels = n;
}
}
"KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS" => {
if let Ok(n) = v.parse::<u64>() {
self.ingest.image.ocr.request_timeout_secs = n;
}
}
// paddle-onnx engine overrides (v0.27.0). Empty string → None
// (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
"KEBAB_IMAGE_OCR_DET_MODEL" => {
self.ingest.image.ocr.det_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_REC_MODEL" => {
self.ingest.image.ocr.rec_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_DICT" => {
self.ingest.image.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_IMAGE_OCR_SCORE_THRESH" => {
if let Ok(f) = v.parse::<f32>() {
self.ingest.image.ocr.score_thresh = f;
}
}
"KEBAB_IMAGE_OCR_UNCLIP_RATIO" => {
if let Ok(f) = v.parse::<f32>() {
self.ingest.image.ocr.unclip_ratio = f;
}
}
"KEBAB_IMAGE_OCR_MAX_BOXES" => {
if let Ok(n) = v.parse::<usize>() {
self.ingest.image.ocr.max_boxes = n;
}
}
// image.caption (P6-3)
"KEBAB_IMAGE_CAPTION_ENABLED" => {
self.ingest.image.caption.enabled = parse_bool(v);
}
"KEBAB_IMAGE_CAPTION_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.image.caption.max_pixels = n;
}
}
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION" => {
self.ingest.image.caption.prompt_template_version = v.clone();
}
// pdf.ocr (v0.20.0 sub-item 1)
"KEBAB_PDF_OCR_ENABLED" => self.ingest.pdf.ocr.enabled = parse_bool(v),
"KEBAB_PDF_OCR_ALWAYS_ON" => self.ingest.pdf.ocr.always_on = parse_bool(v),
"KEBAB_PDF_OCR_ENGINE" => self.ingest.pdf.ocr.engine = v.clone(),
"KEBAB_PDF_OCR_MODEL" => self.ingest.pdf.ocr.model = v.clone(),
"KEBAB_PDF_OCR_ENDPOINT" => {
self.ingest.pdf.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_PDF_OCR_LANGUAGES" => {
self.ingest.pdf.ocr.languages = v
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect();
}
"KEBAB_PDF_OCR_MAX_PIXELS" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.pdf.ocr.max_pixels = n;
}
}
"KEBAB_PDF_OCR_REQUEST_TIMEOUT_SECS" => {
if let Ok(n) = v.parse::<u64>() {
self.ingest.pdf.ocr.request_timeout_secs = n;
}
}
"KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD" => {
if let Ok(n) = v.parse::<f32>() {
self.ingest.pdf.ocr.valid_ratio_threshold = n.clamp(0.0, 1.0);
}
}
"KEBAB_PDF_OCR_MIN_CHAR_COUNT" => {
if let Ok(n) = v.parse::<u32>() {
self.ingest.pdf.ocr.min_char_count = n;
}
}
"KEBAB_PDF_OCR_LANG_HINT" => {
self.ingest.pdf.ocr.lang_hint = if v.is_empty() { None } else { Some(v.clone()) };
}
// pdf paddle-onnx engine overrides (v3). image.ocr paddle 패턴 복제.
// Empty string → None (fall back to bundled / KEBAB_IMAGE_OCR_MODEL_DIR).
"KEBAB_PDF_OCR_DET_MODEL" => {
self.ingest.pdf.ocr.det_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_PDF_OCR_REC_MODEL" => {
self.ingest.pdf.ocr.rec_model =
if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_PDF_OCR_DICT" => {
self.ingest.pdf.ocr.dict = if v.is_empty() { None } else { Some(v.clone()) };
}
"KEBAB_PDF_OCR_SCORE_THRESH" => {
if let Ok(f) = v.parse::<f32>() {
self.ingest.pdf.ocr.score_thresh = f;
}
}
"KEBAB_PDF_OCR_UNCLIP_RATIO" => {
if let Ok(f) = v.parse::<f32>() {
self.ingest.pdf.ocr.unclip_ratio = f;
}
}
"KEBAB_PDF_OCR_MAX_BOXES" => {
if let Ok(n) = v.parse::<usize>() {
self.ingest.pdf.ocr.max_boxes = n;
}
}
// Unknown KEBAB_* keys are silently ignored — see
// `env_unknown_key_is_ignored` test.
_ => {}
}
}
self
}
/// `~/.config/kebab/config.toml` (honors `XDG_CONFIG_HOME`).
pub fn xdg_config_path() -> PathBuf {
if let Ok(custom) = std::env::var("XDG_CONFIG_HOME") {
if !custom.is_empty() {
return PathBuf::from(custom).join("kebab").join("config.toml");
}
}
// Always use XDG-standard ~/.config regardless of platform.
// macOS dirs::config_dir() returns ~/Library/Application Support which
// collides with data_dir() — DataOnly reset would delete config too.
match dirs::home_dir() {
Some(h) => h.join(".config").join("kebab").join("config.toml"),
None => PathBuf::from("./kebab/config.toml"),
}
}
/// `~/.local/share/kebab` (honors `XDG_DATA_HOME`).
pub fn xdg_data_dir() -> PathBuf {
if let Ok(custom) = std::env::var("XDG_DATA_HOME") {
if !custom.is_empty() {
return PathBuf::from(custom).join("kebab");
}
}
// Always use XDG-standard ~/.local/share regardless of platform.
match dirs::home_dir() {
Some(h) => h.join(".local").join("share").join("kebab"),
None => PathBuf::from("./kebab-data"),
}
}
/// `~/.cache/kebab` (honors `XDG_CACHE_HOME`).
pub fn xdg_cache_dir() -> PathBuf {
if let Ok(custom) = std::env::var("XDG_CACHE_HOME") {
if !custom.is_empty() {
return PathBuf::from(custom).join("kebab");
}
}
// Always use XDG-standard ~/.cache regardless of platform.
match dirs::home_dir() {
Some(h) => h.join(".cache").join("kebab"),
None => PathBuf::from("./kebab-cache"),
}
}
/// `~/.local/state/kebab` (honors `XDG_STATE_HOME`).
pub fn xdg_state_dir() -> PathBuf {
if let Ok(custom) = std::env::var("XDG_STATE_HOME") {
if !custom.is_empty() {
return PathBuf::from(custom).join("kebab");
}
}
// `dirs` doesn't expose state_dir on all platforms; fall back to
// `$HOME/.local/state/kebab` if XDG_STATE_HOME is unset.
if let Some(home) = dirs::home_dir() {
return home.join(".local").join("state").join("kebab");
}
PathBuf::from("./kebab-state")
}
/// macOS legacy config path: `~/Library/Application Support/kebab/config.toml`.
/// Returns `None` on non-macOS or when home dir is unavailable.
/// Used for one-time migration to the XDG-standard location.
fn macos_legacy_config_path() -> Option<PathBuf> {
#[cfg(target_os = "macos")]
{
dirs::home_dir().map(|h| {
h.join("Library")
.join("Application Support")
.join("kebab")
.join("config.toml")
})
}
#[cfg(not(target_os = "macos"))]
{
None
}
}
}
/// Parse a permissive boolean — `1` / `true` / `yes` (case-insensitive)
/// for true, anything else for false. Used by `apply_env` for boolean
/// leaves of `Config`.
fn parse_bool(s: &str) -> bool {
matches!(s.to_ascii_lowercase().as_str(), "1" | "true" | "yes")
}
#[cfg(test)]
mod tests {
use super::*;
/// Legacy TOML fixture written before the `request_timeout_secs`
/// knobs (LLM in v0.17.1, OCR follow-up) existed. Shared by
/// `legacy_config_without_request_timeout_secs_uses_default`
/// (LLM-side) and `legacy_config_without_ocr_request_timeout_secs_uses_default`
/// (OCR-side) so both invariants pin against the same on-disk
/// shape — schema drift in the legacy form only needs one edit.
const LEGACY_PRE_TIMEOUT_TOML: &str = r#"
schema_version = 1
[workspace]
root = "/tmp/x"
exclude = []
[storage]
data_dir = "/tmp/x"
sqlite = "/tmp/x/kebab.sqlite"
vector_dir = "/tmp/x/lancedb"
asset_dir = "/tmp/x/assets"
artifact_dir = "/tmp/x/artifacts"
model_dir = "/tmp/x/models"
runs_dir = "/tmp/x/runs"
copy_threshold_mb = 100
[indexing]
max_parallel_extractors = 2
max_parallel_embeddings = 1
watch_filesystem = false
[chunking]
target_tokens = 500
overlap_tokens = 80
respect_markdown_headings = true
chunker_version = "md-heading-v1"
[models.embedding]
provider = "fastembed"
model = "multilingual-e5-large"
version = "v1"
dimensions = 1024
batch_size = 64
[models.llm]
provider = "ollama"
model = "gemma3:4b"
context_tokens = 4096
endpoint = "http://127.0.0.1:11434"
temperature = 0.0
seed = 0
[search]
default_k = 10
hybrid_fusion = "rrf"
rrf_k = 60
snippet_chars = 220
[rag]
prompt_template_version = "rag-v3"
score_gate = 0.3
explain_default = false
max_context_tokens = 8000
[image.ocr]
enabled = false
engine = "ollama-vision"
model = "gemma3:4b"
languages = ["eng"]
max_pixels = 1600
[image.caption]
enabled = false
max_pixels = 768
prompt_template_version = "caption-v1"
[ui]
theme = "dark"
"#;
#[test]
fn defaults_are_serde_roundtrip_stable() {
let c = Config::defaults();
let toml_text = toml::to_string(&c).unwrap();
let back: Config = toml::from_str(&toml_text).unwrap();
assert_eq!(c, back);
}
/// 불변식 #3: `from_file` 이 v2 파일을 디스크 미변경으로 메모리에서 v3
/// 변환 — 미변환 v2 파일도 설정 유실 0.
#[test]
fn from_file_auto_migrates_v2_in_memory() {
let dir = tempfile::tempdir().unwrap();
let p = dir.path().join("config.toml");
std::fs::write(
&p,
"\
schema_version = 2
[workspace]
root = \"/my/notes\"
exclude = []
[chunking]
target_tokens = 777
[image.ocr]
enabled = true
engine = \"ollama-vision\"
model = \"gemma4:e4b\"
languages = [\"kor\"]
max_pixels = 1600
",
)
.unwrap();
let c = Config::from_file(&p).expect("v2 auto-migrate load");
// 사용자 v2 값이 새 경로로 살아있어야(기본값 유실 X).
assert_eq!(c.ingest.chunking.target_tokens, 777);
assert!(c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.image.ocr.languages, vec!["kor"]);
// 디스크 파일은 안 바뀜(여전히 schema_version = 2 + [chunking]).
let on_disk = std::fs::read_to_string(&p).unwrap();
assert!(
on_disk.contains("schema_version = 2"),
"파일이 변경됨:\n{on_disk}"
);
assert!(on_disk.contains("[chunking]"), "파일이 변경됨:\n{on_disk}");
}
#[test]
fn v3_layout_nests_media_under_ingest() {
let c = Config::defaults();
// 새 경로가 컴파일·접근 가능해야 한다.
assert_eq!(c.ingest.max_parallel_extractors, 2);
assert_eq!(c.ingest.chunking.target_tokens, 500);
assert_eq!(c.ingest.code.max_file_bytes, 262_144);
assert_eq!(c.ingest.image.ocr.engine, "ollama-vision");
assert_eq!(c.ingest.image.caption.max_pixels, 768);
assert_eq!(c.ingest.pdf.ocr.model, "qwen2.5vl:3b");
// pdf paddle 대칭 키 존재 + 기본값.
assert_eq!(c.ingest.pdf.ocr.score_thresh, 0.3);
assert_eq!(c.ingest.pdf.ocr.max_boxes, 1000);
assert!(c.ingest.pdf.ocr.det_model.is_none());
}
#[test]
fn defaults_match_design_64_score_gate() {
let c = Config::defaults();
assert_eq!(c.rag.score_gate, 0.30);
assert_eq!(c.ingest.chunking.target_tokens, 500);
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
assert_eq!(c.models.embedding.dimensions, 1024);
assert_eq!(c.search.rrf_k, 60);
}
#[test]
fn defaults_rag_prompt_template_version_is_rag_v3() {
let c = Config::defaults();
assert_eq!(c.rag.prompt_template_version, "rag-v3");
}
#[test]
fn env_override_score_gate() {
let mut env = HashMap::new();
env.insert("KEBAB_RAG_SCORE_GATE".to_string(), "0.5".to_string());
let c = Config::defaults().apply_env(&env);
assert!((c.rag.score_gate - 0.5).abs() < 1e-6);
}
#[test]
fn env_override_search_k() {
let mut env = HashMap::new();
env.insert("KEBAB_SEARCH_DEFAULT_K".to_string(), "25".to_string());
let c = Config::defaults().apply_env(&env);
assert_eq!(c.search.default_k, 25);
}
/// 불변식 #2: env override 이름(LHS) 100% 보존 — struct 경로가 바뀌어도
/// 기존 `KEBAB_*` 스크립트가 새 경로로 대입되어 무파손.
#[test]
fn env_names_preserved_target_new_paths() {
let mut env = HashMap::new();
env.insert("KEBAB_CHUNKING_TARGET_TOKENS".into(), "640".into());
env.insert("KEBAB_INDEXING_MAX_PARALLEL_EXTRACTORS".into(), "6".into());
env.insert("KEBAB_IMAGE_OCR_ENABLED".into(), "true".into());
env.insert("KEBAB_PDF_OCR_ENGINE".into(), "paddle-onnx".into());
let c = Config::defaults().apply_env(&env);
assert_eq!(c.ingest.chunking.target_tokens, 640);
assert_eq!(c.ingest.max_parallel_extractors, 6);
assert!(c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.pdf.ocr.engine, "paddle-onnx");
}
#[test]
fn env_pdf_paddle_symmetric_overrides() {
let mut env = HashMap::new();
env.insert("KEBAB_PDF_OCR_DET_MODEL".into(), "/d.onnx".into());
env.insert("KEBAB_PDF_OCR_SCORE_THRESH".into(), "0.4".into());
env.insert("KEBAB_PDF_OCR_MAX_BOXES".into(), "500".into());
let c = Config::defaults().apply_env(&env);
assert_eq!(c.ingest.pdf.ocr.det_model.as_deref(), Some("/d.onnx"));
assert!((c.ingest.pdf.ocr.score_thresh - 0.4).abs() < 1e-6);
assert_eq!(c.ingest.pdf.ocr.max_boxes, 500);
}
#[test]
fn env_unknown_key_is_ignored() {
let baseline = Config::defaults();
let mut env = HashMap::new();
env.insert("KEBAB_NOPE_FOO".to_string(), "garbage".to_string());
let c = Config::defaults().apply_env(&env);
assert_eq!(c, baseline);
}
#[test]
fn env_overrides_chunking_target_tokens() {
let mut env = HashMap::new();
env.insert(
"KEBAB_CHUNKING_TARGET_TOKENS".to_string(),
"777".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.ingest.chunking.target_tokens, 777);
}
#[test]
fn env_overrides_models_llm_endpoint_and_temperature() {
let mut env = HashMap::new();
env.insert(
"KEBAB_MODELS_LLM_ENDPOINT".to_string(),
"http://10.0.0.1:11434".to_string(),
);
env.insert(
"KEBAB_MODELS_LLM_TEMPERATURE".to_string(),
"0.7".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.models.llm.endpoint, "http://10.0.0.1:11434");
assert!((c.models.llm.temperature - 0.7).abs() < 1e-6);
}
/// v0.17.0 post-dogfood: matches the legacy hard-coded 300s cap so
/// existing configs that omit the new field are not affected.
#[test]
fn default_llm_request_timeout_secs_is_300() {
assert_eq!(Config::defaults().models.llm.request_timeout_secs, 300);
}
#[test]
fn env_overrides_models_llm_request_timeout_secs() {
let mut env = HashMap::new();
env.insert(
"KEBAB_MODELS_LLM_REQUEST_TIMEOUT_SECS".to_string(),
"1200".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.models.llm.request_timeout_secs, 1200);
}
/// v0.17.0 post-dogfood: a config file written before the field
/// existed (no `request_timeout_secs` key) must still parse and fall
/// back to the 300s default — backwards-compat invariant. Fixture
/// shared with the OCR-side invariant via [`LEGACY_PRE_TIMEOUT_TOML`].
#[test]
fn legacy_config_without_request_timeout_secs_uses_default() {
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
assert_eq!(c.models.llm.request_timeout_secs, 300);
}
#[test]
fn env_overrides_indexing_watch_filesystem_bool() {
let mut env = HashMap::new();
env.insert(
"KEBAB_INDEXING_WATCH_FILESYSTEM".to_string(),
"true".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert!(c.ingest.watch_filesystem);
}
#[test]
fn image_ocr_defaults_disabled_with_ollama_vision() {
let c = Config::defaults();
assert!(!c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.image.ocr.engine, "ollama-vision");
assert_eq!(c.ingest.image.ocr.model, "gemma4:e4b");
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor"]);
assert_eq!(c.ingest.image.ocr.max_pixels, 1600);
}
/// v0.17.2 post-dogfood: matches the legacy hard-coded 300s cap so
/// existing configs that omit the new field keep behaving identically.
#[test]
fn default_ocr_request_timeout_secs_is_300() {
assert_eq!(Config::defaults().ingest.image.ocr.request_timeout_secs, 300);
}
#[test]
fn env_overrides_image_ocr_request_timeout_secs() {
let mut env = HashMap::new();
env.insert(
"KEBAB_IMAGE_OCR_REQUEST_TIMEOUT_SECS".to_string(),
"900".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 900);
}
/// post-v0.17.1 dogfood: a config file written before the OCR
/// timeout field existed must still parse and fall back to the
/// 300s default — backwards-compat invariant. Fixture shared
/// with the LLM-side invariant via [`LEGACY_PRE_TIMEOUT_TOML`].
#[test]
fn legacy_config_without_ocr_request_timeout_secs_uses_default() {
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
assert_eq!(c.ingest.image.ocr.request_timeout_secs, 300);
}
// ── p9-fb-41: multi-hop RAG knobs ────────────────────────────────────
#[test]
fn default_multi_hop_max_depth_is_3() {
assert_eq!(Config::defaults().rag.multi_hop_max_depth, 3);
}
#[test]
fn default_multi_hop_max_sub_queries_per_iter_is_5() {
assert_eq!(Config::defaults().rag.multi_hop_max_sub_queries_per_iter, 5);
}
#[test]
fn default_multi_hop_max_pool_chunks_is_15() {
// v0.18 dogfood (HOTFIXES 2026-05-25 fb-41 post-PR-7) tuned
// this down from 30 → 15 to keep the synthesize prompt tight
// enough for gemma3:4b to follow the citation rule.
assert_eq!(Config::defaults().rag.multi_hop_max_pool_chunks, 15);
}
#[test]
fn env_overrides_multi_hop_knobs() {
let mut env = HashMap::new();
env.insert("KEBAB_RAG_MULTI_HOP_MAX_DEPTH".to_string(), "5".to_string());
env.insert(
"KEBAB_RAG_MULTI_HOP_MAX_SUB_QUERIES_PER_ITER".to_string(),
"7".to_string(),
);
env.insert(
"KEBAB_RAG_MULTI_HOP_MAX_POOL_CHUNKS".to_string(),
"50".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.rag.multi_hop_max_depth, 5);
assert_eq!(c.rag.multi_hop_max_sub_queries_per_iter, 7);
assert_eq!(c.rag.multi_hop_max_pool_chunks, 50);
}
/// post-PR-3 fb-41: a config file written before the multi-hop
/// knobs existed must still parse and fall back to the documented
/// defaults — backwards-compat invariant. Fixture shared with the
/// LLM / OCR timeout invariants via [`LEGACY_PRE_TIMEOUT_TOML`]
/// (that fixture also predates the multi_hop_* fields).
#[test]
fn legacy_config_without_multi_hop_knobs_uses_defaults() {
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
assert_eq!(c.rag.multi_hop_max_depth, 3);
assert_eq!(c.rag.multi_hop_max_sub_queries_per_iter, 5);
// v0.18 dogfood (post-PR-7): pool default 30 → 15.
assert_eq!(c.rag.multi_hop_max_pool_chunks, 15);
}
// ── p9-fb-41 PR-9c-1: NLI verification knobs ─────────────────────────
#[test]
fn default_nli_threshold_is_zero() {
// Spec §2.6: NLI gate disabled by default — verification is
// opt-in. `0.0` keeps multi-hop behavior identical to PR-3b.
assert_eq!(Config::defaults().rag.nli_threshold, 0.0);
}
#[test]
fn default_nli_model_is_xenova_mdeberta() {
// Pin the default model id so a refactor that touches NliCfg
// can't silently flip to a different verifier model.
assert_eq!(
Config::defaults().models.nli.model,
"Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
);
assert_eq!(Config::defaults().models.nli.provider, "onnx");
}
/// A config file written before the `[models.nli]` / `nli_threshold`
/// keys existed must still parse and fall back to the documented
/// defaults. Fixture shared via [`LEGACY_PRE_TIMEOUT_TOML`] (predates
/// all PR-9c-1 fields).
#[test]
fn legacy_config_without_nli_uses_defaults() {
let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config");
assert_eq!(c.rag.nli_threshold, 0.0);
assert_eq!(
c.models.nli.model,
"Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
);
assert_eq!(c.models.nli.provider, "onnx");
}
#[test]
fn env_override_nli_threshold() {
let mut env = HashMap::new();
env.insert("KEBAB_RAG_NLI_THRESHOLD".to_string(), "0.5".to_string());
let c = Config::defaults().apply_env(&env);
assert!((c.rag.nli_threshold - 0.5).abs() < 1e-6);
}
#[test]
fn env_override_nli_model_and_provider() {
let mut env = HashMap::new();
env.insert(
"KEBAB_MODELS_NLI_MODEL".to_string(),
"user/custom-nli-model".to_string(),
);
env.insert(
"KEBAB_MODELS_NLI_PROVIDER".to_string(),
"candle".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(c.models.nli.model, "user/custom-nli-model");
assert_eq!(c.models.nli.provider, "candle");
}
/// Malformed `KEBAB_RAG_NLI_THRESHOLD` keeps the prior value (does
/// NOT silently disable nor crash). The `tracing::warn!` surface
/// is observable only when the user has tracing wired; the
/// behavior contract is "default survives".
#[test]
fn env_malformed_nli_threshold_keeps_prior_value() {
let mut env = HashMap::new();
env.insert(
"KEBAB_RAG_NLI_THRESHOLD".to_string(),
"not-a-float".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert_eq!(
c.rag.nli_threshold, 0.0,
"malformed env value must keep the default unchanged"
);
}
#[test]
fn image_ocr_env_overrides() {
let mut env = HashMap::new();
env.insert("KEBAB_IMAGE_OCR_ENABLED".to_string(), "true".to_string());
env.insert(
"KEBAB_IMAGE_OCR_MODEL".to_string(),
"gemma4:31b".to_string(),
);
env.insert(
"KEBAB_IMAGE_OCR_ENDPOINT".to_string(),
"http://192.168.0.47:11434".to_string(),
);
// Empty env value should map to None (= fall back to llm.endpoint).
// We exercise that branch in a separate test.
env.insert(
"KEBAB_IMAGE_OCR_LANGUAGES".to_string(),
"eng, kor, jpn".to_string(),
);
env.insert("KEBAB_IMAGE_OCR_MAX_PIXELS".to_string(), "2048".to_string());
let c = Config::defaults().apply_env(&env);
assert!(c.ingest.image.ocr.enabled);
assert_eq!(c.ingest.image.ocr.model, "gemma4:31b");
assert_eq!(
c.ingest.image.ocr.endpoint.as_deref(),
Some("http://192.168.0.47:11434")
);
assert_eq!(c.ingest.image.ocr.languages, vec!["eng", "kor", "jpn"]);
assert_eq!(c.ingest.image.ocr.max_pixels, 2048);
}
/// Pre-P6 config files don't have an `[image]` section. The
/// `#[serde(default)]` attribute on `Config::image` must let those
/// files load with `ImageCfg::defaults()` instead of erroring.
#[test]
fn image_caption_defaults_disabled() {
let c = Config::defaults();
assert!(!c.ingest.image.caption.enabled);
assert_eq!(c.ingest.image.caption.max_pixels, 768);
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v1");
}
#[test]
fn image_caption_env_overrides() {
let mut env = HashMap::new();
env.insert(
"KEBAB_IMAGE_CAPTION_ENABLED".to_string(),
"true".to_string(),
);
env.insert(
"KEBAB_IMAGE_CAPTION_MAX_PIXELS".to_string(),
"1024".to_string(),
);
env.insert(
"KEBAB_IMAGE_CAPTION_PROMPT_TEMPLATE_VERSION".to_string(),
"caption-v2".to_string(),
);
let c = Config::defaults().apply_env(&env);
assert!(c.ingest.image.caption.enabled);
assert_eq!(c.ingest.image.caption.max_pixels, 1024);
assert_eq!(c.ingest.image.caption.prompt_template_version, "caption-v2");
}
/// `KEBAB_IMAGE_OCR_ENDPOINT=""` (empty value) should map to `None`
/// rather than to `Some("")` so the fallback to `models.llm.endpoint`
/// kicks in. Covers the env-equivalent of a missing TOML key.
#[test]
fn image_ocr_endpoint_empty_env_value_is_none() {
let mut env = HashMap::new();
env.insert("KEBAB_IMAGE_OCR_ENDPOINT".to_string(), String::new());
let c = Config::defaults().apply_env(&env);
assert_eq!(c.ingest.image.ocr.endpoint, None);
}
#[test]
fn pre_p6_config_without_image_section_loads_with_defaults() {
let toml_text = r#"
schema_version = 1
[workspace]
root = "/tmp/x"
include = ["**/*.md"]
exclude = []
[storage]
data_dir = "/tmp/d"
sqlite = "{data_dir}/x.sqlite"
vector_dir = "{data_dir}/v"
asset_dir = "{data_dir}/a"
artifact_dir = "{data_dir}/r"
model_dir = "{data_dir}/m"
runs_dir = "{data_dir}/u"
copy_threshold_mb = 100
[indexing]
max_parallel_extractors = 2
max_parallel_embeddings = 1
watch_filesystem = false
[chunking]
target_tokens = 500
overlap_tokens = 80
respect_markdown_headings = true
chunker_version = "md-heading-v1"
[models.embedding]
provider = "fastembed"
model = "multilingual-e5-large"
version = "v1"
dimensions = 1024
batch_size = 64
[models.llm]
provider = "ollama"
model = "gemma4:e4b"
context_tokens = 32768
endpoint = "http://127.0.0.1:11434"
temperature = 0.0
seed = 0
[search]
default_k = 10
hybrid_fusion = "rrf"
rrf_k = 60
snippet_chars = 220
stale_threshold_days = 30
[rag]
prompt_template_version = "rag-v2"
score_gate = 0.30
explain_default = false
max_context_tokens = 8000
"#;
let c: Config = toml::from_str(toml_text).expect("pre-P6 TOML must still parse");
assert_eq!(c.ingest.image, ImageCfg::defaults());
}
/// p9-fb-25: legacy config with `workspace.include = [...]` must
/// still deserialize cleanly (silent unknown-field acceptance).
#[test]
fn legacy_include_field_is_ignored_silently() {
let mut cfg = Config::defaults();
cfg.workspace.root = "/tmp/kebab-legacy".to_string();
let mut toml_text = toml::to_string(&cfg).expect("default round-trips");
// Inject a legacy `include = [...]` line into the [workspace] block.
toml_text = toml_text.replace(
"[workspace]",
"[workspace]\ninclude = [\"**/*.md\", \"**/*.txt\"]",
);
let parsed: Result<Config, _> = toml::from_str(&toml_text);
assert!(
parsed.is_ok(),
"legacy include must not break load: {:?}",
parsed.err()
);
let cfg = parsed.unwrap();
assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy");
}
/// p9-fb-25: `WorkspaceCfg` must NOT have an `include` field.
/// Compile-time proof: exhaustive destructure.
#[test]
fn workspace_cfg_has_only_root_and_exclude_fields() {
let ws = Config::defaults().workspace;
let WorkspaceCfg {
root: _,
exclude: _,
} = &ws;
}
#[test]
fn default_stale_threshold_is_30() {
let c = Config::defaults();
assert_eq!(c.search.stale_threshold_days, 30);
}
#[test]
fn env_override_stale_threshold() {
let c = Config::defaults();
let env: HashMap<String, String> = [(
"KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(),
"7".to_string(),
)]
.into_iter()
.collect();
let c = c.apply_env(&env);
assert_eq!(c.search.stale_threshold_days, 7);
}
#[test]
fn env_negative_threshold_silently_ignored() {
// Env path: malformed numeric values (including negatives that
// can't fit `u32`) are silently ignored — same pattern as
// `KEBAB_SEARCH_DEFAULT_K`. The TOML file-load path (covered in
// `fb27_tests::file_negative_stale_threshold_returns_config_invalid`)
// is the spec-required hard error surface.
let c = Config::defaults();
let env: HashMap<String, String> = [(
"KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(),
"-5".to_string(),
)]
.into_iter()
.collect();
let c = c.apply_env(&env);
assert_eq!(
c.search.stale_threshold_days, 30,
"env path: malformed value must leave the default unchanged"
);
}
#[test]
fn xdg_paths_honor_env() {
// Must restore env after the test to avoid polluting other tests.
let prev = std::env::var("XDG_CONFIG_HOME").ok();
// SAFETY: tests in this module run sequentially; we restore below.
unsafe {
std::env::set_var("XDG_CONFIG_HOME", "/tmp/kebabtest-xdg-config");
}
let p = Config::xdg_config_path();
assert_eq!(
p,
PathBuf::from("/tmp/kebabtest-xdg-config/kebab/config.toml")
);
// SAFETY: scope-local restore.
unsafe {
match prev {
Some(v) => std::env::set_var("XDG_CONFIG_HOME", v),
None => std::env::remove_var("XDG_CONFIG_HOME"),
}
}
}
#[test]
fn ingest_code_cfg_defaults() {
let cfg: IngestCodeCfg = toml::from_str("").unwrap();
assert_eq!(cfg.max_file_bytes, 262_144);
assert_eq!(cfg.max_file_lines, 5_000);
assert!(cfg.skip_generated_header);
assert!(cfg.extra_skip_globs.is_empty());
assert_eq!(cfg.ast_chunk_max_lines, 200);
assert_eq!(cfg.fallback_lines_per_chunk, 80);
assert_eq!(cfg.fallback_lines_overlap, 20);
}
#[test]
fn ingest_code_cfg_user_override() {
let toml = r#"
max_file_bytes = 1048576
max_file_lines = 20000
skip_generated_header = false
extra_skip_globs = ["**/fixtures/**", "**/snapshots/**"]
"#;
let cfg: IngestCodeCfg = toml::from_str(toml).unwrap();
assert_eq!(cfg.max_file_bytes, 1_048_576);
assert_eq!(cfg.max_file_lines, 20_000);
assert!(!cfg.skip_generated_header);
assert_eq!(cfg.extra_skip_globs.len(), 2);
}
#[test]
fn config_with_ingest_code_section() {
// Build a full valid Config serialization and patch only the
// [ingest.code] field we care about — avoids having to enumerate
// every required Config field in the test fixture.
let base = Config::defaults();
let mut toml_text = toml::to_string(&base).unwrap();
// Inject max_file_bytes override into the [ingest.code] table.
toml_text = toml_text.replace("max_file_bytes = 262144", "max_file_bytes = 524288");
let cfg: Config = toml::from_str(&toml_text).unwrap();
assert_eq!(cfg.ingest.code.max_file_bytes, 524_288);
}
}
#[cfg(test)]
mod fb27_tests {
use super::*;
use std::path::PathBuf;
#[test]
fn config_invalid_carries_path_and_cause() {
let nonexistent = PathBuf::from("/this/path/should/not/exist/kebab.toml");
let err = Config::from_file(&nonexistent).unwrap_err();
let signal = err
.downcast_ref::<ConfigInvalid>()
.expect("from_file error should downcast to ConfigInvalid");
assert_eq!(signal.path, nonexistent);
assert!(!signal.cause.is_empty(), "cause should be non-empty");
}
#[test]
fn config_invalid_on_malformed_toml() {
let dir = tempfile::tempdir().unwrap();
let p = dir.path().join("bad.toml");
std::fs::write(&p, "this is not [valid toml").unwrap();
let err = Config::from_file(&p).unwrap_err();
let signal = err
.downcast_ref::<ConfigInvalid>()
.expect("malformed TOML should downcast to ConfigInvalid");
assert_eq!(signal.path, p);
assert!(!signal.cause.is_empty(), "cause should be non-empty");
}
/// Spec §Config: a negative `stale_threshold_days` in TOML must be
/// rejected at load time (not silently coerced or ignored). serde's
/// `u32` type-check surfaces the failure as a parse error, which
/// `from_file` wraps into `ConfigInvalid`. CLI's `error_classify`
/// downcasts this and emits `error.v1.code = "config_invalid"`.
#[test]
fn file_negative_stale_threshold_returns_config_invalid() {
let dir = tempfile::tempdir().unwrap();
let p = dir.path().join("neg.toml");
// Build a minimally valid TOML and override only the field
// under test — this isolates the failure to the negative
// value rather than missing required sections.
let cfg = Config::defaults();
let mut toml_text = toml::to_string(&cfg).expect("default round-trips");
assert!(
toml_text.contains("stale_threshold_days = 30"),
"default value drifted; update test fixture"
);
toml_text = toml_text.replace("stale_threshold_days = 30", "stale_threshold_days = -5");
std::fs::write(&p, &toml_text).unwrap();
let err = Config::from_file(&p).unwrap_err();
let signal = err
.downcast_ref::<ConfigInvalid>()
.expect("negative stale_threshold_days should downcast to ConfigInvalid");
assert_eq!(signal.path, p);
assert!(
signal.cause.contains("parse_failed"),
"expected parse_failed cause, got: {}",
signal.cause
);
}
#[test]
fn config_load_explicit_nonexistent_path_returns_config_not_found() {
// Bug #10: --config /tmp/nonexistent.toml → silent fallback 금지.
let p = std::path::Path::new("/tmp/__kebab_bugfix3_nonexistent.toml");
assert!(!p.exists(), "test precondition: path must not exist");
let err = Config::load(Some(p)).expect_err("expected ConfigNotFound");
let signal = err
.downcast_ref::<ConfigNotFound>()
.expect("from_load error should downcast to ConfigNotFound");
assert_eq!(signal.path, p.to_path_buf());
}
#[test]
fn pdf_ocr_request_timeout_default_is_180s() {
// Bug #11 (dogfood 2026-05-27 + follow-up 2026-05-28):
// default 600s → 60s → 180s (sweet spot 점진적 축소 정책).
let cfg = PdfOcrCfg::defaults();
assert_eq!(
cfg.request_timeout_secs, 180,
"pdf.ocr.request_timeout_secs default must be 180s (Bug #11, HOTFIXES 2026-05-28)"
);
}
}