p0-1: kb-config schema + XDG path resolution
Adds the kb-config crate per design §6. Provides the frozen Config schema (§6.4) with serde + toml round-trip, defaults() that exactly match the reference values (e.g. score_gate=0.30, target_tokens=500, embedding.dimensions=384, rrf_k=60), and XDG path resolvers that honor XDG_CONFIG_HOME / XDG_DATA_HOME / XDG_CACHE_HOME / XDG_STATE_HOME. Layer order in load(): defaults → file → env (KB_<SECTION>_<KEY>); CLI overrides apply later in kb-cli. Env mapping covers the keys needed by P0 smoke tests; the rest land as their config sections wire up. 5 unit tests cover serde round-trip, defaults pinned to design, KB_RAG_SCORE_GATE / KB_SEARCH_DEFAULT_K env override, and XDG_CONFIG_HOME handling. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
17
crates/kb-config/Cargo.toml
Normal file
17
crates/kb-config/Cargo.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "kb-config"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Config schema + XDG path resolution"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
anyhow = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
toml = "0.8"
|
||||
dirs = "5"
|
||||
330
crates/kb-config/src/lib.rs
Normal file
330
crates/kb-config/src/lib.rs
Normal file
@@ -0,0 +1,330 @@
|
||||
//! `kb-config` — `Config` schema and XDG path resolution (§6).
|
||||
//!
|
||||
//! Layer order (`Config::load`): defaults → file → env (`KB_<SECTION>_<KEY>`).
|
||||
//! CLI overrides land later, applied by `kb-cli` after `Config::load`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Config {
|
||||
pub schema_version: u32,
|
||||
pub workspace: WorkspaceCfg,
|
||||
pub storage: StorageCfg,
|
||||
pub indexing: IndexingCfg,
|
||||
pub chunking: ChunkingCfg,
|
||||
pub models: ModelsCfg,
|
||||
pub search: SearchCfg,
|
||||
pub rag: RagCfg,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspaceCfg {
|
||||
pub root: String,
|
||||
pub include: Vec<String>,
|
||||
pub exclude: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StorageCfg {
|
||||
pub data_dir: String,
|
||||
pub sqlite: String,
|
||||
pub vector_dir: String,
|
||||
pub asset_dir: String,
|
||||
pub artifact_dir: String,
|
||||
pub model_dir: String,
|
||||
pub runs_dir: String,
|
||||
pub copy_threshold_mb: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IndexingCfg {
|
||||
pub max_parallel_extractors: u32,
|
||||
pub max_parallel_embeddings: u32,
|
||||
pub watch_filesystem: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ChunkingCfg {
|
||||
pub target_tokens: usize,
|
||||
pub overlap_tokens: usize,
|
||||
pub respect_markdown_headings: bool,
|
||||
pub chunker_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelsCfg {
|
||||
pub embedding: EmbeddingModelCfg,
|
||||
pub llm: LlmCfg,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EmbeddingModelCfg {
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub version: String,
|
||||
pub dimensions: usize,
|
||||
pub batch_size: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct LlmCfg {
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub context_tokens: usize,
|
||||
pub endpoint: String,
|
||||
pub temperature: f32,
|
||||
pub seed: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SearchCfg {
|
||||
pub default_k: usize,
|
||||
pub hybrid_fusion: String,
|
||||
pub rrf_k: u32,
|
||||
pub snippet_chars: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RagCfg {
|
||||
pub prompt_template_version: String,
|
||||
pub score_gate: f32,
|
||||
pub explain_default: bool,
|
||||
pub max_context_tokens: usize,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Defaults per design §6.4.
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
schema_version: 1,
|
||||
workspace: WorkspaceCfg {
|
||||
root: "~/KnowledgeBase".to_string(),
|
||||
include: vec!["**/*.md".to_string()],
|
||||
exclude: vec![
|
||||
".git/**".to_string(),
|
||||
"node_modules/**".to_string(),
|
||||
".obsidian/**".to_string(),
|
||||
],
|
||||
},
|
||||
storage: StorageCfg {
|
||||
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kb".to_string(),
|
||||
sqlite: "{data_dir}/kb.sqlite".to_string(),
|
||||
vector_dir: "{data_dir}/lancedb".to_string(),
|
||||
asset_dir: "{data_dir}/assets".to_string(),
|
||||
artifact_dir: "{data_dir}/artifacts".to_string(),
|
||||
model_dir: "{data_dir}/models".to_string(),
|
||||
runs_dir: "{data_dir}/runs".to_string(),
|
||||
copy_threshold_mb: 100,
|
||||
},
|
||||
indexing: IndexingCfg {
|
||||
max_parallel_extractors: 2,
|
||||
max_parallel_embeddings: 1,
|
||||
watch_filesystem: false,
|
||||
},
|
||||
chunking: ChunkingCfg {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: true,
|
||||
chunker_version: "md-heading-v1".to_string(),
|
||||
},
|
||||
models: ModelsCfg {
|
||||
embedding: EmbeddingModelCfg {
|
||||
provider: "fastembed".to_string(),
|
||||
model: "multilingual-e5-small".to_string(),
|
||||
version: "v1".to_string(),
|
||||
dimensions: 384,
|
||||
batch_size: 64,
|
||||
},
|
||||
llm: LlmCfg {
|
||||
provider: "ollama".to_string(),
|
||||
model: "qwen2.5:14b-instruct".to_string(),
|
||||
context_tokens: 32768,
|
||||
endpoint: "http://127.0.0.1:11434".to_string(),
|
||||
temperature: 0.0,
|
||||
seed: 0,
|
||||
},
|
||||
},
|
||||
search: SearchCfg {
|
||||
default_k: 10,
|
||||
hybrid_fusion: "rrf".to_string(),
|
||||
rrf_k: 60,
|
||||
snippet_chars: 220,
|
||||
},
|
||||
rag: RagCfg {
|
||||
prompt_template_version: "rag-v1".to_string(),
|
||||
score_gate: 0.30,
|
||||
explain_default: false,
|
||||
max_context_tokens: 8000,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Read config from disk and merge env overrides on top of it. If the
|
||||
/// file is missing, defaults are used (so `kb doctor` runs with no
|
||||
/// prior `kb init`).
|
||||
pub fn load(path: Option<&Path>) -> anyhow::Result<Self> {
|
||||
let from_disk = match path {
|
||||
Some(p) if p.exists() => Self::from_file(p)?,
|
||||
Some(_) => Self::defaults(),
|
||||
None => {
|
||||
let p = Self::xdg_config_path();
|
||||
if p.exists() {
|
||||
Self::from_file(&p)?
|
||||
} else {
|
||||
Self::defaults()
|
||||
}
|
||||
}
|
||||
};
|
||||
let env: HashMap<String, String> = std::env::vars().collect();
|
||||
Ok(from_disk.apply_env(&env))
|
||||
}
|
||||
|
||||
pub fn from_file(path: &Path) -> anyhow::Result<Self> {
|
||||
let text = std::fs::read_to_string(path)?;
|
||||
let cfg: Self = toml::from_str(&text)?;
|
||||
Ok(cfg)
|
||||
}
|
||||
|
||||
/// Apply `KB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
|
||||
pub fn apply_env(mut self, env: &HashMap<String, String>) -> Self {
|
||||
for (k, v) in env {
|
||||
if !k.starts_with("KB_") {
|
||||
continue;
|
||||
}
|
||||
// Match a small, intentional whitelist for P0 — full env→config
|
||||
// mapping lands when the rest of the schema is wired up.
|
||||
match k.as_str() {
|
||||
"KB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
|
||||
"KB_RAG_SCORE_GATE" => {
|
||||
if let Ok(f) = v.parse::<f32>() {
|
||||
self.rag.score_gate = f;
|
||||
}
|
||||
}
|
||||
"KB_RAG_EXPLAIN_DEFAULT" => {
|
||||
self.rag.explain_default = matches!(v.as_str(), "1" | "true" | "yes");
|
||||
}
|
||||
"KB_SEARCH_DEFAULT_K" => {
|
||||
if let Ok(k) = v.parse::<usize>() {
|
||||
self.search.default_k = k;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// `~/.config/kb/config.toml` (honors `XDG_CONFIG_HOME`).
|
||||
pub fn xdg_config_path() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_CONFIG_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb").join("config.toml");
|
||||
}
|
||||
}
|
||||
match dirs::config_dir() {
|
||||
Some(d) => d.join("kb").join("config.toml"),
|
||||
None => PathBuf::from("./kb/config.toml"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.local/share/kb` (honors `XDG_DATA_HOME`).
|
||||
pub fn xdg_data_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_DATA_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
match dirs::data_dir() {
|
||||
Some(d) => d.join("kb"),
|
||||
None => PathBuf::from("./kb-data"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.cache/kb` (honors `XDG_CACHE_HOME`).
|
||||
pub fn xdg_cache_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_CACHE_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
match dirs::cache_dir() {
|
||||
Some(d) => d.join("kb"),
|
||||
None => PathBuf::from("./kb-cache"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.local/state/kb` (honors `XDG_STATE_HOME`).
|
||||
pub fn xdg_state_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_STATE_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
// `dirs` doesn't expose state_dir on all platforms; fall back to
|
||||
// `$HOME/.local/state/kb` if XDG_STATE_HOME is unset.
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home.join(".local").join("state").join("kb");
|
||||
}
|
||||
PathBuf::from("./kb-state")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Sets an environment variable for the lifetime of the guard and puts
    /// the previous state back on drop — including when the test panics, so
    /// a failed assertion cannot leak the override into other tests.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        fn set(key: &'static str, value: &str) -> Self {
            // `var_os` keeps a non-UTF-8 previous value intact.
            let previous = std::env::var_os(key);
            // SAFETY: mutating the process environment is only sound while
            // no other thread accesses it. The test harness runs tests on
            // parallel threads by default, so this relies on no other test
            // in this module reading or writing the environment (the other
            // tests here pass an explicit `HashMap` to `apply_env`).
            unsafe {
                std::env::set_var(key, value);
            }
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            // SAFETY: same single-writer assumption as in `EnvVarGuard::set`.
            unsafe {
                match self.previous.take() {
                    Some(value) => std::env::set_var(self.key, value),
                    None => std::env::remove_var(self.key),
                }
            }
        }
    }

    #[test]
    fn defaults_are_serde_roundtrip_stable() {
        let original = Config::defaults();
        let toml_text = toml::to_string(&original).unwrap();
        let reparsed: Config = toml::from_str(&toml_text).unwrap();
        assert_eq!(original, reparsed);
    }

    #[test]
    fn defaults_match_design_64_score_gate() {
        let cfg = Config::defaults();
        assert_eq!(cfg.rag.score_gate, 0.30);
        assert_eq!(cfg.chunking.target_tokens, 500);
        assert_eq!(cfg.models.embedding.dimensions, 384);
        assert_eq!(cfg.search.rrf_k, 60);
    }

    #[test]
    fn env_override_score_gate() {
        let env = HashMap::from([("KB_RAG_SCORE_GATE".to_string(), "0.5".to_string())]);
        let cfg = Config::defaults().apply_env(&env);
        assert!((cfg.rag.score_gate - 0.5).abs() < 1e-6);
    }

    #[test]
    fn env_override_search_k() {
        let env = HashMap::from([("KB_SEARCH_DEFAULT_K".to_string(), "25".to_string())]);
        let cfg = Config::defaults().apply_env(&env);
        assert_eq!(cfg.search.default_k, 25);
    }

    #[test]
    fn xdg_paths_honor_env() {
        // The guard restores XDG_CONFIG_HOME when it goes out of scope, so
        // the override never outlives this test.
        let _guard = EnvVarGuard::set("XDG_CONFIG_HOME", "/tmp/kbtest-xdg-config");
        let resolved = Config::xdg_config_path();
        assert_eq!(
            resolved,
            PathBuf::from("/tmp/kbtest-xdg-config/kb/config.toml")
        );
    }
}
|
||||
Reference in New Issue
Block a user