diff --git a/crates/kb-config/Cargo.toml b/crates/kb-config/Cargo.toml
new file mode 100644
index 0000000..000ccc2
--- /dev/null
+++ b/crates/kb-config/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "kb-config"
+version = { workspace = true }
+edition = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
+repository = { workspace = true }
+description = "Config schema + XDG path resolution"
+
+[dependencies]
+kb-core = { path = "../kb-core" }
+anyhow = { workspace = true }
+thiserror = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+toml = "0.8"
+dirs = "5"
diff --git a/crates/kb-config/src/lib.rs b/crates/kb-config/src/lib.rs
new file mode 100644
index 0000000..40b35ec
--- /dev/null
+++ b/crates/kb-config/src/lib.rs
@@ -0,0 +1,480 @@
+//! `kb-config` — `Config` schema and XDG path resolution (§6).
+//!
+//! Layer order (`Config::load`): defaults → file → env (`KB_<SECTION>_<KEY>`).
+//! CLI overrides land later, applied by `kb-cli` after `Config::load`.
+//!
+//! Every section is `#[serde(default)]`, so a config file only has to list the
+//! keys it changes; anything it omits keeps its `Config::defaults` value.
+
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+
+use anyhow::Context;
+use serde::{Deserialize, Serialize};
+
+/// Root of the configuration schema: one field per config-file section.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct Config {
+    pub schema_version: u32,
+    pub workspace: WorkspaceCfg,
+    pub storage: StorageCfg,
+    pub indexing: IndexingCfg,
+    pub chunking: ChunkingCfg,
+    pub models: ModelsCfg,
+    pub search: SearchCfg,
+    pub rag: RagCfg,
+}
+
+/// `[workspace]`: root directory plus include/exclude patterns.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct WorkspaceCfg {
+    pub root: String,
+    pub include: Vec<String>,
+    pub exclude: Vec<String>,
+}
+
+/// `[storage]`: on-disk locations. Strings are kept exactly as written; nothing
+/// in this file expands the `~`, `${…}` or `{data_dir}` placeholders.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct StorageCfg {
+    pub data_dir: String,
+    pub sqlite: String,
+    pub vector_dir: String,
+    pub asset_dir: String,
+    pub artifact_dir: String,
+    pub model_dir: String,
+    pub runs_dir: String,
+    pub copy_threshold_mb: u64,
+}
+
+/// `[indexing]`: parallelism limits and the filesystem-watch switch.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct IndexingCfg {
+    pub max_parallel_extractors: u32,
+    pub max_parallel_embeddings: u32,
+    pub watch_filesystem: bool,
+}
+
+/// `[chunking]`: chunk sizing and chunker version.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct ChunkingCfg {
+    pub target_tokens: usize,
+    pub overlap_tokens: usize,
+    pub respect_markdown_headings: bool,
+    pub chunker_version: String,
+}
+
+/// `[models]`: embedding-model and LLM settings.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct ModelsCfg {
+    pub embedding: EmbeddingModelCfg,
+    pub llm: LlmCfg,
+}
+
+/// `[models.embedding]` section.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct EmbeddingModelCfg {
+    pub provider: String,
+    pub model: String,
+    pub version: String,
+    pub dimensions: usize,
+    pub batch_size: usize,
+}
+
+/// `[models.llm]` section.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct LlmCfg {
+    pub provider: String,
+    pub model: String,
+    pub context_tokens: usize,
+    pub endpoint: String,
+    pub temperature: f32,
+    pub seed: u64,
+}
+
+/// `[search]` section.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct SearchCfg {
+    pub default_k: usize,
+    pub hybrid_fusion: String,
+    pub rrf_k: u32,
+    pub snippet_chars: usize,
+}
+
+/// `[rag]` section.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct RagCfg {
+    pub prompt_template_version: String,
+    pub score_gate: f32,
+    pub explain_default: bool,
+    pub max_context_tokens: usize,
+}
+
+impl Config {
+    /// Defaults per design §6.4.
+    pub fn defaults() -> Self {
+        Self {
+            schema_version: 1,
+            workspace: WorkspaceCfg {
+                root: "~/KnowledgeBase".to_string(),
+                include: vec!["**/*.md".to_string()],
+                exclude: vec![
+                    ".git/**".to_string(),
+                    "node_modules/**".to_string(),
+                    ".obsidian/**".to_string(),
+                ],
+            },
+            storage: StorageCfg {
+                data_dir: "${XDG_DATA_HOME:-~/.local/share}/kb".to_string(),
+                sqlite: "{data_dir}/kb.sqlite".to_string(),
+                vector_dir: "{data_dir}/lancedb".to_string(),
+                asset_dir: "{data_dir}/assets".to_string(),
+                artifact_dir: "{data_dir}/artifacts".to_string(),
+                model_dir: "{data_dir}/models".to_string(),
+                runs_dir: "{data_dir}/runs".to_string(),
+                copy_threshold_mb: 100,
+            },
+            indexing: IndexingCfg {
+                max_parallel_extractors: 2,
+                max_parallel_embeddings: 1,
+                watch_filesystem: false,
+            },
+            chunking: ChunkingCfg {
+                target_tokens: 500,
+                overlap_tokens: 80,
+                respect_markdown_headings: true,
+                chunker_version: "md-heading-v1".to_string(),
+            },
+            models: ModelsCfg {
+                embedding: EmbeddingModelCfg {
+                    provider: "fastembed".to_string(),
+                    model: "multilingual-e5-small".to_string(),
+                    version: "v1".to_string(),
+                    dimensions: 384,
+                    batch_size: 64,
+                },
+                llm: LlmCfg {
+                    provider: "ollama".to_string(),
+                    model: "qwen2.5:14b-instruct".to_string(),
+                    context_tokens: 32768,
+                    endpoint: "http://127.0.0.1:11434".to_string(),
+                    temperature: 0.0,
+                    seed: 0,
+                },
+            },
+            search: SearchCfg {
+                default_k: 10,
+                hybrid_fusion: "rrf".to_string(),
+                rrf_k: 60,
+                snippet_chars: 220,
+            },
+            rag: RagCfg {
+                prompt_template_version: "rag-v1".to_string(),
+                score_gate: 0.30,
+                explain_default: false,
+                max_context_tokens: 8000,
+            },
+        }
+    }
+
+    /// Read config from disk and merge env overrides on top of it. If the
+    /// file is missing, defaults are used (so `kb doctor` runs with no
+    /// prior `kb init`).
+    ///
+    /// `path = None` selects [`Config::xdg_config_path`].
+    ///
+    /// # Errors
+    /// Returns an error when the file exists but cannot be read or parsed.
+    pub fn load(path: Option<&Path>) -> anyhow::Result<Self> {
+        let resolved = path.map_or_else(Self::xdg_config_path, Path::to_path_buf);
+        let from_disk = if resolved.exists() {
+            Self::from_file(&resolved)?
+        } else {
+            Self::defaults()
+        };
+        // `std::env::vars()` panics on the first variable that is not valid
+        // Unicode; walk `vars_os()` and skip such entries instead. Config
+        // values are `String`s, so those entries could not be applied anyway.
+        let env: HashMap<String, String> = std::env::vars_os()
+            .filter_map(|(key, value)| Some((key.into_string().ok()?, value.into_string().ok()?)))
+            .collect();
+        Ok(from_disk.apply_env(&env))
+    }
+
+    /// Parse the TOML file at `path`; keys the file omits keep their defaults.
+    ///
+    /// # Errors
+    /// Returns an error, annotated with `path`, when reading or parsing fails.
+    pub fn from_file(path: &Path) -> anyhow::Result<Self> {
+        let text = std::fs::read_to_string(path)
+            .with_context(|| format!("failed to read config file {}", path.display()))?;
+        toml::from_str(&text)
+            .with_context(|| format!("failed to parse config file {}", path.display()))
+    }
+
+    /// Apply `KB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored, and
+    /// so are values that do not parse for a known key (the field keeps its
+    /// current value).
+    pub fn apply_env(mut self, env: &HashMap<String, String>) -> Self {
+        for (key, value) in env {
+            // Match a small, intentional whitelist for P0 — full env→config
+            // mapping lands when the rest of the schema is wired up.
+            match key.as_str() {
+                "KB_WORKSPACE_ROOT" => self.workspace.root = value.clone(),
+                "KB_RAG_SCORE_GATE" => {
+                    // `"NaN"` and `"inf"` parse as valid `f32`s but are useless as
+                    // a threshold, so only finite values are accepted.
+                    let parsed = value.trim().parse::<f32>().ok();
+                    if let Some(gate) = parsed.filter(|g| g.is_finite()) {
+                        self.rag.score_gate = gate;
+                    }
+                }
+                "KB_RAG_EXPLAIN_DEFAULT" => {
+                    if let Some(flag) = parse_bool(value) {
+                        self.rag.explain_default = flag;
+                    }
+                }
+                "KB_SEARCH_DEFAULT_K" => {
+                    if let Ok(default_k) = value.trim().parse::<usize>() {
+                        self.search.default_k = default_k;
+                    }
+                }
+                _ => {}
+            }
+        }
+        self
+    }
+
+    /// `~/.config/kb/config.toml` (honors `XDG_CONFIG_HOME`).
+    pub fn xdg_config_path() -> PathBuf {
+        xdg_kb_dir("XDG_CONFIG_HOME")
+            .or_else(|| dirs::config_dir().map(|dir| dir.join("kb")))
+            .unwrap_or_else(|| PathBuf::from("./kb"))
+            .join("config.toml")
+    }
+
+    /// `~/.local/share/kb` (honors `XDG_DATA_HOME`).
+    pub fn xdg_data_dir() -> PathBuf {
+        xdg_kb_dir("XDG_DATA_HOME")
+            .or_else(|| dirs::data_dir().map(|dir| dir.join("kb")))
+            .unwrap_or_else(|| PathBuf::from("./kb-data"))
+    }
+
+    /// `~/.cache/kb` (honors `XDG_CACHE_HOME`).
+    pub fn xdg_cache_dir() -> PathBuf {
+        xdg_kb_dir("XDG_CACHE_HOME")
+            .or_else(|| dirs::cache_dir().map(|dir| dir.join("kb")))
+            .unwrap_or_else(|| PathBuf::from("./kb-cache"))
+    }
+
+    /// `~/.local/state/kb` (honors `XDG_STATE_HOME`).
+    pub fn xdg_state_dir() -> PathBuf {
+        // `dirs` doesn't expose state_dir on all platforms; fall back to
+        // `$HOME/.local/state/kb` if XDG_STATE_HOME is unset.
+        xdg_kb_dir("XDG_STATE_HOME")
+            .or_else(|| dirs::home_dir().map(|home| home.join(".local").join("state").join("kb")))
+            .unwrap_or_else(|| PathBuf::from("./kb-state"))
+    }
+}
+
+// `#[serde(default)]` needs `Default` on every section. Each impl projects the
+// matching field out of `Config::defaults()` so the values live in one place.
+impl Default for Config {
+    fn default() -> Self {
+        Self::defaults()
+    }
+}
+
+impl Default for WorkspaceCfg {
+    fn default() -> Self {
+        Config::defaults().workspace
+    }
+}
+
+impl Default for StorageCfg {
+    fn default() -> Self {
+        Config::defaults().storage
+    }
+}
+
+impl Default for IndexingCfg {
+    fn default() -> Self {
+        Config::defaults().indexing
+    }
+}
+
+impl Default for ChunkingCfg {
+    fn default() -> Self {
+        Config::defaults().chunking
+    }
+}
+
+impl Default for ModelsCfg {
+    fn default() -> Self {
+        Config::defaults().models
+    }
+}
+
+impl Default for EmbeddingModelCfg {
+    fn default() -> Self {
+        Config::defaults().models.embedding
+    }
+}
+
+impl Default for LlmCfg {
+    fn default() -> Self {
+        Config::defaults().models.llm
+    }
+}
+
+impl Default for SearchCfg {
+    fn default() -> Self {
+        Config::defaults().search
+    }
+}
+
+impl Default for RagCfg {
+    fn default() -> Self {
+        Config::defaults().rag
+    }
+}
+
+/// Interpret an env-var string as a boolean, ignoring case and surrounding
+/// whitespace. Returns `None` for anything outside the accepted spellings.
+fn parse_bool(raw: &str) -> Option<bool> {
+    match raw.trim().to_ascii_lowercase().as_str() {
+        "1" | "true" | "yes" | "on" => Some(true),
+        "0" | "false" | "no" | "off" => Some(false),
+        _ => None,
+    }
+}
+
+/// `$<var>/kb` when `var` is set to a non-empty value, otherwise `None`.
+///
+/// Reads the variable with `var_os`, so a non-UTF-8 directory is still honored.
+fn xdg_kb_dir(var: &str) -> Option<PathBuf> {
+    std::env::var_os(var)
+        .filter(|value| !value.is_empty())
+        .map(|value| PathBuf::from(value).join("kb"))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Sets one environment variable for the lifetime of the guard and puts the
+    /// previous value back on drop, even when the test body panics.
+    struct EnvVarGuard {
+        key: &'static str,
+        previous: Option<std::ffi::OsString>,
+    }
+
+    impl EnvVarGuard {
+        fn set(key: &'static str, value: &str) -> Self {
+            let previous = std::env::var_os(key);
+            // SAFETY: the harness runs tests on several threads, but no other
+            // test in this module reads or writes the process environment.
+            unsafe {
+                std::env::set_var(key, value);
+            }
+            Self { key, previous }
+        }
+    }
+
+    impl Drop for EnvVarGuard {
+        fn drop(&mut self) {
+            // SAFETY: same invariant as in `set`.
+            unsafe {
+                match self.previous.take() {
+                    Some(value) => std::env::set_var(self.key, value),
+                    None => std::env::remove_var(self.key),
+                }
+            }
+        }
+    }
+
+    /// One-entry env map for `apply_env` tests.
+    fn env_of(key: &str, value: &str) -> HashMap<String, String> {
+        HashMap::from([(key.to_string(), value.to_string())])
+    }
+
+    #[test]
+    fn defaults_are_serde_roundtrip_stable() {
+        let c = Config::defaults();
+        let toml_text = toml::to_string(&c).unwrap();
+        let back: Config = toml::from_str(&toml_text).unwrap();
+        assert_eq!(c, back);
+    }
+
+    #[test]
+    fn defaults_match_design_64_score_gate() {
+        let c = Config::defaults();
+        assert_eq!(c.rag.score_gate, 0.30);
+        assert_eq!(c.chunking.target_tokens, 500);
+        assert_eq!(c.models.embedding.dimensions, 384);
+        assert_eq!(c.search.rrf_k, 60);
+    }
+
+    #[test]
+    fn partial_file_layers_over_defaults() {
+        let cfg: Config = toml::from_str("[rag]\nscore_gate = 0.5\n").unwrap();
+        let mut expected = Config::defaults();
+        expected.rag.score_gate = 0.5;
+        assert_eq!(cfg, expected);
+
+        let empty: Config = toml::from_str("").unwrap();
+        assert_eq!(empty, Config::defaults());
+    }
+
+    #[test]
+    fn env_override_score_gate() {
+        let c = Config::defaults().apply_env(&env_of("KB_RAG_SCORE_GATE", "0.5"));
+        assert!((c.rag.score_gate - 0.5).abs() < 1e-6);
+    }
+
+    #[test]
+    fn env_override_score_gate_rejects_non_finite() {
+        for raw in ["NaN", "inf", "not-a-number"] {
+            let c = Config::defaults().apply_env(&env_of("KB_RAG_SCORE_GATE", raw));
+            assert_eq!(c.rag.score_gate, 0.30, "value {raw:?} must be ignored");
+        }
+    }
+
+    #[test]
+    fn env_override_search_k() {
+        let c = Config::defaults().apply_env(&env_of("KB_SEARCH_DEFAULT_K", "25"));
+        assert_eq!(c.search.default_k, 25);
+    }
+
+    #[test]
+    fn env_override_explain_default() {
+        let explain = |base: bool, raw: &str| {
+            let mut cfg = Config::defaults();
+            cfg.rag.explain_default = base;
+            cfg.apply_env(&env_of("KB_RAG_EXPLAIN_DEFAULT", raw)).rag.explain_default
+        };
+        assert!(explain(false, "1"));
+        assert!(explain(false, "TRUE"));
+        assert!(!explain(true, "no"));
+        // Unrecognised spellings leave the current value alone.
+        assert!(explain(true, "maybe"));
+        assert!(!explain(false, "maybe"));
+    }
+
+    #[test]
+    fn xdg_paths_honor_env() {
+        let _guard = EnvVarGuard::set("XDG_CONFIG_HOME", "/tmp/kbtest-xdg-config");
+        let p = Config::xdg_config_path();
+        assert_eq!(p, PathBuf::from("/tmp/kbtest-xdg-config/kb/config.toml"));
+    }
+}