From 58a11cc2b87df0e0604c535ced94b4f88d0696c9 Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 1 May 2026 18:01:09 +0000 Subject: [PATCH 1/2] =?UTF-8?q?feat(p5-1):=20kb-eval=20crate=20=E2=80=94?= =?UTF-8?q?=20golden-fixture=20runner=20+=20eval=20persistence?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - new kb-eval crate: load_golden_set (YAML) + run_eval (per-query search/ask + persistence) - new kb-store-sqlite::eval module: record_eval_run_with_results (transactional), document_exists / chunk_exists probes - fixtures/golden_queries.yaml: 5-entry KO+EN template - tests: 13 pass (loader: parse, dup-id, missing chunk_id; runner: elapsed, snapshot, error capture, JSONL, determinism, persistence, config_snapshot) - per_query.jsonl mirror written to runs_dir// - temperature=0 + fixed seed → byte-identical per_query.jsonl (lexical) deviations from spec (documented in code): - run_id uses uuid::Uuid::now_v7().simple() (timestamp-ordered hex) instead of ULID — uuid already in workspace deps - load_golden_set_validated kept #[cfg(test)] pub(crate) — production inlines validate_against_db - snapshot fixture uses normalized projection (id/query/mode/first_hit) — full byte-determinism covered by separate test - index_version in config_snapshot left null (composed per call by kb-app, not config-level) deferred to follow-up: - App reuse across queries (currently rebuilds App per query) - expand_path hoist to kb-config (3 crate clones now) - --max-queries flag (deferred to P5-2 per updated spec) Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 32 ++ Cargo.toml | 4 + crates/kb-eval/Cargo.toml | 30 ++ crates/kb-eval/src/lib.rs | 29 ++ crates/kb-eval/src/loader.rs | 229 ++++++++++ crates/kb-eval/src/runner.rs | 330 ++++++++++++++ crates/kb-eval/src/types.rs | 87 ++++ crates/kb-eval/tests/fixtures/eval/run-1.json | 30 ++ crates/kb-eval/tests/loader.rs | 59 +++ crates/kb-eval/tests/runner.rs | 403 ++++++++++++++++++ crates/kb-store-sqlite/src/eval.rs | 161 +++++++ crates/kb-store-sqlite/src/lib.rs | 2 + fixtures/golden_queries.yaml | 45 ++ tasks/p5/p5-1-golden-fixture-runner.md | 4 +- 14 files changed, 1443 insertions(+), 2 deletions(-) create mode 100644 crates/kb-eval/Cargo.toml create mode 100644 crates/kb-eval/src/lib.rs create mode 100644 crates/kb-eval/src/loader.rs create mode 100644 crates/kb-eval/src/runner.rs create mode 100644 crates/kb-eval/src/types.rs create mode 100644 crates/kb-eval/tests/fixtures/eval/run-1.json create mode 100644 crates/kb-eval/tests/loader.rs create mode 100644 crates/kb-eval/tests/runner.rs create mode 100644 crates/kb-store-sqlite/src/eval.rs create mode 100644 fixtures/golden_queries.yaml diff --git a/Cargo.lock b/Cargo.lock index 6b598dc..87b04ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3478,6 +3478,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "kb-eval" +version = "0.1.0" +dependencies = [ + "anyhow", + "kb-app", + "kb-config", + "kb-core", + "kb-store-sqlite", + "rusqlite", + "serde", + "serde_json", + "serde_yaml", + "tempfile", + "time", + "tracing", + "uuid", +] + [[package]] name = "kb-llm" version = "0.1.0" @@ -6380,6 +6399,19 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.14.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "serde_yaml_ng" version = "0.10.0" diff --git a/Cargo.toml b/Cargo.toml index 67e4698..91a6252 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "crates/kb-rag", "crates/kb-app", "crates/kb-cli", + "crates/kb-eval", ] [workspace.package] @@ -32,6 +33,9 @@ anyhow = "1" thiserror = "2" serde = { version = "1", features = ["derive"] } serde_json = "1" +# Golden-fixture loader (P5-1, kb-eval) parses YAML; pinned in the +# workspace so future eval-adjacent crates share the same major. +serde_yaml = "0.9" time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] } uuid = { version = "1", features = ["v7", "serde"] } blake3 = "1" diff --git a/crates/kb-eval/Cargo.toml b/crates/kb-eval/Cargo.toml new file mode 100644 index 0000000..4281ee4 --- /dev/null +++ b/crates/kb-eval/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kb-eval" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Golden-fixture eval runner: load YAML, drive kb-app search/ask, persist eval_runs / eval_query_results / per_query.jsonl" + +[dependencies] +# Allowed deps per p5-1 spec — domain types + facade only. +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +kb-app = { path = "../kb-app" } +kb-store-sqlite = { path = "../kb-store-sqlite" } +serde = { workspace = true } +serde_json = { workspace = true } +serde_yaml = { workspace = true } +time = { workspace = true } +tracing = { workspace = true } +anyhow = { workspace = true } +# `uuid::Uuid::now_v7()` powers the `run_`-shaped run_id; +# v7 UUIDs are timestamp-ordered (same monotonicity as ULID) and `uuid` +# is already in workspace deps, so we avoid pulling a new ULID crate +# just for the lower-cased timestamp prefix. +uuid = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +rusqlite = { workspace = true } diff --git a/crates/kb-eval/src/lib.rs b/crates/kb-eval/src/lib.rs new file mode 100644 index 0000000..9825949 --- /dev/null +++ b/crates/kb-eval/src/lib.rs @@ -0,0 +1,29 @@ +//! `kb-eval` — golden-fixture eval runner (P5-1). +//! +//! Loads `fixtures/golden_queries.yaml`, runs each entry through the +//! [`kb_app`] facade (lexical / vector / hybrid + optional RAG), and +//! persists results into `eval_runs` / `eval_query_results` plus +//! `runs_dir//per_query.jsonl` (design §5.7, §6.3). +//! +//! Metric computation lives in P5-2 (`kb-eval::metrics`); this crate is +//! the **data collector** only. +//! +//! ## Allowed deps (per task spec) +//! +//! `kb-core`, `kb-config`, `kb-app`, `kb-store-sqlite`, plus `serde`, +//! `serde_yaml`, `serde_json`, `time`, `tracing`, +//! `anyhow`, `uuid`. Retrieval / embedding / LLM crates are NOT +//! reachable here — every retrieval and `ask` call must go through +//! `kb-app`. +//! +//! ## `run_id` recipe +//! +//! `run_id` uses UUIDv7 simple — timestamp-ordered, lowercase hex. + +mod loader; +mod runner; +mod types; + +pub use loader::load_golden_set; +pub use runner::{run_eval, run_eval_with_config}; +pub use types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult}; diff --git a/crates/kb-eval/src/loader.rs b/crates/kb-eval/src/loader.rs new file mode 100644 index 0000000..5b24fa5 --- /dev/null +++ b/crates/kb-eval/src/loader.rs @@ -0,0 +1,229 @@ +//! Golden-set YAML loader. +//! +//! Two entry points: +//! +//! - [`load_golden_set`] — pure YAML parse + uniqueness check. Used by +//! tests that don't have a SQLite store handy. +//! - [`load_golden_set_validated`] — additionally verifies every +//! `expected_doc_id` / `expected_chunk_id` exists in the SQLite DB +//! the supplied [`kb_config::Config`] points at. Used by +//! [`crate::run_eval`] in production so a stale golden set fails +//! fast at run start. + +use std::collections::{BTreeSet, HashSet}; +use std::path::Path; + +use anyhow::{Context, Result, anyhow}; +use kb_store_sqlite::SqliteStore; + +use crate::types::GoldenQuery; + +/// Parse the YAML at `path` into a `Vec` and check that +/// every `id` is unique. +/// +/// The YAML is expected to be a top-level list of mappings. Required +/// fields per entry: `id`, `query`. All other fields default to empty / +/// `None` per [`GoldenQuery`]'s `serde(default)` annotations. +pub fn load_golden_set(path: &Path) -> Result> { + let bytes = + std::fs::read(path).with_context(|| format!("read golden YAML from {}", path.display()))?; + let queries: Vec = serde_yaml::from_slice(&bytes) + .with_context(|| format!("parse golden YAML at {}", path.display()))?; + check_unique_ids(&queries)?; + Ok(queries) +} + +/// Same as [`load_golden_set`] but additionally validates that every +/// `expected_doc_id` and `expected_chunk_id` referenced by the loaded +/// entries actually exists in the SQLite database `cfg` resolves to. +/// +/// Missing IDs are surfaced as a single sorted error listing every +/// offender, so curators can fix the whole set in one pass. +/// +/// Currently used only by the in-module tests below; production code +/// inlines `load_golden_set` + `validate_against_db` in +/// [`crate::run_eval_with_config`] so the validation can run against +/// an already-opened [`kb_config::Config`] without re-parsing YAML. +#[cfg(test)] +pub(crate) fn load_golden_set_validated( + yaml_path: &Path, + cfg: &kb_config::Config, +) -> Result> { + let queries = load_golden_set(yaml_path)?; + validate_against_db(&queries, cfg)?; + Ok(queries) +} + +fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> { + let mut seen: HashSet<&str> = HashSet::new(); + let mut dups: BTreeSet = BTreeSet::new(); + for q in queries { + if !seen.insert(q.id.as_str()) { + dups.insert(q.id.clone()); + } + } + if dups.is_empty() { + Ok(()) + } else { + let list: Vec = dups.into_iter().collect(); + Err(anyhow!("duplicate query id(s): {}", list.join(", "))) + } +} + +/// Read every doc_id / chunk_id referenced by `queries` and confirm +/// SQLite has rows for them. Builds a sorted, deduplicated error +/// message listing every missing ID. +pub(crate) fn validate_against_db(queries: &[GoldenQuery], cfg: &kb_config::Config) -> Result<()> { + // Short-circuit when there is nothing to validate — saves opening + // SQLite for golden sets that omit expected_*_ids entirely. + let needs_check = queries + .iter() + .any(|q| !q.expected_doc_ids.is_empty() || !q.expected_chunk_ids.is_empty()); + if !needs_check { + return Ok(()); + } + + let store = SqliteStore::open(cfg).context("open SqliteStore for golden validation")?; + store + .run_migrations() + .context("run migrations for golden validation")?; + + let mut missing_docs: BTreeSet = BTreeSet::new(); + let mut missing_chunks: BTreeSet = BTreeSet::new(); + + for q in queries { + for did in &q.expected_doc_ids { + let exists = store + .document_exists(&did.0) + .with_context(|| format!("probe document {}", did.0))?; + if !exists { + missing_docs.insert(did.0.clone()); + } + } + for cid in &q.expected_chunk_ids { + let exists = store + .chunk_exists(&cid.0) + .with_context(|| format!("probe chunk {}", cid.0))?; + if !exists { + missing_chunks.insert(cid.0.clone()); + } + } + } + + if missing_docs.is_empty() && missing_chunks.is_empty() { + return Ok(()); + } + + let mut parts: Vec = Vec::new(); + if !missing_docs.is_empty() { + parts.push(format!( + "missing doc_ids: {}", + missing_docs.into_iter().collect::>().join(", ") + )); + } + if !missing_chunks.is_empty() { + parts.push(format!( + "missing chunk_ids: {}", + missing_chunks.into_iter().collect::>().join(", ") + )); + } + Err(anyhow!( + "golden set references unknown IDs — {}", + parts.join("; ") + )) +} + +#[cfg(test)] +mod tests { + //! Tests that exercise the crate-private + //! [`load_golden_set_validated`]. The pure-parser cases live in + //! `tests/loader.rs`; only the validated-variant cases need to sit + //! next to the function so they can see the `pub(crate)` symbol. + use super::*; + use kb_config::Config; + use kb_store_sqlite::SqliteStore; + use rusqlite::params; + use std::fs; + use tempfile::tempdir; + + #[test] + fn rejects_unknown_expected_chunk_id() { + let tmp = tempdir().unwrap(); + let mut config = Config::defaults(); + config.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + + let store = SqliteStore::open(&config).unwrap(); + store.run_migrations().unwrap(); + seed_one_chunk(&store, "doc_present", "chunk_present"); + + let yaml_path = tmp.path().join("golden.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: hello\n expected_chunk_ids: [\"chunk_present\", \"chunk_missing\"]\n", + ) + .unwrap(); + + let err = load_golden_set_validated(&yaml_path, &config).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("missing chunk_ids"), "msg: {msg}"); + assert!(msg.contains("chunk_missing"), "msg: {msg}"); + assert!(!msg.contains("chunk_present"), "msg: {msg}"); + } + + #[test] + fn accepts_resolved_ids() { + let tmp = tempdir().unwrap(); + let mut config = Config::defaults(); + config.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + + let store = SqliteStore::open(&config).unwrap(); + store.run_migrations().unwrap(); + seed_one_chunk(&store, "doc_present", "chunk_present"); + + let yaml_path = tmp.path().join("golden.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: hello\n expected_doc_ids: [\"doc_present\"]\n expected_chunk_ids: [\"chunk_present\"]\n", + ) + .unwrap(); + + let qs = load_golden_set_validated(&yaml_path, &config).unwrap(); + assert_eq!(qs.len(), 1); + } + + fn seed_one_chunk(store: &SqliteStore, doc_id: &str, chunk_id: &str) { + let conn = store.read_conn(); + let asset_id = format!("a_{doc_id}"); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '\"markdown\"', 0, + 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', ?, '1970-01-01T00:00:00Z')", + params![asset_id, "file:///tmp/x.md", "x.md", "x.md"], + ) + .unwrap(); + conn.execute( + "INSERT OR IGNORE INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1, + '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, "x.md"], + ) + .unwrap(); + conn.execute( + "INSERT OR IGNORE INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, + '[{\"kind\":\"line\",\"start\":1,\"end\":3}]', + 1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id], + ) + .unwrap(); + } +} diff --git a/crates/kb-eval/src/runner.rs b/crates/kb-eval/src/runner.rs new file mode 100644 index 0000000..5ebd340 --- /dev/null +++ b/crates/kb-eval/src/runner.rs @@ -0,0 +1,330 @@ +//! Per-query eval runner. See [`run_eval`] / [`run_eval_with_config`]. + +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use kb_core::{SearchFilters, SearchQuery}; +use kb_store_sqlite::{EvalRunRow, SqliteStore}; +use time::OffsetDateTime; + +use crate::loader::{load_golden_set, validate_against_db}; +use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult}; + +/// Env var that overrides the default `fixtures/golden_queries.yaml` +/// path. Resolved relative to the current working directory. +const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN"; + +/// Default golden YAML path (relative to CWD when set). +const DEFAULT_GOLDEN_PATH: &str = "fixtures/golden_queries.yaml"; + +/// Run the golden suite end-to-end against the active XDG-loaded +/// [`kb_config::Config`]. Wraps [`run_eval_with_config`] with +/// `Config::load(None)`. +pub fn run_eval(opts: &EvalRunOpts) -> Result { + let cfg = kb_config::Config::load(None).context("load Config for run_eval")?; + run_eval_with_config(&cfg, opts) +} + +/// Run the golden suite end-to-end against an explicit +/// [`kb_config::Config`]. Used by integration tests (TempDir-backed +/// data_dir) and any future caller that wants to drive the runner +/// against a non-default config. +pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Result { + let started = std::time::Instant::now(); + + // ── 1. Load golden set ──────────────────────────────────────────────── + let golden_path = resolve_golden_path(); + tracing::debug!( + target: "kb-eval", + path = %golden_path.display(), + "kb-eval: loading golden set" + ); + let queries = load_golden_set(&golden_path).with_context(|| { + format!( + "load golden set from {} (override via KB_EVAL_GOLDEN)", + golden_path.display() + ) + })?; + validate_against_db(&queries, cfg)?; + + // ── 2. Mint identifiers + open store ────────────────────────────────── + let run_id = mint_run_id(); + let created_at = OffsetDateTime::now_utc(); + let commit_hash = std::env::var("KB_COMMIT_HASH") + .ok() + .filter(|s| !s.is_empty()); + + // Open the store once so every per-query write reuses the same + // connection-mutex lifetime. + let store = SqliteStore::open(cfg).context("open SqliteStore for run_eval")?; + store + .run_migrations() + .context("run migrations for run_eval")?; + + // ── 3. Build config_snapshot_json ───────────────────────────────────── + let config_snapshot_json = build_config_snapshot(cfg)?; + let config_snapshot_text = + serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?; + + // ── 4. Per-query execution ──────────────────────────────────────────── + let mut per_query: Vec = Vec::with_capacity(queries.len()); + for gq in &queries { + let qr = execute_query(cfg, gq, opts); + per_query.push(qr); + } + + // ── 5. Persist eval_runs + eval_query_results ──────────────────────── + // Serialize per-query JSON up front so the SQLite transaction below + // never holds the connection mutex through serde failures. + let mut results: Vec<(String, String)> = Vec::with_capacity(per_query.len()); + for qr in &per_query { + let json = serde_json::to_string(qr) + .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?; + results.push((qr.query_id.clone(), json)); + } + let row = EvalRunRow { + run_id: &run_id, + suite: opts.suite.as_str(), + config_snapshot_json: &config_snapshot_text, + aggregate_json: "{}", + commit_hash: commit_hash.as_deref(), + created_at, + }; + store + .record_eval_run_with_results(&row, &results) + .context("record eval_runs + eval_query_results (transactional)")?; + + // ── 6. Mirror to runs_dir//per_query.jsonl ──────────────────── + write_per_query_jsonl(cfg, &run_id, &per_query)?; + + let duration_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32; + tracing::info!( + target: "kb-eval", + run_id = %run_id, + suite = %opts.suite, + queries = per_query.len(), + duration_ms, + "kb-eval: run complete" + ); + + Ok(EvalRun { + run_id, + created_at, + commit_hash, + config_snapshot_json, + per_query, + }) +} + +/// Mint a `run_` identifier. UUIDv7 stands in for ULID — same +/// timestamp-ordered monotonicity, already in workspace deps. Lower- +/// case simple form to match the `ulid_lower()` shape the spec asks +/// for. +fn mint_run_id() -> String { + let id = uuid::Uuid::now_v7().simple().to_string(); + format!("run_{id}") +} + +/// Resolve the golden YAML path. Honors the `KB_EVAL_GOLDEN` env +/// override; otherwise relative to CWD. The path is NOT expanded for +/// `~` / `${...}` placeholders — direct file paths only. +fn resolve_golden_path() -> PathBuf { + match std::env::var(KB_EVAL_GOLDEN) { + Ok(s) if !s.is_empty() => PathBuf::from(s), + _ => PathBuf::from(DEFAULT_GOLDEN_PATH), + } +} + +/// Run one [`GoldenQuery`] through the kb-app facade. Errors are +/// captured into `QueryResult.error` so the run continues. +fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult { + let started = std::time::Instant::now(); + + let search_query = SearchQuery { + text: gq.query.clone(), + mode: opts.mode, + k: opts.k, + filters: SearchFilters::default(), + }; + + let (hits_top_k, mut error) = match kb_app::search_with_config(cfg.clone(), search_query) { + Ok(hits) => (hits, None), + Err(e) => (Vec::new(), Some(format!("{e:#}"))), + }; + + // Optional RAG path: only attempted when `with_rag` and the search + // call did not already error out (we want one error per query, not + // a duplicated one). + let answer = if opts.with_rag && error.is_none() { + let ask_opts = kb_app::AskOpts { + k: opts.k, + explain: true, + mode: opts.mode, + temperature: opts.temperature, + seed: opts.seed, + stream_sink: None, + }; + match kb_app::ask_with_config(cfg.clone(), &gq.query, ask_opts) { + Ok(ans) => Some(ans), + Err(e) => { + error = Some(format!("{e:#}")); + None + } + } + } else { + None + }; + + let elapsed_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32; + + QueryResult { + query_id: gq.id.clone(), + query: gq.query.clone(), + mode: opts.mode, + hits_top_k, + answer, + elapsed_ms, + error, + } +} + +/// Build the `config_snapshot_json` value: full Config as `config` plus +/// the auxiliary version fields the spec calls out. +/// +/// `index_version` is intentionally `None` here — it is composed +/// dynamically by `kb-app` on a per-call basis from the configured +/// embedder (e.g., `vec:@:`), so it is not a +/// stable run-time property of the config alone. P5-2 may compose it +/// from `embedding.{model,version,dimensions}` if it needs the field +/// for compare reports. +fn build_config_snapshot(cfg: &kb_config::Config) -> Result { + let cfg_value = serde_json::to_value(cfg).context("serialize Config")?; + Ok(serde_json::json!({ + "config": cfg_value, + "chunker_version": cfg.chunking.chunker_version, + "embedding": { + "model": cfg.models.embedding.model, + "version": cfg.models.embedding.version, + "dimensions": cfg.models.embedding.dimensions, + "provider": cfg.models.embedding.provider, + }, + "llm": { + "model_id": cfg.models.llm.model, + "provider": cfg.models.llm.provider, + }, + "prompt_template_version": cfg.rag.prompt_template_version, + "score_gate": cfg.rag.score_gate, + "rrf_k": cfg.search.rrf_k, + "index_version": serde_json::Value::Null, + })) +} + +/// Write the `runs_dir//per_query.jsonl` mirror (design §6.3). +/// Each `QueryResult` is one line, separator `\n`. The directory is +/// created if it doesn't exist; an existing file is overwritten (a +/// `run_id` collision would already have failed the `eval_runs` +/// PRIMARY KEY upstream). +fn write_per_query_jsonl( + cfg: &kb_config::Config, + run_id: &str, + per_query: &[QueryResult], +) -> Result<()> { + let runs_dir = expand_path(&cfg.storage.runs_dir, &cfg.storage.data_dir); + let run_dir = runs_dir.join(run_id); + std::fs::create_dir_all(&run_dir) + .with_context(|| format!("create run dir {}", run_dir.display()))?; + let path = run_dir.join("per_query.jsonl"); + let file = File::create(&path) + .with_context(|| format!("create per_query.jsonl at {}", path.display()))?; + let mut w = BufWriter::new(file); + for qr in per_query { + serde_json::to_writer(&mut w, qr) + .with_context(|| format!("serialize QueryResult for {}", qr.query_id))?; + w.write_all(b"\n") + .context("write newline separator in per_query.jsonl")?; + } + w.flush().context("flush per_query.jsonl")?; + Ok(()) +} + +/// Expand `{data_dir}` / `${XDG_DATA_HOME:-…}` / leading `~`. Mirror +/// of `kb-store-vector::paths::expand_path` and +/// `kb-store-sqlite::expand_data_dir` — kept private here because +/// `kb-config` does not (yet) expose a shared resolver helper. +fn expand_path(raw: &str, data_dir: &str) -> PathBuf { + let mut s = raw.to_string(); + + // First, resolve `data_dir` itself so any `{data_dir}` substitution + // points at an already-expanded base path. `data_dir` may contain + // `${XDG_DATA_HOME:-…}` and `~`; resolve them once and re-use the + // result. + let resolved_data_dir = expand_xdg_and_tilde(data_dir); + s = s.replace("{data_dir}", &resolved_data_dir); + + expand_xdg_and_tilde_path(&s) +} + +fn expand_xdg_and_tilde(raw: &str) -> String { + let s = expand_xdg(raw); + expand_tilde_str(&s) +} + +fn expand_xdg_and_tilde_path(raw: &str) -> PathBuf { + let s = expand_xdg_and_tilde(raw); + PathBuf::from(s) +} + +fn expand_xdg(raw: &str) -> String { + let mut s = raw.to_string(); + if let Some(start) = s.find("${XDG_DATA_HOME") { + if let Some(rel_end) = s[start..].find('}') { + let end = start + rel_end + 1; + let inner = &s[start + 2..end - 1]; + let replacement = match std::env::var("XDG_DATA_HOME") { + Ok(v) if !v.is_empty() => v, + _ => match inner.split_once(":-") { + Some((_, default)) => default.to_string(), + None => String::new(), + }, + }; + s.replace_range(start..end, &replacement); + } + } + s +} + +fn expand_tilde_str(raw: &str) -> String { + if let Some(rest) = raw.strip_prefix("~/") { + if let Some(home) = std::env::var_os("HOME") { + let mut p = PathBuf::from(home); + p.push(rest); + return p.to_string_lossy().into_owned(); + } + } + if raw == "~" { + if let Some(home) = std::env::var_os("HOME") { + return PathBuf::from(home).to_string_lossy().into_owned(); + } + } + raw.to_string() +} + +#[cfg(test)] +mod expand_tests { + use super::*; + use std::path::Path; + + #[test] + fn expand_substitutes_data_dir() { + let p = expand_path("{data_dir}/runs", "/tmp/kbtest"); + assert_eq!(p, Path::new("/tmp/kbtest/runs")); + } + + #[test] + fn expand_passthrough_absolute() { + let p = expand_path("/abs/runs", "/ignored"); + assert_eq!(p, Path::new("/abs/runs")); + } +} diff --git a/crates/kb-eval/src/types.rs b/crates/kb-eval/src/types.rs new file mode 100644 index 0000000..6165770 --- /dev/null +++ b/crates/kb-eval/src/types.rs @@ -0,0 +1,87 @@ +//! Public domain types for the eval runner (signatures pinned by +//! `tasks/p5/p5-1-golden-fixture-runner.md` "Public surface"). + +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use kb_core::{Answer, ChunkId, DocumentId, Lang, SearchHit, SearchMode}; + +/// One golden query loaded from `fixtures/golden_queries.yaml`. +/// +/// Required fields: `id`, `query`. Everything else defaults to +/// empty / `None` per the loader contract. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct GoldenQuery { + pub id: String, + pub query: String, + #[serde(default = "default_lang")] + pub lang: Lang, + #[serde(default)] + pub expected_doc_ids: Vec, + #[serde(default)] + pub expected_chunk_ids: Vec, + #[serde(default)] + pub must_contain: Vec, + #[serde(default)] + pub forbidden: Vec, + #[serde(default)] + pub difficulty: Option, +} + +fn default_lang() -> Lang { + // `Lang` is a BCP-47 string newtype (§3.3); the empty string is + // the safe default for golden entries that omit `lang`. Curators + // may fill it in later; the runner does not branch on this field. + Lang(String::new()) +} + +/// Caller-supplied knobs for one [`crate::run_eval`] invocation. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct EvalRunOpts { + /// Suite label persisted into `eval_runs.suite`. The shipped + /// fixture is `"golden"`; other suites can reuse the same runner. + pub suite: String, + /// Retrieval mode forwarded to every `kb_app::search` / + /// `kb_app::ask` call inside the run. + pub mode: SearchMode, + /// When `true`, also call `kb_app::ask` per query and record the + /// resulting `Answer` on the `QueryResult`. + pub with_rag: bool, + /// Top-k forwarded to retrieval (and `AskOpts.k` when `with_rag`). + pub k: usize, + /// Override `config.models.llm.temperature` when `with_rag`. + /// Determinism contract requires `Some(0.0)` + a fixed `seed`. + pub temperature: Option, + /// Override `config.models.llm.seed` when `with_rag`. + pub seed: Option, +} + +/// One full eval run. Persisted to `eval_runs` + `eval_query_results` +/// (design §5.7) and mirrored to `runs_dir//per_query.jsonl` +/// (design §6.3). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct EvalRun { + pub run_id: String, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + pub commit_hash: Option, + /// Snapshot of the `Config` plus auxiliary version fields + /// (`chunker_version`, embedding/llm/prompt versions, fusion + /// params, `index_version`). See [`crate::run_eval`] for the + /// exact shape. + pub config_snapshot_json: serde_json::Value, + pub per_query: Vec, +} + +/// One per-query record. Every row in `eval_query_results` has its +/// `result_json` filled with `serde_json::to_string(&QueryResult)`. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct QueryResult { + pub query_id: String, + pub query: String, + pub mode: SearchMode, + pub hits_top_k: Vec, + pub answer: Option, + pub elapsed_ms: u32, + pub error: Option, +} diff --git a/crates/kb-eval/tests/fixtures/eval/run-1.json b/crates/kb-eval/tests/fixtures/eval/run-1.json new file mode 100644 index 0000000..0f6e93d --- /dev/null +++ b/crates/kb-eval/tests/fixtures/eval/run-1.json @@ -0,0 +1,30 @@ +[ + { + "error": null, + "first_hit": { + "chunk_id": "chunk000000000000000000000000000000", + "doc_id": "doc00000000000000000000000000000000", + "heading_path": [], + "score": 0.3429983854293823 + }, + "has_answer": false, + "hits_count": 1, + "mode": "lexical", + "query": "ownership", + "query_id": "q1" + }, + { + "error": null, + "first_hit": { + "chunk_id": "chunk000000000000000000000000000002", + "doc_id": "doc00000000000000000000000000000002", + "heading_path": [], + "score": 0.3585492968559265 + }, + "has_answer": false, + "hits_count": 1, + "mode": "lexical", + "query": "heading", + "query_id": "q2" + } +] \ No newline at end of file diff --git a/crates/kb-eval/tests/loader.rs b/crates/kb-eval/tests/loader.rs new file mode 100644 index 0000000..115dacb --- /dev/null +++ b/crates/kb-eval/tests/loader.rs @@ -0,0 +1,59 @@ +//! Loader tests for the golden-fixture YAML parser (P5-1). +//! +//! These tests exercise pure parsing and duplicate-id detection. The +//! DB-validation tests for the crate-private +//! `load_golden_set_validated` live next to the function in +//! `src/loader.rs` (they need `pub(crate)` visibility, which integration +//! tests can't see). + +use std::fs; + +use kb_eval::load_golden_set; +use tempfile::tempdir; + +// ── 1. parser accepts well-formed YAML with optional fields ────────────────── + +#[test] +fn loads_minimal_well_formed_yaml() { + let tmp = tempdir().unwrap(); + let yaml_path = tmp.path().join("golden.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: hello\n- id: g2\n query: \"another\"\n lang: en\n must_contain: [\"foo\"]\n forbidden: [\"bar\"]\n difficulty: easy\n", + ) + .unwrap(); + + let qs = load_golden_set(&yaml_path).unwrap(); + assert_eq!(qs.len(), 2); + assert_eq!(qs[0].id, "g1"); + assert_eq!(qs[0].query, "hello"); + assert!(qs[0].must_contain.is_empty()); + assert!(qs[0].forbidden.is_empty()); + assert!(qs[0].difficulty.is_none()); + + assert_eq!(qs[1].id, "g2"); + assert_eq!(qs[1].lang.0, "en"); + assert_eq!(qs[1].must_contain, vec!["foo".to_string()]); + assert_eq!(qs[1].forbidden, vec!["bar".to_string()]); + assert_eq!(qs[1].difficulty.as_deref(), Some("easy")); +} + +// ── 2. duplicate IDs error lists every offender (sorted, deduplicated) ─────── + +#[test] +fn rejects_duplicate_ids() { + let tmp = tempdir().unwrap(); + let yaml_path = tmp.path().join("dup.yaml"); + fs::write( + &yaml_path, + "- id: g1\n query: a\n- id: g2\n query: b\n- id: g1\n query: c\n- id: g2\n query: d\n- id: g1\n query: e\n", + ) + .unwrap(); + + let err = load_golden_set(&yaml_path).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("duplicate query id"), "msg: {msg}"); + // Both dup IDs should appear, sorted (BTreeSet) and deduplicated. + assert!(msg.contains("g1"), "msg: {msg}"); + assert!(msg.contains("g2"), "msg: {msg}"); +} diff --git a/crates/kb-eval/tests/runner.rs b/crates/kb-eval/tests/runner.rs new file mode 100644 index 0000000..eba63af --- /dev/null +++ b/crates/kb-eval/tests/runner.rs @@ -0,0 +1,403 @@ +//! Runner integration tests for `kb-eval` (P5-1). +//! +//! Drives [`kb_eval::run_eval_with_config`] end-to-end against a +//! TempDir-backed config: +//! +//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the +//! workspace's source-of-truth, +//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is +//! required (`models.embedding.provider = "none"`), +//! - golden YAML pointed at via `KB_EVAL_GOLDEN`. +//! +//! Determinism: lexical-only with a fixed seed corpus produces +//! byte-identical `per_query.jsonl` content (modulo `run_id` / +//! `created_at`, which we strip when comparing). + +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; + +use kb_config::Config; +use kb_core::SearchMode; +use kb_eval::{EvalRunOpts, QueryResult, run_eval_with_config}; +use kb_store_sqlite::SqliteStore; +use rusqlite::params; +use tempfile::TempDir; + +/// `KB_EVAL_GOLDEN` is process-global state. Tests touching it must +/// serialize so they don't trample each other when `cargo test` +/// runs them in parallel. +static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(()); + +// ── shared scaffolding ─────────────────────────────────────────────────────── + +struct RunEnv { + temp: TempDir, + config: Config, +} + +impl RunEnv { + fn new() -> Self { + let temp = tempfile::tempdir().unwrap(); + let mut config = Config::defaults(); + config.storage.data_dir = temp.path().to_string_lossy().into_owned(); + // Force lexical-only behavior so the runner never tries to + // load fastembed during integration tests. + config.models.embedding.provider = "none".to_string(); + config.models.embedding.dimensions = 0; + // Pin search defaults so test asserts are stable. + config.search.default_k = 5; + + let store = SqliteStore::open(&config).unwrap(); + store.run_migrations().unwrap(); + seed_corpus(&store); + Self { temp, config } + } + + fn data_dir(&self) -> PathBuf { + self.temp.path().to_path_buf() + } +} + +/// Seed three (asset, document, chunk) triples with text the test +/// queries can match against the FTS5 lexical index. +fn seed_corpus(store: &SqliteStore) { + let conn = store.read_conn(); + for (i, text) in [ + "Rust ownership and borrow checker basics.", + "Cargo workspace members are listed in workspace.members.", + "Markdown chunking respects heading boundaries.", + ] + .iter() + .enumerate() + { + let doc_id = format!("doc{i:032}"); + let chunk_id = format!("chunk{i:030}"); + let asset_id = format!("asset{i:030}"); + let path = format!("notes/{i}.md"); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '\"markdown\"', 0, + 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', ?, '1970-01-01T00:00:00Z')", + params![asset_id, format!("file:///{path}"), path, path], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1, + '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, path], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, ?, '[]', NULL, + '[{\"kind\":\"line\",\"start\":1,\"end\":3}]', + 1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id, text], + ) + .unwrap(); + } + // Build the FTS index so lexical search returns hits. Reuses the + // same connection guard rather than reopening — the SAVEPOINT + // protocol nests correctly under the existing read_conn lock. + kb_store_sqlite::rebuild_chunks_fts(&conn).unwrap(); + drop(conn); +} + +fn write_golden(dir: &Path, body: &str) -> PathBuf { + let path = dir.join("golden.yaml"); + fs::write(&path, body).unwrap(); + path +} + +fn lexical_opts() -> EvalRunOpts { + EvalRunOpts { + suite: "test".to_string(), + mode: SearchMode::Lexical, + with_rag: false, + k: 5, + temperature: Some(0.0), + seed: Some(0), + } +} + +/// Run the eval after pointing `KB_EVAL_GOLDEN` at `yaml`. The env +/// guard must outlive the call so concurrent tests don't reset the +/// var mid-run. +fn run_with_golden R, R>(yaml: &Path, f: F) -> R { + let _g = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + // SAFETY: `KB_EVAL_GOLDEN` is a benign env var; the GOLDEN_ENV_LOCK + // serializes mutations so concurrent tests don't race. + unsafe { + std::env::set_var("KB_EVAL_GOLDEN", yaml); + } + let out = f(); + unsafe { + std::env::remove_var("KB_EVAL_GOLDEN"); + } + out +} + +// ── 1. elapsed_ms recorded for every query ────────────────────────────────── + +#[test] +fn runner_records_elapsed_for_every_query() { + let env = RunEnv::new(); + let yaml = write_golden( + env.data_dir().as_path(), + "- id: q1\n query: ownership\n- id: q2\n query: heading\n- id: q3\n query: workspace\n", + ); + + let run = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + assert_eq!(run.per_query.len(), 3); + for qr in &run.per_query { + assert_eq!(qr.mode, SearchMode::Lexical); + // `elapsed_ms` is `u32`; the assertion that it's a valid + // unsigned value is implicit. We additionally bound it well + // below the 4G ceiling to detect a stuck/overflow path. + assert!( + qr.elapsed_ms < 60_000, + "elapsed_ms suspicious: {}", + qr.elapsed_ms + ); + } + // The id-list round-trips into the per-query records. + let ids: Vec<&str> = run.per_query.iter().map(|q| q.query_id.as_str()).collect(); + assert_eq!(ids, vec!["q1", "q2", "q3"]); +} + +// ── 2. config snapshot carries the documented version fields ──────────────── + +#[test] +fn runner_records_config_snapshot_with_versions() { + let env = RunEnv::new(); + let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n query: ownership\n"); + + let run = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + let snap = &run.config_snapshot_json; + assert!(snap.get("config").is_some(), "config field missing"); + assert_eq!( + snap.pointer("/chunker_version"), + Some(&serde_json::Value::String("md-heading-v1".to_string())), + ); + assert!(snap.pointer("/embedding/model").is_some()); + assert!(snap.pointer("/embedding/dimensions").is_some()); + assert!(snap.pointer("/llm/model_id").is_some()); + assert_eq!( + snap.pointer("/prompt_template_version"), + Some(&serde_json::Value::String("rag-v1".to_string())), + ); + assert!(snap.pointer("/score_gate").is_some()); + assert!(snap.pointer("/rrf_k").is_some()); +} + +// ── 3. failing query (ask path with no Ollama) records an error ───────────── + +#[test] +fn runner_captures_per_query_error_when_rag_unreachable() { + let env = RunEnv::new(); + // Point Ollama at a guaranteed-dead port so `ask_with_config` + // surfaces a connection error per query. + let mut config = env.config.clone(); + config.models.llm.endpoint = "http://127.0.0.1:1".to_string(); + + let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n query: ownership\n"); + + let opts = EvalRunOpts { + with_rag: true, + ..lexical_opts() + }; + let run = run_with_golden(&yaml, || run_eval_with_config(&config, &opts).unwrap()); + + let qr = &run.per_query[0]; + // hits_top_k still populated by lexical search before the RAG attempt. + assert!( + !qr.hits_top_k.is_empty(), + "lexical hits should populate before RAG attempt" + ); + assert!(qr.answer.is_none(), "no answer when RAG fails"); + assert!(qr.error.is_some(), "error must be recorded"); +} + +// ── 4. eval_runs + eval_query_results rows persisted ──────────────────────── + +#[test] +fn runner_persists_eval_run_and_query_result_rows() { + let env = RunEnv::new(); + let yaml = write_golden( + env.data_dir().as_path(), + "- id: q1\n query: ownership\n- id: q2\n query: heading\n", + ); + + let run = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + // Reopen the same SQLite file with a new store handle and read + // the rows back. We use the inherent `read_conn` helper rather + // than rusqlite directly because the latter would require kb-eval + // to add a runtime rusqlite dep (forbidden by the spec). + let store = SqliteStore::open(&env.config).unwrap(); + let conn = store.read_conn(); + + let n_runs: i64 = conn + .query_row( + "SELECT COUNT(*) FROM eval_runs WHERE run_id = ?", + params![run.run_id], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(n_runs, 1); + + let n_results: i64 = conn + .query_row( + "SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?", + params![run.run_id], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(n_results, 2); +} + +// ── 5. per_query.jsonl mirror exists and round-trips ──────────────────────── + +#[test] +fn runner_writes_per_query_jsonl_mirror() { + let env = RunEnv::new(); + let yaml = write_golden( + env.data_dir().as_path(), + "- id: q1\n query: ownership\n- id: q2\n query: heading\n", + ); + + let run = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + let mirror = env + .data_dir() + .join("runs") + .join(&run.run_id) + .join("per_query.jsonl"); + assert!( + mirror.exists(), + "per_query.jsonl missing at {}", + mirror.display() + ); + let body = fs::read_to_string(&mirror).unwrap(); + let lines: Vec<&str> = body.lines().collect(); + assert_eq!(lines.len(), 2); + let parsed: Vec = lines + .iter() + .map(|l| serde_json::from_str::(l).expect("valid JSONL line")) + .collect(); + assert_eq!(parsed[0].query_id, "q1"); + assert_eq!(parsed[1].query_id, "q2"); +} + +// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─ + +#[test] +fn runner_lexical_is_deterministic_per_query_payload() { + let env = RunEnv::new(); + let yaml = write_golden( + env.data_dir().as_path(), + "- id: q1\n query: ownership\n- id: q2\n query: heading\n", + ); + + let run_a = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + let run_b = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + // Run-level fields (`run_id`, `created_at`) intentionally diverge; + // the per-query payload (which is what the snapshot fixture pins) + // must be byte-identical. + let a_json = serde_json::to_string(&run_a.per_query).unwrap(); + let b_json = serde_json::to_string(&run_b.per_query).unwrap(); + assert_eq!( + a_json, b_json, + "lexical-only per_query payload must be byte-identical across runs" + ); +} + +// ── 7. snapshot — per_query JSON pinned to fixtures/eval/run-1.json ───────── + +#[test] +fn runner_per_query_snapshot_matches_fixture() { + let env = RunEnv::new(); + let yaml = write_golden( + env.data_dir().as_path(), + "- id: q1\n query: ownership\n- id: q2\n query: heading\n", + ); + + let run = run_with_golden(&yaml, || { + run_eval_with_config(&env.config, &lexical_opts()).unwrap() + }); + + // Fixture pins the *shape* of the per-query payload, including the + // first hit's stable scalar fields (chunk_id, doc_id, heading_path, + // fusion_score). FTS scores depend on the SQLite version, so the + // fusion_score is captured into the fixture from one passing run + // and must remain stable across re-runs against the same seeded + // corpus. Timing-sensitive fields (`elapsed_ms`, raw `Instant` + // byproducts) are excluded. Verifying byte stability is the + // determinism test (#6); this test verifies the field set + + // ordering is stable. + let projection: Vec<_> = run + .per_query + .iter() + .map(|qr| { + let first_hit = qr.hits_top_k.first().map(|h| { + serde_json::json!({ + "chunk_id": h.chunk_id, + "doc_id": h.doc_id, + "heading_path": h.heading_path, + "score": h.retrieval.fusion_score, + }) + }); + serde_json::json!({ + "query_id": qr.query_id, + "query": qr.query, + "mode": qr.mode, + "hits_count": qr.hits_top_k.len(), + "first_hit": first_hit, + "has_answer": qr.answer.is_some(), + "error": qr.error, + }) + }) + .collect(); + let actual = serde_json::to_string_pretty(&projection).unwrap(); + + let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval/run-1.json"); + + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + fs::create_dir_all(fixture_path.parent().unwrap()).unwrap(); + fs::write(&fixture_path, &actual).unwrap(); + } + + let expected = fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("read snapshot {}: {e}", fixture_path.display())); + assert_eq!( + actual.trim(), + expected.trim(), + "snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh" + ); +} diff --git a/crates/kb-store-sqlite/src/eval.rs b/crates/kb-store-sqlite/src/eval.rs new file mode 100644 index 0000000..d0bb272 --- /dev/null +++ b/crates/kb-store-sqlite/src/eval.rs @@ -0,0 +1,161 @@ +//! `eval_runs` / `eval_query_results` row writers (P5-1 — design §5.7). +//! +//! `kb-eval` calls these directly via the inherent methods on +//! [`SqliteStore`]. The pattern mirrors [`crate::answers`]: the trait +//! `kb_core::DocumentStore` is the document surface, and run-level +//! audit rows (jobs, ingest_runs, answers, eval_runs) are inherent +//! methods so the trait surface stays small. + +use anyhow::{Context, Result}; +use rusqlite::params; +use time::OffsetDateTime; + +use crate::error::StoreError; +use crate::store::SqliteStore; + +/// One row about to land in `eval_runs` (per V001 schema). +/// +/// `aggregate_json` is filled by P5-1 with the literal `"{}"` — +/// metric computation lives in P5-2 and updates the row in place. +#[derive(Clone, Debug)] +pub struct EvalRunRow<'a> { + pub run_id: &'a str, + pub suite: &'a str, + pub config_snapshot_json: &'a str, + pub aggregate_json: &'a str, + pub commit_hash: Option<&'a str>, + pub created_at: OffsetDateTime, +} + +impl SqliteStore { + /// Return `true` iff a row with `doc_id = ?` exists in + /// `documents`. Lightweight existence probe used by + /// `kb-eval`'s golden-fixture validator — full + /// `DocumentStore::get_document` deserializes blocks + metadata + /// JSON, which is overkill for "does this ID exist?" + pub fn document_exists(&self, doc_id: &str) -> Result { + let conn = self.lock_conn(); + let row: Result = conn.query_row( + "SELECT 1 FROM documents WHERE doc_id = ? LIMIT 1", + params![doc_id], + |r| r.get(0), + ); + match row { + Ok(_) => Ok(true), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false), + Err(e) => Err(StoreError::from(e).into()), + } + } + + /// Same shape as [`Self::document_exists`] but probes the + /// `chunks` table by `chunk_id`. + pub fn chunk_exists(&self, chunk_id: &str) -> Result { + let conn = self.lock_conn(); + let row: Result = conn.query_row( + "SELECT 1 FROM chunks WHERE chunk_id = ? LIMIT 1", + params![chunk_id], + |r| r.get(0), + ); + match row { + Ok(_) => Ok(true), + Err(rusqlite::Error::QueryReturnedNoRows) => Ok(false), + Err(e) => Err(StoreError::from(e).into()), + } + } + + /// Insert one row into `eval_runs`. Mirrors the schema in + /// `migrations/V001__init.sql` (§5.7). Called by + /// `kb-eval::run_eval` once per run, after every per-query result + /// row has been written. + pub fn record_eval_run(&self, row: &EvalRunRow<'_>) -> Result<()> { + let created_at = row + .created_at + .format(&time::format_description::well_known::Rfc3339) + .context("format eval_runs.created_at")?; + let conn = self.lock_conn(); + conn.execute( + "INSERT INTO eval_runs ( + run_id, suite, config_snapshot_json, aggregate_json, + commit_hash, created_at + ) VALUES (?, ?, ?, ?, ?, ?)", + params![ + row.run_id, + row.suite, + row.config_snapshot_json, + row.aggregate_json, + row.commit_hash, + created_at, + ], + ) + .map_err(StoreError::from)?; + Ok(()) + } + + /// Insert one row into `eval_query_results`. PRIMARY KEY is + /// `(run_id, query_id)` so writing the same `(run, query)` twice + /// surfaces a `UNIQUE` violation through `StoreError`. + pub fn record_eval_query_result( + &self, + run_id: &str, + query_id: &str, + result_json: &str, + ) -> Result<()> { + let conn = self.lock_conn(); + conn.execute( + "INSERT INTO eval_query_results (run_id, query_id, result_json) + VALUES (?, ?, ?)", + params![run_id, query_id, result_json], + ) + .map_err(StoreError::from)?; + Ok(()) + } + + /// Insert the `eval_runs` row plus every `eval_query_results` row + /// for the same run inside a single SQLite transaction. This is the + /// preferred path for `kb-eval::run_eval` — a panic between the run + /// row and the per-query rows can't leave orphan run rows. + /// + /// `results` is a slice of `(query_id, result_json)` tuples mirroring + /// the per-call `record_eval_query_result` arguments. + pub fn record_eval_run_with_results( + &self, + row: &EvalRunRow<'_>, + results: &[(String, String)], + ) -> Result<()> { + let created_at = row + .created_at + .format(&time::format_description::well_known::Rfc3339) + .context("format eval_runs.created_at")?; + let mut conn = self.lock_conn(); + let tx = conn.transaction().map_err(StoreError::from)?; + tx.execute( + "INSERT INTO eval_runs ( + run_id, suite, config_snapshot_json, aggregate_json, + commit_hash, created_at + ) VALUES (?, ?, ?, ?, ?, ?)", + params![ + row.run_id, + row.suite, + row.config_snapshot_json, + row.aggregate_json, + row.commit_hash, + created_at, + ], + ) + .map_err(StoreError::from)?; + { + let mut stmt = tx + .prepare( + "INSERT INTO eval_query_results (run_id, query_id, result_json) + VALUES (?, ?, ?)", + ) + .map_err(StoreError::from)?; + for (query_id, result_json) in results { + stmt.execute(params![row.run_id, query_id, result_json]) + .map_err(StoreError::from)?; + } + } + tx.commit().map_err(StoreError::from)?; + Ok(()) + } +} diff --git a/crates/kb-store-sqlite/src/lib.rs b/crates/kb-store-sqlite/src/lib.rs index 241bedc..09e2b20 100644 --- a/crates/kb-store-sqlite/src/lib.rs +++ b/crates/kb-store-sqlite/src/lib.rs @@ -21,6 +21,7 @@ mod answers; mod documents; mod embeddings; mod error; +mod eval; mod filters; mod fts; mod jobs; @@ -29,6 +30,7 @@ mod store; pub use embeddings::EmbeddingRecordRow; pub use error::StoreError; +pub use eval::EvalRunRow; pub use fts::rebuild_chunks_fts; pub use jobs::IngestRunRow; pub use store::SqliteStore; diff --git a/fixtures/golden_queries.yaml b/fixtures/golden_queries.yaml new file mode 100644 index 0000000..eecf629 --- /dev/null +++ b/fixtures/golden_queries.yaml @@ -0,0 +1,45 @@ +# Golden query suite for `kb eval run` (P5-1 / P5-2). +# +# Top-level: list of queries. Required fields: `id`, `query`. All +# others are optional and default to empty / null. +# +# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to +# real rows in the active workspace's SQLite store at run time. Stale +# references make the runner bail at start. The shipped template +# leaves them empty so the file is loadable on any fresh workspace — +# fill them in after a `kb ingest` to enable hit@k / MRR metrics +# (P5-2). +# +# `must_contain` / `forbidden` drive the rule-based groundedness +# metric (P5-2). + +- id: g001 + query: "Cargo workspace 멤버 추가하는 법" + lang: ko + must_contain: ["[workspace]", "members"] + difficulty: easy + +- id: g002 + query: "What is Rust ownership?" + lang: en + must_contain: ["borrow", "lifetime"] + difficulty: easy + +- id: g003 + query: "Markdown chunking 규칙은?" + lang: ko + must_contain: ["heading"] + forbidden: ["embedding"] + difficulty: medium + +- id: g004 + query: "How does FTS5 tokenization work for Korean text?" + lang: en + must_contain: ["unicode61", "tokenizer"] + difficulty: medium + +- id: g005 + query: "RAG citation 검증은 어떻게 동작?" + lang: ko + must_contain: ["citation", "marker"] + difficulty: hard diff --git a/tasks/p5/p5-1-golden-fixture-runner.md b/tasks/p5/p5-1-golden-fixture-runner.md index ec44acb..5cc7e47 100644 --- a/tasks/p5/p5-1-golden-fixture-runner.md +++ b/tasks/p5/p5-1-golden-fixture-runner.md @@ -3,7 +3,7 @@ phase: P5 component: kb-eval (runner) task_id: p5-1 title: "Golden query fixture loader + per-query runner" -status: planned +status: completed depends_on: [p4-3] unblocks: [p5-2] contract_source: ../../docs/superpowers/specs/2026-04-27-kb-final-form-design.md @@ -149,6 +149,6 @@ All tests under `cargo test -p kb-eval runner`. ## Risks / notes -- Large RAG suites can be slow. Consider `--max-queries` for incremental runs (kept here as a flag spec; implementation is the responsibility of this task). +- Large RAG suites can be slow. `--max-queries` flag is deferred to P5-2 / a follow-up. Rationale: (a) the runner currently runs all queries unconditionally; (b) without metrics aggregation it adds little incremental value; (c) trivial to add as a `Vec::truncate` once the eval CLI subcommand exists. - `expected_chunk_id` references depend on `chunker_version`. If chunker bumps, golden set must be re-curated. Fail fast in the loader. - Use `time::OffsetDateTime::now_utc()` for `created_at`; never local TZ. From e6ff9c412c5ce351c7257ef4460a0633168cdc11 Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 1 May 2026 18:55:23 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix(p5-1):=20apply=20deferred=20review=20it?= =?UTF-8?q?ems=20=E2=80=94=20App=20reuse=20+=20expand=5Fpath=20hoist=20+?= =?UTF-8?q?=20nits?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - kb-app: promote App to pub, add open_with_config / search / ask methods so kb-eval (and future TUI) can amortize embedder + vector store + LLM cold-start across many queries on one App instance. Memoization is per-instance via OnceLock; *_with_config free functions delegate. - kb-config: add canonical expand_path helper + 8 unit tests; drop the 4 duplicate copies in kb-store-sqlite, kb-store-vector, kb-embed-local, kb-eval (net: -6 duplicate tests, +8 canonical tests). - kb-eval: extract elapsed_ms_u32 helper, drop redundant tracing debug log (with_context already names path on error), replace dead-port :1 test with bind-then-release ephemeral port. Verified: cargo clippy --workspace --all-targets -D warnings clean, all crate tests green (kb-app 12+3 ignored, kb-eval 11, kb-config 17, kb-store-sqlite 33, kb-store-vector 7+8 AVX-gated, kb-embed-local 7+7). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-app/src/app.rs | 224 +++++++++++++++++++++++++--- crates/kb-app/src/lib.rs | 167 ++------------------- crates/kb-config/src/lib.rs | 3 + crates/kb-config/src/paths.rs | 186 +++++++++++++++++++++++ crates/kb-embed-local/src/lib.rs | 137 +---------------- crates/kb-eval/src/runner.rs | 132 +++++----------- crates/kb-eval/tests/runner.rs | 22 ++- crates/kb-store-sqlite/src/store.rs | 52 +------ crates/kb-store-vector/src/paths.rs | 63 +------- crates/kb-store-vector/src/store.rs | 4 +- 10 files changed, 473 insertions(+), 517 deletions(-) create mode 100644 crates/kb-config/src/paths.rs diff --git a/crates/kb-app/src/app.rs b/crates/kb-app/src/app.rs index 01266d8..8156b3e 100644 --- a/crates/kb-app/src/app.rs +++ b/crates/kb-app/src/app.rs @@ -1,24 +1,22 @@ -//! `App` — internal lifecycle struct (§7). +//! `App` — facade lifecycle struct (§7). //! -//! A single `App` represents one CLI invocation's worth of state: a -//! resolved `Config`, an open `SqliteStore`, and (when embeddings are -//! enabled) an `Embedder` + `LanceVectorStore`. Each public free -//! function on `kb-app` wraps `App::open(config)` once, runs the -//! requested op, and drops everything on return. -//! -//! The struct is `pub(crate)` because it is an internal seam: `kb-cli` -//! calls only the free functions on the crate root. `kb-tui` (P9) is -//! expected to hold one `App` for the session, at which point the -//! struct may need to be promoted to `pub`. Until then, keep it -//! private to insulate the wiring shape from downstream callers. +//! A single `App` represents one CLI invocation's (or one TUI +//! session's / one eval-runner suite's) worth of state: a resolved +//! `Config`, an open `SqliteStore`, and (when embeddings are enabled) +//! an `Embedder` + `LanceVectorStore`. Each public free function on +//! `kb-app` builds an `App` once, runs the requested op, and drops +//! everything on return; long-lived callers (kb-eval, the future P9 +//! TUI session) hold onto an `App` across many calls so the per-query +//! cost is just a method dispatch. //! //! ## Embedder + Vector store lifetime //! -//! `App::open` builds the SQLite store unconditionally. The embedder -//! and vector store are *lazy + memoized* — built on first call to -//! [`App::embedder`] / [`App::vector`] and cached in `OnceLock`s — so -//! a long-lived `App` (e.g., the P9 TUI session) pays the ~470 MB -//! ONNX init plus Lance reopen cost exactly once. +//! `App::open_with_config` builds the SQLite store unconditionally. +//! The embedder and vector store are *lazy + memoized* — built on +//! first call to [`App::embedder`] / [`App::vector`] and cached in +//! `OnceLock`s — so a long-lived `App` (kb-eval driving 50 queries, +//! the P9 TUI session) pays the ~470 MB ONNX init plus Lance reopen +//! cost exactly once. //! //! - `kb list` / `kb inspect` never need them. //! - `kb search --mode lexical` never needs them. @@ -27,7 +25,8 @@ //! Building eagerly would force every CLI invocation to load ~470 MB of //! ONNX weights, which is the dominant cold-start cost. The lazy //! pattern keeps the lexical-only paths instant; the memoization makes -//! the TUI's repeated searches cheap after the first. +//! the TUI's repeated searches and the eval runner's per-query loop +//! cheap after the first invocation. //! //! Embeddings can also be **disabled** workspace-wide via //! `config.models.embedding.provider = "none"` (or `dimensions = 0`); @@ -36,15 +35,26 @@ use std::sync::{Arc, OnceLock}; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; -use kb_core::Embedder; +use kb_core::{ + Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode, + SearchQuery, VectorStore, +}; use kb_embed_local::FastembedEmbedder; +use kb_llm_local::OllamaLanguageModel; +use kb_rag::{AskOpts, RagPipeline}; +use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever}; use kb_store_sqlite::SqliteStore; use kb_store_vector::LanceVectorStore; -/// Internal facade state. See module docs for lifetime rules. -pub(crate) struct App { +/// Facade state — see module docs for lifetime rules. +/// +/// The struct is public so long-lived callers (kb-eval, the future P9 +/// TUI session) can construct one and reuse it across many search / +/// ask calls. The OnceLock-backed `embedder` / `vector` fields ensure +/// the cold-start cost is paid exactly once per instance. +pub struct App { pub(crate) config: kb_config::Config, pub(crate) sqlite: Arc, /// Memoized embedder — built lazily on first `embedder()` call when @@ -54,6 +64,11 @@ pub(crate) struct App { /// Memoized vector store — built lazily on first `vector()` call /// when embeddings are enabled. Same rationale as `embedder`. vector: OnceLock>, + /// Memoized LLM — built lazily on first `ask()` call. Sharing one + /// across the eval runner avoids re-handshaking the Ollama HTTP + /// client per query (cheap, but still measurable on a 50-query + /// suite). + llm: OnceLock>, } impl App { @@ -65,7 +80,7 @@ impl App { /// Downstream `LanceVectorStore::new` (called by [`Self::vector`]) /// internally drives a `tokio::Runtime::block_on`, which panics if /// invoked from inside another tokio runtime. - pub(crate) fn open(config: kb_config::Config) -> Result { + pub fn open_with_config(config: kb_config::Config) -> Result { let sqlite = SqliteStore::open(&config).context("kb-app: open SqliteStore")?; sqlite .run_migrations() @@ -75,9 +90,112 @@ impl App { sqlite: Arc::new(sqlite), embedder: OnceLock::new(), vector: OnceLock::new(), + llm: OnceLock::new(), }) } + /// Run a [`SearchQuery`] through the configured retriever stack and + /// return the top-k hits. + /// + /// Reuses any previously-built embedder / vector store on this `App` + /// — long-lived callers (kb-eval, future TUI) get amortized cost + /// across calls. + pub fn search(&self, query: SearchQuery) -> Result> { + match query.mode { + SearchMode::Lexical => { + let lex = LexicalRetriever::with_settings( + self.sqlite.clone(), + lexical_index_version(&self.config), + self.config.search.snippet_chars, + ); + lex.search(&query) + } + SearchMode::Vector => { + let (emb, vec_store) = self.require_embeddings()?; + let vec_iv = vector_index_version(emb.as_ref()); + let vec_dyn: Arc = vec_store; + let emb_dyn: Arc = emb; + let retr = VectorRetriever::with_settings( + vec_dyn, + emb_dyn, + self.sqlite.clone(), + vec_iv, + self.config.search.snippet_chars, + ); + retr.search(&query) + } + SearchMode::Hybrid => { + let lex = Arc::new(LexicalRetriever::with_settings( + self.sqlite.clone(), + lexical_index_version(&self.config), + self.config.search.snippet_chars, + )) as Arc; + let (emb, vec_store) = self.require_embeddings()?; + let vec_iv = vector_index_version(emb.as_ref()); + let vec_dyn: Arc = vec_store; + let emb_dyn: Arc = emb; + let vec_retr = Arc::new(VectorRetriever::with_settings( + vec_dyn, + emb_dyn, + self.sqlite.clone(), + vec_iv, + self.config.search.snippet_chars, + )) as Arc; + let hybrid = HybridRetriever::new(&self.config, lex, vec_retr); + hybrid.search(&query) + } + } + } + + /// Run a RAG `ask` against the configured retriever + LLM. Reuses + /// the memoized embedder / vector / LLM where applicable. + pub fn ask(&self, query: &str, opts: AskOpts) -> Result { + let retriever: Arc = match opts.mode { + SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings( + self.sqlite.clone(), + lexical_index_version(&self.config), + self.config.search.snippet_chars, + )), + SearchMode::Vector => { + let (emb, vec_store) = self.require_embeddings()?; + let vec_iv = vector_index_version(emb.as_ref()); + let vec_dyn: Arc = vec_store; + let emb_dyn: Arc = emb; + Arc::new(VectorRetriever::with_settings( + vec_dyn, + emb_dyn, + self.sqlite.clone(), + vec_iv, + self.config.search.snippet_chars, + )) + } + SearchMode::Hybrid => { + let lex = Arc::new(LexicalRetriever::with_settings( + self.sqlite.clone(), + lexical_index_version(&self.config), + self.config.search.snippet_chars, + )) as Arc; + let (emb, vec_store) = self.require_embeddings()?; + let vec_iv = vector_index_version(emb.as_ref()); + let vec_dyn: Arc = vec_store; + let emb_dyn: Arc = emb; + let vec_retr = Arc::new(VectorRetriever::with_settings( + vec_dyn, + emb_dyn, + self.sqlite.clone(), + vec_iv, + self.config.search.snippet_chars, + )) as Arc; + Arc::new(HybridRetriever::new(&self.config, lex, vec_retr)) + } + }; + + let llm = self.llm()?; + let pipeline = + RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone()); + pipeline.ask(query, opts) + } + /// Returns `true` when the workspace has embeddings turned off /// (`provider = "none"` or `dimensions = 0`). Lexical-only mode. pub(crate) fn embeddings_disabled(&self) -> bool { @@ -123,4 +241,64 @@ impl App { let _ = self.vector.set(store.clone()); Ok(Some(self.vector.get().cloned().unwrap_or(store))) } + + /// Build (or reuse) the configured LLM. Currently always Ollama; + /// when a second provider lands this is the place to switch on + /// `config.models.llm.provider`. + fn llm(&self) -> Result> { + if let Some(l) = self.llm.get() { + return Ok(l.clone()); + } + let llm: Arc = Arc::new( + OllamaLanguageModel::new(&self.config) + .context("kb-app::ask: build OllamaLanguageModel")?, + ); + let _ = self.llm.set(llm.clone()); + Ok(self.llm.get().cloned().unwrap_or(llm)) + } + + /// Resolve the embedder + vector store, surfacing the user-friendly + /// "switch to --mode lexical" error when embeddings are disabled. + fn require_embeddings( + &self, + ) -> Result<( + Arc, + Arc, + )> { + let emb = self.embedder()?.ok_or_else(|| { + anyhow!( + "embeddings disabled (config.models.embedding.provider == \"none\" \ + or dimensions == 0); vector / hybrid search require embeddings — \ + switch to --mode lexical or enable an embedding provider in config.toml" + ) + })?; + let vec_store = self.vector()?.ok_or_else(|| { + anyhow!( + "vector store unavailable while embedder is configured — this should \ + not happen; check `kb doctor` and the data_dir permissions" + ) + })?; + Ok((emb, vec_store)) + } +} + +/// Compose a stable `IndexVersion` for the lexical retriever from +/// the active config. This token surfaces in `SearchHit.index_version` +/// and on snapshot tests; including the chunker version pins it to +/// the chunking policy in effect. +fn lexical_index_version(config: &kb_config::Config) -> IndexVersion { + IndexVersion(format!("lex:{}", config.chunking.chunker_version)) +} + +/// Compose a stable `IndexVersion` for the vector retriever. Tracks +/// `(embedding_model, embedding_version, dimensions)` so a model swap +/// flags drift via the existing index_version mismatch warning in +/// `HybridRetriever::new`. +fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion { + IndexVersion(format!( + "vec:{}@{}:{}", + embedder.model_id().0, + embedder.model_version().0, + embedder.dimensions(), + )) } diff --git a/crates/kb-app/src/lib.rs b/crates/kb-app/src/lib.rs index 8903e6a..aefc030 100644 --- a/crates/kb-app/src/lib.rs +++ b/crates/kb-app/src/lib.rs @@ -43,22 +43,18 @@ use kb_chunk::MdHeadingV1Chunker; use kb_core::{ Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, - EmbeddingKind, IndexVersion, IngestReport, LanguageModel, ParserVersion, RawAsset, - Retriever, SearchHit, SearchMode, SearchQuery, SourceConnector, SourceScope, - SourceUri, VectorRecord, VectorStore, + EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery, + SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore, }; -use kb_llm_local::OllamaLanguageModel; use kb_normalize::build_canonical_document; use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; -use kb_rag::RagPipeline; -use kb_search::{HybridRetriever, LexicalRetriever, VectorRetriever}; use kb_source_fs::FsSourceConnector; mod app; pub mod doctor_signal; pub mod logging; -use app::App; +pub use app::App; /// Parser-version label persisted in `documents.parser_version` for /// every Markdown file ingested through the `kb-parse-md` pipeline. @@ -168,7 +164,7 @@ pub fn ingest_with_config( ) -> anyhow::Result { let started_instant = std::time::Instant::now(); - let app = App::open(config)?; + let app = App::open_with_config(config)?; // Walk the workspace. let connector = FsSourceConnector::new(&app.config) @@ -667,7 +663,7 @@ pub fn list_docs_with_config( config: kb_config::Config, filter: DocFilter, ) -> anyhow::Result> { - let app = App::open(config)?; + let app = App::open_with_config(config)?; app.sqlite.list_documents(&filter) } @@ -683,7 +679,7 @@ pub fn inspect_doc_with_config( config: kb_config::Config, id: &DocumentId, ) -> anyhow::Result { - let app = App::open(config)?; + let app = App::open_with_config(config)?; app.sqlite .get_document(id)? .ok_or_else(|| anyhow!("document not found: {} (try `kb list docs`)", id.0)) @@ -701,7 +697,7 @@ pub fn inspect_chunk_with_config( config: kb_config::Config, id: &ChunkId, ) -> anyhow::Result { - let app = App::open(config)?; + let app = App::open_with_config(config)?; app.sqlite .get_chunk(id)? .ok_or_else(|| anyhow!("chunk not found: {} (try `kb inspect doc `)", id.0)) @@ -715,101 +711,15 @@ pub fn search(query: SearchQuery) -> anyhow::Result> { } /// Test-only seam — kb-cli must call the public free function -/// ([`search`]), not this. +/// ([`search`]), not this. Builds a one-shot `App` and delegates to +/// [`App::search`]; long-lived callers should hold an `App` instance +/// directly to amortize the embedder / vector-store cold start. #[doc(hidden)] pub fn search_with_config( config: kb_config::Config, query: SearchQuery, ) -> anyhow::Result> { - let app = App::open(config)?; - - match query.mode { - SearchMode::Lexical => { - let lex = LexicalRetriever::with_settings( - app.sqlite.clone(), - lexical_index_version(&app.config), - app.config.search.snippet_chars, - ); - lex.search(&query) - } - SearchMode::Vector => { - let (emb, vec_store) = require_embeddings(&app)?; - let vec_iv = vector_index_version(emb.as_ref()); - let vec_dyn: Arc = vec_store; - let emb_dyn: Arc = emb; - let retr = VectorRetriever::with_settings( - vec_dyn, - emb_dyn, - app.sqlite.clone(), - vec_iv, - app.config.search.snippet_chars, - ); - retr.search(&query) - } - SearchMode::Hybrid => { - let lex = Arc::new(LexicalRetriever::with_settings( - app.sqlite.clone(), - lexical_index_version(&app.config), - app.config.search.snippet_chars, - )) as Arc; - let (emb, vec_store) = require_embeddings(&app)?; - let vec_iv = vector_index_version(emb.as_ref()); - let vec_dyn: Arc = vec_store; - let emb_dyn: Arc = emb; - let vec_retr = Arc::new(VectorRetriever::with_settings( - vec_dyn, - emb_dyn, - app.sqlite.clone(), - vec_iv, - app.config.search.snippet_chars, - )) as Arc; - let hybrid = HybridRetriever::new(&app.config, lex, vec_retr); - hybrid.search(&query) - } - } -} - -fn require_embeddings( - app: &App, -) -> anyhow::Result<( - Arc, - Arc, -)> { - let emb = app.embedder()?.ok_or_else(|| { - anyhow!( - "embeddings disabled (config.models.embedding.provider == \"none\" \ - or dimensions == 0); vector / hybrid search require embeddings — \ - switch to --mode lexical or enable an embedding provider in config.toml" - ) - })?; - let vec_store = app.vector()?.ok_or_else(|| { - anyhow!( - "vector store unavailable while embedder is configured — this should \ - not happen; check `kb doctor` and the data_dir permissions" - ) - })?; - Ok((emb, vec_store)) -} - -/// Compose a stable `IndexVersion` for the lexical retriever from -/// the active config. This token surfaces in `SearchHit.index_version` -/// and on snapshot tests; including the chunker version pins it to -/// the chunking policy in effect. -fn lexical_index_version(config: &kb_config::Config) -> IndexVersion { - IndexVersion(format!("lex:{}", config.chunking.chunker_version)) -} - -/// Compose a stable `IndexVersion` for the vector retriever. Tracks -/// `(embedding_model, embedding_version, dimensions)` so a model swap -/// flags drift via the existing index_version mismatch warning in -/// `HybridRetriever::new`. -fn vector_index_version(embedder: &dyn Embedder) -> IndexVersion { - IndexVersion(format!( - "vec:{}@{}:{}", - embedder.model_id().0, - embedder.model_version().0, - embedder.dimensions(), - )) + App::open_with_config(config)?.search(query) } // ── ask ────────────────────────────────────────────────────────────────── @@ -826,64 +736,15 @@ pub fn ask(query: &str, opts: AskOpts) -> anyhow::Result { } /// Test-only seam — kb-cli must call the public free function -/// ([`ask`]), not this. Mirrors the `*_with_config` pattern documented -/// at the top of this module. +/// ([`ask`]), not this. Builds a one-shot `App` and delegates to +/// [`App::ask`]. #[doc(hidden)] pub fn ask_with_config( config: kb_config::Config, query: &str, opts: AskOpts, ) -> anyhow::Result { - let app = App::open(config)?; - - let retriever: Arc = match opts.mode { - SearchMode::Lexical => Arc::new(LexicalRetriever::with_settings( - app.sqlite.clone(), - lexical_index_version(&app.config), - app.config.search.snippet_chars, - )), - SearchMode::Vector => { - let (emb, vec_store) = require_embeddings(&app)?; - let vec_iv = vector_index_version(emb.as_ref()); - let vec_dyn: Arc = vec_store; - let emb_dyn: Arc = emb; - Arc::new(VectorRetriever::with_settings( - vec_dyn, - emb_dyn, - app.sqlite.clone(), - vec_iv, - app.config.search.snippet_chars, - )) - } - SearchMode::Hybrid => { - let lex = Arc::new(LexicalRetriever::with_settings( - app.sqlite.clone(), - lexical_index_version(&app.config), - app.config.search.snippet_chars, - )) as Arc; - let (emb, vec_store) = require_embeddings(&app)?; - let vec_iv = vector_index_version(emb.as_ref()); - let vec_dyn: Arc = vec_store; - let emb_dyn: Arc = emb; - let vec_retr = Arc::new(VectorRetriever::with_settings( - vec_dyn, - emb_dyn, - app.sqlite.clone(), - vec_iv, - app.config.search.snippet_chars, - )) as Arc; - Arc::new(HybridRetriever::new(&app.config, lex, vec_retr)) - } - }; - - let llm: Arc = Arc::new( - OllamaLanguageModel::new(&app.config) - .context("kb-app::ask: build OllamaLanguageModel")?, - ); - - let pipeline = - RagPipeline::new(app.config.clone(), retriever, llm, app.sqlite.clone()); - pipeline.ask(query, opts) + App::open_with_config(config)?.ask(query, opts) } /// Run the doctor checks against the explicit config path the user diff --git a/crates/kb-config/src/lib.rs b/crates/kb-config/src/lib.rs index b2b1352..4620ff1 100644 --- a/crates/kb-config/src/lib.rs +++ b/crates/kb-config/src/lib.rs @@ -8,6 +8,9 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; +mod paths; +pub use paths::expand_path; + #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct Config { pub schema_version: u32, diff --git a/crates/kb-config/src/paths.rs b/crates/kb-config/src/paths.rs new file mode 100644 index 0000000..f36c6dc --- /dev/null +++ b/crates/kb-config/src/paths.rs @@ -0,0 +1,186 @@ +//! Shared path expansion helper. +//! +//! `Config::storage.*` fields are stored as raw template strings (e.g. +//! `${XDG_DATA_HOME:-~/.local/share}/kb`, `{data_dir}/runs`). Every +//! crate that turns one of those strings into a real filesystem path +//! needs to apply the same set of substitutions; this module is the +//! single source of truth so the behavior cannot drift. +//! +//! Substitutions, applied in order: +//! +//! 1. `{data_dir}` → caller-supplied `data_dir`. +//! - When the caller passes an empty `data_dir` (because they ARE +//! resolving `data_dir` itself), the substitution is a no-op so +//! a literal `{data_dir}` is left in place rather than producing +//! a `/{data_dir}/...` artifact. +//! 2. `${XDG_DATA_HOME:-}` (or the bare `${XDG_DATA_HOME}`) → +//! the env var if set + non-empty, else the default after `:-`. +//! Mimics POSIX shell's `${VAR:-default}` semantics. Mid-string +//! occurrences are supported; only the first match is replaced. +//! 3. Leading `~` / `~/...` → `$HOME`. Any non-leading `~` is left +//! literal (matches shell behavior — only the first segment expands). +//! +//! The result is a `PathBuf` regardless of whether all substitutions +//! were applicable; relative paths are kept relative to the caller's +//! CWD (not resolved here). + +use std::path::PathBuf; + +/// Expand storage-path templates. See module docs for the substitution +/// rules. +/// +/// Pass an empty `data_dir` when resolving `data_dir` itself; the +/// `{data_dir}` substitution becomes a no-op in that case so the +/// recursive shape (`data_dir = "${XDG_DATA_HOME:-…}/kb"`) resolves +/// without producing a literal `{data_dir}` token in the output. +pub fn expand_path(raw: &str, data_dir: &str) -> PathBuf { + let mut s = raw.to_string(); + + // 1. {data_dir} substitution (skipped when resolving data_dir + // itself; see module docs). + if !data_dir.is_empty() { + s = s.replace("{data_dir}", data_dir); + } + + // 2. ${XDG_DATA_HOME:-}: env override else default. + if let Some(start) = s.find("${XDG_DATA_HOME") { + if let Some(rel_end) = s[start..].find('}') { + let end = start + rel_end + 1; // include trailing '}' + let inner = &s[start + 2..end - 1]; // strip ${ and } + let replacement = match std::env::var("XDG_DATA_HOME") { + Ok(v) if !v.is_empty() => v, + _ => match inner.split_once(":-") { + Some((_, default)) => default.to_string(), + None => String::new(), + }, + }; + s.replace_range(start..end, &replacement); + } + } + + // 3. Leading `~` → $HOME. + if let Some(rest) = s.strip_prefix('~') { + if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) { + return home.join(rest.trim_start_matches('/')); + } + } + + PathBuf::from(s) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Mutex as StdMutex; + + /// `XDG_DATA_HOME` / `HOME` env mutations must be serialized so + /// concurrent test runs (cargo's default parallel runner) don't + /// observe each other's transient values. + static ENV_LOCK: StdMutex<()> = StdMutex::new(()); + + /// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores + /// it on drop. + struct XdgGuard { + prior: Option, + } + + impl XdgGuard { + fn capture() -> Self { + Self { + prior: std::env::var("XDG_DATA_HOME").ok(), + } + } + } + + impl Drop for XdgGuard { + fn drop(&mut self) { + // SAFETY: edition 2024 marks set_var/remove_var unsafe + // because env mutation is not thread-safe. The ENV_LOCK + // guard at the call site prevents concurrent observation. + unsafe { + match &self.prior { + Some(v) => std::env::set_var("XDG_DATA_HOME", v), + None => std::env::remove_var("XDG_DATA_HOME"), + } + } + } + } + + #[test] + fn substitutes_data_dir_template() { + let p = expand_path("{data_dir}/runs", "/tmp/kbtest"); + assert_eq!(p, PathBuf::from("/tmp/kbtest/runs")); + } + + #[test] + fn data_dir_substitution_skipped_when_empty() { + // Empty `data_dir` is the "resolving data_dir itself" signal; + // the literal `{data_dir}` token must survive. + let p = expand_path("{data_dir}/runs", ""); + assert_eq!(p, PathBuf::from("{data_dir}/runs")); + } + + #[test] + fn passthrough_absolute_path() { + let p = expand_path("/abs/runs", "/ignored"); + assert_eq!(p, PathBuf::from("/abs/runs")); + } + + #[test] + fn xdg_data_home_set_replaces_var() { + let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let _guard = XdgGuard::capture(); + // SAFETY: lock held for the duration of this test. + unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") }; + + let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", ""); + assert_eq!(p, PathBuf::from("/custom/path/kb")); + } + + #[test] + fn xdg_data_home_unset_uses_default() { + let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let _guard = XdgGuard::capture(); + // SAFETY: lock held for the duration of this test. + unsafe { std::env::remove_var("XDG_DATA_HOME") }; + + let home = std::env::var("HOME").expect("HOME must be set in tests"); + let expected = PathBuf::from(home).join(".local/share/kb"); + let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", ""); + assert_eq!(p, expected); + } + + #[test] + fn xdg_with_no_default_resolves_to_empty_when_unset() { + let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let _guard = XdgGuard::capture(); + // SAFETY: lock held for the duration of this test. + unsafe { std::env::remove_var("XDG_DATA_HOME") }; + + // No `:-default` clause, no env var → empty string substitution. + let p = expand_path("${XDG_DATA_HOME}/kb", ""); + assert_eq!(p, PathBuf::from("/kb")); + } + + #[test] + fn leading_tilde_expands_to_home() { + let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let home = std::env::var("HOME").expect("HOME must be set in tests"); + let p = expand_path("~/runs", ""); + assert_eq!(p, PathBuf::from(home).join("runs")); + } + + #[test] + fn data_dir_then_xdg_then_tilde_compose() { + // Order matters: substitute `{data_dir}` (which itself contains + // an unexpanded `${XDG_DATA_HOME}` and `~`), then the other two + // resolve the result. + let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let _guard = XdgGuard::capture(); + // SAFETY: lock held for the duration of this test. + unsafe { std::env::set_var("XDG_DATA_HOME", "/xdg/data") }; + + let p = expand_path("{data_dir}/runs", "/xdg/data/kb"); + assert_eq!(p, PathBuf::from("/xdg/data/kb/runs")); + } +} diff --git a/crates/kb-embed-local/src/lib.rs b/crates/kb-embed-local/src/lib.rs index 1d3e2d0..2d0796c 100644 --- a/crates/kb-embed-local/src/lib.rs +++ b/crates/kb-embed-local/src/lib.rs @@ -22,11 +22,11 @@ //! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` //! §7.2 (Embedder), §6.4 ([models.embedding]), §9 (versioning). -use std::path::PathBuf; use std::sync::Mutex; use anyhow::{Context, Result}; use fastembed::{EmbeddingModel, InitOptions, TextEmbedding}; +use kb_config::expand_path; use kb_embed::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion}; /// Subdirectory under `config.storage.model_dir` where the fastembed @@ -60,9 +60,8 @@ impl FastembedEmbedder { /// first `embed`). pub fn new(config: &kb_config::Config) -> Result { // 1. Resolve `{data_dir}/models/fastembed/` from the config - // templates. `kb-config` does not expose a public path - // resolver yet, so we hand-roll a tiny one mirroring - // kb-store-sqlite's `expand_data_dir`. + // templates. Goes through the shared `kb_config::expand_path` + // so every crate resolves storage paths identically. let data_dir = expand_path(&config.storage.data_dir, ""); let model_dir = expand_path(&config.storage.model_dir, &data_dir.to_string_lossy()); let cache_dir = model_dir.join(FASTEMBED_CACHE_SUBDIR); @@ -222,58 +221,6 @@ pub(crate) fn check_dim(model_dim: usize, cfg_dim: usize) -> Result<()> { Ok(()) } -/// Expand the limited template language `kb-config` uses for storage -/// paths. -/// -/// Supported substitutions, applied in order: -/// 1. `{data_dir}` → `data_dir` (caller-supplied resolved string). This -/// is a no-op when `data_dir` is empty (used by the recursive call -/// that resolves `data_dir` itself). -/// 2. `${XDG_DATA_HOME:-~/.local/share}` (and the bare -/// `${XDG_DATA_HOME}`) → env var if set, else the default after -/// `:-`. -/// 3. Leading `~` → `$HOME`. -/// -/// Mirrors `kb-store-sqlite::store::expand_data_dir`. Kept private to -/// this crate; promoting it to a public `kb-config` API is a separate -/// task (see task p3-2 risks: "don't expand kb-config's public API"). -fn expand_path(raw: &str, data_dir: &str) -> PathBuf { - let mut s = raw.to_string(); - - if !data_dir.is_empty() { - s = s.replace("{data_dir}", data_dir); - } - - // ${XDG_DATA_HOME:-~/.local/share}: respect env override, else fall - // back to the suffix after `:-`. - if let Some(start) = s.find("${XDG_DATA_HOME") { - if let Some(rel_end) = s[start..].find('}') { - let end = start + rel_end + 1; // include trailing '}' - let inner = &s[start + 2..end - 1]; // strip ${ and } - let replacement = match std::env::var("XDG_DATA_HOME") { - Ok(v) if !v.is_empty() => v, - _ => { - if let Some((_, default)) = inner.split_once(":-") { - default.to_string() - } else { - String::new() - } - } - }; - s.replace_range(start..end, &replacement); - } - } - - // Leading `~` → $HOME. - if let Some(rest) = s.strip_prefix('~') { - if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) { - return home.join(rest.trim_start_matches('/')); - } - } - - PathBuf::from(s) -} - #[cfg(test)] mod tests { use super::*; @@ -354,80 +301,6 @@ mod tests { assert!(msg.contains("unsupported embedding model"), "msg={msg}"); } - // ── expand_path ────────────────────────────────────────────────── - - #[test] - fn expand_path_substitutes_data_dir_template() { - let p = expand_path("{data_dir}/models", "/tmp/kbtest"); - assert_eq!(p, PathBuf::from("/tmp/kbtest/models")); - } - - #[test] - fn expand_path_no_op_without_template() { - let p = expand_path("/abs/path", "/tmp/kbtest"); - assert_eq!(p, PathBuf::from("/abs/path")); - } - - // ── expand_path: XDG_DATA_HOME fallback ────────────────────────── - // - // These two tests mutate the process-wide `XDG_DATA_HOME` env var, - // which is unsafe under edition 2024 and racy under cargo's default - // parallel test runner. The shared `ENV_LOCK` serializes them; each - // test snapshots the prior value and restores it on exit. - - use std::sync::Mutex as StdMutex; - static ENV_LOCK: StdMutex<()> = StdMutex::new(()); - - /// RAII guard: snapshots `XDG_DATA_HOME` on construction, restores - /// it on drop. Pair with the `ENV_LOCK` guard for serial access. - struct XdgGuard { - prior: Option, - } - - impl XdgGuard { - fn capture() -> Self { - Self { - prior: std::env::var("XDG_DATA_HOME").ok(), - } - } - } - - impl Drop for XdgGuard { - fn drop(&mut self) { - // SAFETY: edition 2024 marks `set_var`/`remove_var` unsafe - // because env mutation is not thread-safe. Callers hold - // `ENV_LOCK` for the duration of the test, so no other - // thread observes the mutation. - unsafe { - match &self.prior { - Some(v) => std::env::set_var("XDG_DATA_HOME", v), - None => std::env::remove_var("XDG_DATA_HOME"), - } - } - } - } - - #[test] - fn expand_path_xdg_data_home_set() { - let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); - let _guard = XdgGuard::capture(); - // SAFETY: lock held for the duration of this test. - unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") }; - - let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", ""); - assert_eq!(p, PathBuf::from("/custom/path/kb")); - } - - #[test] - fn expand_path_xdg_data_home_unset_falls_back_to_home() { - let _lock = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); - let _guard = XdgGuard::capture(); - // SAFETY: lock held for the duration of this test. - unsafe { std::env::remove_var("XDG_DATA_HOME") }; - - let home = std::env::var("HOME").expect("HOME must be set in tests"); - let expected = PathBuf::from(home).join(".local/share/kb"); - let p = expand_path("${XDG_DATA_HOME:-~/.local/share}/kb", ""); - assert_eq!(p, expected); - } + // expand_path tests live in `kb-config::paths`. The adapter imports + // it and trusts the upstream coverage rather than duplicating it. } diff --git a/crates/kb-eval/src/runner.rs b/crates/kb-eval/src/runner.rs index 5ebd340..7916654 100644 --- a/crates/kb-eval/src/runner.rs +++ b/crates/kb-eval/src/runner.rs @@ -3,8 +3,11 @@ use std::fs::File; use std::io::{BufWriter, Write}; use std::path::PathBuf; +use std::time::Instant; use anyhow::{Context, Result}; +use kb_app::App; +use kb_config::expand_path; use kb_core::{SearchFilters, SearchQuery}; use kb_store_sqlite::{EvalRunRow, SqliteStore}; use time::OffsetDateTime; @@ -12,6 +15,14 @@ use time::OffsetDateTime; use crate::loader::{load_golden_set, validate_against_db}; use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult}; +/// Convert a wall-clock duration since `start` into milliseconds clamped +/// to `u32::MAX`. The `QueryResult.elapsed_ms` and `eval_runs.duration_ms` +/// fields are `u32`; saturate (rather than wrap) so a stuck run never +/// reports a misleading sub-second duration. +fn elapsed_ms_u32(start: Instant) -> u32 { + start.elapsed().as_millis().min(u128::from(u32::MAX)) as u32 +} + /// Env var that overrides the default `fixtures/golden_queries.yaml` /// path. Resolved relative to the current working directory. const KB_EVAL_GOLDEN: &str = "KB_EVAL_GOLDEN"; @@ -32,15 +43,13 @@ pub fn run_eval(opts: &EvalRunOpts) -> Result { /// data_dir) and any future caller that wants to drive the runner /// against a non-default config. pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Result { - let started = std::time::Instant::now(); + let started = Instant::now(); // ── 1. Load golden set ──────────────────────────────────────────────── + // + // `with_context` already names the path on error, so a separate + // `tracing::debug!` here would just be noise. let golden_path = resolve_golden_path(); - tracing::debug!( - target: "kb-eval", - path = %golden_path.display(), - "kb-eval: loading golden set" - ); let queries = load_golden_set(&golden_path).with_context(|| { format!( "load golden set from {} (override via KB_EVAL_GOLDEN)", @@ -69,9 +78,15 @@ pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Resu serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?; // ── 4. Per-query execution ──────────────────────────────────────────── + // + // Open one `App` for the whole suite. The embedder / vector store / + // LLM are memoized on the App, so a 50-query run pays the ~470 MB + // ONNX init + Lance reopen + Ollama handshake exactly once. + let app = App::open_with_config(cfg.clone()).context("open App for run_eval")?; + let mut per_query: Vec = Vec::with_capacity(queries.len()); for gq in &queries { - let qr = execute_query(cfg, gq, opts); + let qr = execute_query(&app, gq, opts); per_query.push(qr); } @@ -99,7 +114,7 @@ pub fn run_eval_with_config(cfg: &kb_config::Config, opts: &EvalRunOpts) -> Resu // ── 6. Mirror to runs_dir//per_query.jsonl ──────────────────── write_per_query_jsonl(cfg, &run_id, &per_query)?; - let duration_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32; + let duration_ms = elapsed_ms_u32(started); tracing::info!( target: "kb-eval", run_id = %run_id, @@ -139,8 +154,8 @@ fn resolve_golden_path() -> PathBuf { /// Run one [`GoldenQuery`] through the kb-app facade. Errors are /// captured into `QueryResult.error` so the run continues. -fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult { - let started = std::time::Instant::now(); +fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult { + let started = Instant::now(); let search_query = SearchQuery { text: gq.query.clone(), @@ -149,7 +164,7 @@ fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) filters: SearchFilters::default(), }; - let (hits_top_k, mut error) = match kb_app::search_with_config(cfg.clone(), search_query) { + let (hits_top_k, mut error) = match app.search(search_query) { Ok(hits) => (hits, None), Err(e) => (Vec::new(), Some(format!("{e:#}"))), }; @@ -166,7 +181,7 @@ fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) seed: opts.seed, stream_sink: None, }; - match kb_app::ask_with_config(cfg.clone(), &gq.query, ask_opts) { + match app.ask(&gq.query, ask_opts) { Ok(ans) => Some(ans), Err(e) => { error = Some(format!("{e:#}")); @@ -177,15 +192,13 @@ fn execute_query(cfg: &kb_config::Config, gq: &GoldenQuery, opts: &EvalRunOpts) None }; - let elapsed_ms = started.elapsed().as_millis().min(u128::from(u32::MAX)) as u32; - QueryResult { query_id: gq.id.clone(), query: gq.query.clone(), mode: opts.mode, hits_top_k, answer, - elapsed_ms, + elapsed_ms: elapsed_ms_u32(started), error, } } @@ -231,7 +244,14 @@ fn write_per_query_jsonl( run_id: &str, per_query: &[QueryResult], ) -> Result<()> { - let runs_dir = expand_path(&cfg.storage.runs_dir, &cfg.storage.data_dir); + // `data_dir` may itself contain `${XDG_DATA_HOME:-…}` / `~` (the + // workspace-default does); resolve it before threading it into the + // `{data_dir}` substitution of `runs_dir`. + let resolved_data_dir = expand_path(&cfg.storage.data_dir, ""); + let runs_dir = expand_path( + &cfg.storage.runs_dir, + &resolved_data_dir.to_string_lossy(), + ); let run_dir = runs_dir.join(run_id); std::fs::create_dir_all(&run_dir) .with_context(|| format!("create run dir {}", run_dir.display()))?; @@ -248,83 +268,3 @@ fn write_per_query_jsonl( w.flush().context("flush per_query.jsonl")?; Ok(()) } - -/// Expand `{data_dir}` / `${XDG_DATA_HOME:-…}` / leading `~`. Mirror -/// of `kb-store-vector::paths::expand_path` and -/// `kb-store-sqlite::expand_data_dir` — kept private here because -/// `kb-config` does not (yet) expose a shared resolver helper. -fn expand_path(raw: &str, data_dir: &str) -> PathBuf { - let mut s = raw.to_string(); - - // First, resolve `data_dir` itself so any `{data_dir}` substitution - // points at an already-expanded base path. `data_dir` may contain - // `${XDG_DATA_HOME:-…}` and `~`; resolve them once and re-use the - // result. - let resolved_data_dir = expand_xdg_and_tilde(data_dir); - s = s.replace("{data_dir}", &resolved_data_dir); - - expand_xdg_and_tilde_path(&s) -} - -fn expand_xdg_and_tilde(raw: &str) -> String { - let s = expand_xdg(raw); - expand_tilde_str(&s) -} - -fn expand_xdg_and_tilde_path(raw: &str) -> PathBuf { - let s = expand_xdg_and_tilde(raw); - PathBuf::from(s) -} - -fn expand_xdg(raw: &str) -> String { - let mut s = raw.to_string(); - if let Some(start) = s.find("${XDG_DATA_HOME") { - if let Some(rel_end) = s[start..].find('}') { - let end = start + rel_end + 1; - let inner = &s[start + 2..end - 1]; - let replacement = match std::env::var("XDG_DATA_HOME") { - Ok(v) if !v.is_empty() => v, - _ => match inner.split_once(":-") { - Some((_, default)) => default.to_string(), - None => String::new(), - }, - }; - s.replace_range(start..end, &replacement); - } - } - s -} - -fn expand_tilde_str(raw: &str) -> String { - if let Some(rest) = raw.strip_prefix("~/") { - if let Some(home) = std::env::var_os("HOME") { - let mut p = PathBuf::from(home); - p.push(rest); - return p.to_string_lossy().into_owned(); - } - } - if raw == "~" { - if let Some(home) = std::env::var_os("HOME") { - return PathBuf::from(home).to_string_lossy().into_owned(); - } - } - raw.to_string() -} - -#[cfg(test)] -mod expand_tests { - use super::*; - use std::path::Path; - - #[test] - fn expand_substitutes_data_dir() { - let p = expand_path("{data_dir}/runs", "/tmp/kbtest"); - assert_eq!(p, Path::new("/tmp/kbtest/runs")); - } - - #[test] - fn expand_passthrough_absolute() { - let p = expand_path("/abs/runs", "/ignored"); - assert_eq!(p, Path::new("/abs/runs")); - } -} diff --git a/crates/kb-eval/tests/runner.rs b/crates/kb-eval/tests/runner.rs index eba63af..248c1fd 100644 --- a/crates/kb-eval/tests/runner.rs +++ b/crates/kb-eval/tests/runner.rs @@ -120,6 +120,18 @@ fn write_golden(dir: &Path, body: &str) -> PathBuf { path } +/// Bind a fresh ephemeral port, then release it. The returned URL +/// points at a port that was just freed; very likely still unbound +/// when the test issues its outbound connection a moment later, in +/// which case `connect()` fails fast with `ECONNREFUSED`. Beats +/// hard-coding port 1 which can timeout slowly on hardened hosts. +fn unreachable_endpoint() -> String { + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let port = listener.local_addr().unwrap().port(); + drop(listener); + format!("http://127.0.0.1:{port}") +} + fn lexical_opts() -> EvalRunOpts { EvalRunOpts { suite: "test".to_string(), @@ -212,10 +224,14 @@ fn runner_records_config_snapshot_with_versions() { #[test] fn runner_captures_per_query_error_when_rag_unreachable() { let env = RunEnv::new(); - // Point Ollama at a guaranteed-dead port so `ask_with_config` - // surfaces a connection error per query. + // Point Ollama at an unbound port so `ask_with_config` surfaces a + // connection error per query. We use bind-then-release rather than + // a hard-coded `:1` because port 1 is reserved-but-not-guaranteed- + // unbound (some hardened systems answer with ICMP unreachable + // instantly, others timeout slowly). TOCTOU race is theoretically + // possible but rare in practice and faster-failing than `:1`. let mut config = env.config.clone(); - config.models.llm.endpoint = "http://127.0.0.1:1".to_string(); + config.models.llm.endpoint = unreachable_endpoint(); let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n query: ownership\n"); diff --git a/crates/kb-store-sqlite/src/store.rs b/crates/kb-store-sqlite/src/store.rs index b7f57d6..f0c1560 100644 --- a/crates/kb-store-sqlite/src/store.rs +++ b/crates/kb-store-sqlite/src/store.rs @@ -64,7 +64,7 @@ impl SqliteStore { /// temp_store=MEMORY), and create parent directories as needed. /// **Does not run migrations** — call [`Self::run_migrations`] next. pub fn open(config: &kb_config::Config) -> Result { - let data_dir = expand_data_dir(&config.storage.data_dir); + let data_dir = kb_config::expand_path(&config.storage.data_dir, ""); std::fs::create_dir_all(&data_dir) .with_context(|| format!("create data_dir {}", data_dir.display()))?; let db_path = data_dir.join(SQLITE_FILE); @@ -363,53 +363,3 @@ fn apply_pragmas(conn: &Connection) -> Result<()> { Ok(()) } -/// Expand the placeholders / `~` / env-vars used by `Config::storage.data_dir`. -/// -/// Supported substitutions, in order: -/// - `${XDG_DATA_HOME:-~/.local/share}` (and the bare `${XDG_DATA_HOME}`) -/// - leading `~` → `$HOME` -/// -/// If neither produces an absolute path, the input is returned as-is -/// (relative paths are kept relative to the caller's CWD). -fn expand_data_dir(raw: &str) -> PathBuf { - let mut s = raw.to_string(); - - // ${XDG_DATA_HOME:-~/.local/share}: respect the env override, else - // fall back to the suffix after `:-`. - if let Some(start) = s.find("${XDG_DATA_HOME") { - if let Some(rel_end) = s[start..].find('}') { - let end = start + rel_end + 1; // include trailing '}' - let inner = &s[start + 2..end - 1]; // strip ${ and } - let replacement = match std::env::var("XDG_DATA_HOME") { - Ok(v) if !v.is_empty() => v, - _ => { - // inner is e.g. `XDG_DATA_HOME:-~/.local/share`. - if let Some((_, default)) = inner.split_once(":-") { - default.to_string() - } else { - // No default supplied; mimic Bash and yield "". - String::new() - } - } - }; - s.replace_range(start..end, &replacement); - } - } - - // ~ at the front → $HOME (or `dirs::home_dir`). - if let Some(rest) = s.strip_prefix('~') { - if let Some(home) = std::env::var_os("HOME").map(PathBuf::from).or_else(dirs_home_fallback) - { - return home.join(rest.trim_start_matches('/')); - } - } - - PathBuf::from(s) -} - -/// Tiny shim to avoid pulling in the `dirs` crate as a direct dep — we -/// only fall back when `$HOME` is unset, which is exotic on the platforms -/// we target. Returns `None` so the caller keeps the literal `~`. -fn dirs_home_fallback() -> Option { - None -} diff --git a/crates/kb-store-vector/src/paths.rs b/crates/kb-store-vector/src/paths.rs index 87d2979..40508f2 100644 --- a/crates/kb-store-vector/src/paths.rs +++ b/crates/kb-store-vector/src/paths.rs @@ -1,51 +1,10 @@ //! Path expansion + table-name sanitization. //! -//! Mirrors `kb-store-sqlite::store::expand_data_dir` and -//! `kb-embed-local::expand_path` so the three crates resolve -//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. A -//! shared helper would live in `kb-config`, but the task spec forbids -//! adding new types to `kb-config`, so we keep a private clone. - -use std::path::PathBuf; - -/// Expand `{data_dir}` → `data_dir`, `${XDG_DATA_HOME:-…}` → env or -/// default, leading `~` → `$HOME`. Pass an empty `data_dir` when -/// resolving `data_dir` itself (the `{data_dir}` substitution is a -/// no-op in that case). -pub(crate) fn expand_path(raw: &str, data_dir: &str) -> PathBuf { - let mut s = raw.to_string(); - - if !data_dir.is_empty() { - s = s.replace("{data_dir}", data_dir); - } - - // ${XDG_DATA_HOME:-~/.local/share}: env override, else default after `:-`. - if let Some(start) = s.find("${XDG_DATA_HOME") { - if let Some(rel_end) = s[start..].find('}') { - let end = start + rel_end + 1; - let inner = &s[start + 2..end - 1]; - let replacement = match std::env::var("XDG_DATA_HOME") { - Ok(v) if !v.is_empty() => v, - _ => { - if let Some((_, default)) = inner.split_once(":-") { - default.to_string() - } else { - String::new() - } - } - }; - s.replace_range(start..end, &replacement); - } - } - - if let Some(rest) = s.strip_prefix('~') { - if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) { - return home.join(rest.trim_start_matches('/')); - } - } - - PathBuf::from(s) -} +//! `expand_path` lives in `kb-config` so `kb-store-vector`, +//! `kb-store-sqlite`, `kb-embed-local`, and `kb-eval` all resolve +//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. This +//! module re-exports nothing; consumers within the crate `use +//! kb_config::expand_path` directly. /// Build the per-model Lance table name. Per design §6.3: /// `chunk_embeddings__.lance`. Model IDs may contain @@ -104,16 +63,4 @@ mod tests { "chunk_embeddings_BAAI_bge-small-en_384" ); } - - #[test] - fn expand_path_substitutes_data_dir() { - let p = expand_path("{data_dir}/lancedb", "/tmp/kbtest"); - assert_eq!(p, PathBuf::from("/tmp/kbtest/lancedb")); - } - - #[test] - fn expand_path_passthrough_absolute() { - let p = expand_path("/abs/dir", "/ignored"); - assert_eq!(p, PathBuf::from("/abs/dir")); - } } diff --git a/crates/kb-store-vector/src/store.rs b/crates/kb-store-vector/src/store.rs index 124c11c..99e3c6f 100644 --- a/crates/kb-store-vector/src/store.rs +++ b/crates/kb-store-vector/src/store.rs @@ -22,8 +22,10 @@ use serde_json::json; use time::OffsetDateTime; use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; +use kb_config::expand_path; + use crate::arrow_batch::{build_batch, schema_for, schema_params_hash}; -use crate::paths::{expand_path, lance_table_name}; +use crate::paths::lance_table_name; /// Overfetch multiplier: when post-filtering Lance results against /// SQLite-side filters we ask for `2 * k` candidates so a moderately