부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부. kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
272 lines
11 KiB
Rust
272 lines
11 KiB
Rust
//! Per-query eval runner. See [`run_eval`] / [`run_eval_with_config`].
|
|
|
|
use std::fs::File;
|
|
use std::io::{BufWriter, Write};
|
|
use std::path::PathBuf;
|
|
use std::time::Instant;
|
|
|
|
use anyhow::{Context, Result};
|
|
use kebab_app::App;
|
|
use kebab_config::expand_path;
|
|
use kebab_core::{SearchFilters, SearchQuery};
|
|
use kebab_store_sqlite::{EvalRunRow, SqliteStore};
|
|
use time::OffsetDateTime;
|
|
|
|
use crate::loader::{load_golden_set, validate_against_db};
|
|
use crate::metrics::{DEFAULT_GOLDEN_PATH, KEBAB_EVAL_GOLDEN};
|
|
use crate::types::{EvalRun, EvalRunOpts, GoldenQuery, QueryResult};
|
|
|
|
/// Milliseconds elapsed since `start`, saturated at `u32::MAX`.
///
/// Both `QueryResult.elapsed_ms` and `eval_runs.duration_ms` are `u32`.
/// Saturating (instead of wrapping) guarantees that a run which hangs for a
/// very long time is never reported with a misleadingly small duration.
fn elapsed_ms_u32(start: Instant) -> u32 {
    // `as_millis` yields a `u128`; anything that does not fit collapses to the cap.
    u32::try_from(start.elapsed().as_millis()).unwrap_or(u32::MAX)
}
|
|
|
|
/// Run the golden suite end-to-end against the active XDG-loaded
|
|
/// [`kebab_config::Config`]. Wraps [`run_eval_with_config`] with
|
|
/// `Config::load(None)`.
|
|
pub fn run_eval(opts: &EvalRunOpts) -> Result<EvalRun> {
|
|
let cfg = kebab_config::Config::load(None).context("load Config for run_eval")?;
|
|
run_eval_with_config(&cfg, opts)
|
|
}
|
|
|
|
/// Run the golden suite end-to-end against an explicit
|
|
/// [`kebab_config::Config`]. Used by integration tests (TempDir-backed
|
|
/// data_dir) and any future caller that wants to drive the runner
|
|
/// against a non-default config.
|
|
pub fn run_eval_with_config(cfg: &kebab_config::Config, opts: &EvalRunOpts) -> Result<EvalRun> {
|
|
let started = Instant::now();
|
|
|
|
// ── 1. Load golden set ────────────────────────────────────────────────
|
|
//
|
|
// `with_context` already names the path on error, so a separate
|
|
// `tracing::debug!` here would just be noise.
|
|
let golden_path = resolve_golden_path();
|
|
let queries = load_golden_set(&golden_path).with_context(|| {
|
|
format!(
|
|
"load golden set from {} (override via KEBAB_EVAL_GOLDEN)",
|
|
golden_path.display()
|
|
)
|
|
})?;
|
|
validate_against_db(&queries, cfg)?;
|
|
|
|
// ── 2. Mint identifiers + open store ──────────────────────────────────
|
|
let run_id = mint_run_id();
|
|
let created_at = OffsetDateTime::now_utc();
|
|
let commit_hash = std::env::var("KEBAB_COMMIT_HASH")
|
|
.ok()
|
|
.filter(|s| !s.is_empty());
|
|
|
|
// Open the store once so every per-query write reuses the same
|
|
// connection-mutex lifetime.
|
|
let store = SqliteStore::open(cfg).context("open SqliteStore for run_eval")?;
|
|
store
|
|
.run_migrations()
|
|
.context("run migrations for run_eval")?;
|
|
|
|
// ── 3. Build config_snapshot_json ─────────────────────────────────────
|
|
let config_snapshot_json = build_config_snapshot(cfg, opts.k)?;
|
|
let config_snapshot_text =
|
|
serde_json::to_string(&config_snapshot_json).context("serialize config_snapshot_json")?;
|
|
|
|
// ── 4. Per-query execution ────────────────────────────────────────────
|
|
//
|
|
// Open one `App` for the whole suite. The embedder / vector store /
|
|
// LLM are memoized on the App, so a 50-query run pays the ~470 MB
|
|
// ONNX init + Lance reopen + Ollama handshake exactly once.
|
|
let app = App::open_with_config(cfg.clone()).context("open App for run_eval")?;
|
|
|
|
let mut per_query: Vec<QueryResult> = Vec::with_capacity(queries.len());
|
|
for gq in &queries {
|
|
let qr = execute_query(&app, gq, opts);
|
|
per_query.push(qr);
|
|
}
|
|
|
|
// ── 5. Persist eval_runs + eval_query_results ────────────────────────
|
|
// Serialize per-query JSON up front so the SQLite transaction below
|
|
// never holds the connection mutex through serde failures.
|
|
let mut results: Vec<(String, String)> = Vec::with_capacity(per_query.len());
|
|
for qr in &per_query {
|
|
let json = serde_json::to_string(qr)
|
|
.with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
|
|
results.push((qr.query_id.clone(), json));
|
|
}
|
|
let row = EvalRunRow {
|
|
run_id: &run_id,
|
|
suite: opts.suite.as_str(),
|
|
config_snapshot_json: &config_snapshot_text,
|
|
aggregate_json: "{}",
|
|
commit_hash: commit_hash.as_deref(),
|
|
created_at,
|
|
};
|
|
store
|
|
.record_eval_run_with_results(&row, &results)
|
|
.context("record eval_runs + eval_query_results (transactional)")?;
|
|
|
|
// ── 6. Mirror to runs_dir/<run_id>/per_query.jsonl ────────────────────
|
|
write_per_query_jsonl(cfg, &run_id, &per_query)?;
|
|
|
|
let duration_ms = elapsed_ms_u32(started);
|
|
tracing::info!(
|
|
target: "kebab-eval",
|
|
run_id = %run_id,
|
|
suite = %opts.suite,
|
|
queries = per_query.len(),
|
|
duration_ms,
|
|
"kb-eval: run complete"
|
|
);
|
|
|
|
Ok(EvalRun {
|
|
run_id,
|
|
created_at,
|
|
commit_hash,
|
|
config_snapshot_json,
|
|
per_query,
|
|
})
|
|
}
|
|
|
|
/// Mint a `run_<lower>` identifier. UUIDv7 stands in for ULID — same
|
|
/// timestamp-ordered monotonicity, already in workspace deps. Lower-
|
|
/// case simple form to match the `ulid_lower()` shape the spec asks
|
|
/// for.
|
|
fn mint_run_id() -> String {
|
|
let id = uuid::Uuid::now_v7().simple().to_string();
|
|
format!("run_{id}")
|
|
}
|
|
|
|
/// Resolve the golden YAML path. Honors the `KEBAB_EVAL_GOLDEN` env
|
|
/// override; otherwise relative to CWD. The path is NOT expanded for
|
|
/// `~` / `${...}` placeholders — direct file paths only.
|
|
fn resolve_golden_path() -> PathBuf {
|
|
match std::env::var(KEBAB_EVAL_GOLDEN) {
|
|
Ok(s) if !s.is_empty() => PathBuf::from(s),
|
|
_ => PathBuf::from(DEFAULT_GOLDEN_PATH),
|
|
}
|
|
}
|
|
|
|
/// Run one [`GoldenQuery`] through the kb-app facade. Errors are
|
|
/// captured into `QueryResult.error` so the run continues.
|
|
fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult {
|
|
let started = Instant::now();
|
|
|
|
let search_query = SearchQuery {
|
|
text: gq.query.clone(),
|
|
mode: opts.mode,
|
|
k: opts.k,
|
|
filters: SearchFilters::default(),
|
|
};
|
|
|
|
let (hits_top_k, mut error) = match app.search(search_query) {
|
|
Ok(hits) => (hits, None),
|
|
Err(e) => (Vec::new(), Some(format!("{e:#}"))),
|
|
};
|
|
|
|
// Optional RAG path: only attempted when `with_rag` and the search
|
|
// call did not already error out (we want one error per query, not
|
|
// a duplicated one).
|
|
let answer = if opts.with_rag && error.is_none() {
|
|
let ask_opts = kebab_app::AskOpts {
|
|
k: opts.k,
|
|
explain: true,
|
|
mode: opts.mode,
|
|
temperature: opts.temperature,
|
|
seed: opts.seed,
|
|
stream_sink: None,
|
|
// p9-fb-15: golden eval is single-shot per query; no
|
|
// conversational history.
|
|
history: Vec::new(),
|
|
conversation_id: None,
|
|
turn_index: None,
|
|
// p9-fb-41: golden eval baseline runs are single-pass; the
|
|
// multi-hop path is opted into per query via a future
|
|
// fixture flag (PR-4+) once the runner learns to dispatch.
|
|
multi_hop: false,
|
|
};
|
|
match app.ask(&gq.query, ask_opts) {
|
|
Ok(ans) => Some(ans),
|
|
Err(e) => {
|
|
error = Some(format!("{e:#}"));
|
|
None
|
|
}
|
|
}
|
|
} else {
|
|
None
|
|
};
|
|
|
|
QueryResult {
|
|
query_id: gq.id.clone(),
|
|
query: gq.query.clone(),
|
|
mode: opts.mode,
|
|
hits_top_k,
|
|
answer,
|
|
elapsed_ms: elapsed_ms_u32(started),
|
|
error,
|
|
}
|
|
}
|
|
|
|
/// Build the `config_snapshot_json` value: full Config as `config` plus
|
|
/// the auxiliary version fields the spec calls out.
|
|
///
|
|
/// `index_version` is intentionally `None` here — it is composed
|
|
/// dynamically by `kb-app` on a per-call basis from the configured
|
|
/// embedder (e.g., `vec:<model>@<version>:<dim>`), so it is not a
|
|
/// stable run-time property of the config alone. P5-2 may compose it
|
|
/// from `embedding.{model,version,dimensions}` if it needs the field
|
|
/// for compare reports.
|
|
fn build_config_snapshot(cfg: &kebab_config::Config, eval_k: usize) -> Result<serde_json::Value> {
|
|
let cfg_value = serde_json::to_value(cfg).context("serialize Config")?;
|
|
Ok(serde_json::json!({
|
|
"config": cfg_value,
|
|
"eval_k": eval_k,
|
|
"chunker_version": cfg.ingest.chunking.chunker_version,
|
|
"embedding": {
|
|
"model": cfg.models.embedding.model,
|
|
"version": cfg.models.embedding.version,
|
|
"dimensions": cfg.models.embedding.dimensions,
|
|
"provider": cfg.models.embedding.provider,
|
|
},
|
|
"llm": {
|
|
"model_id": cfg.models.llm.model,
|
|
"provider": cfg.models.llm.provider,
|
|
},
|
|
"prompt_template_version": cfg.rag.prompt_template_version,
|
|
"score_gate": cfg.rag.score_gate,
|
|
"rrf_k": cfg.search.rrf_k,
|
|
"index_version": serde_json::Value::Null,
|
|
}))
|
|
}
|
|
|
|
/// Write the `runs_dir/<run_id>/per_query.jsonl` mirror (design §6.3).
|
|
/// Each `QueryResult` is one line, separator `\n`. The directory is
|
|
/// created if it doesn't exist; an existing file is overwritten (a
|
|
/// `run_id` collision would already have failed the `eval_runs`
|
|
/// PRIMARY KEY upstream).
|
|
fn write_per_query_jsonl(
|
|
cfg: &kebab_config::Config,
|
|
run_id: &str,
|
|
per_query: &[QueryResult],
|
|
) -> Result<()> {
|
|
// `data_dir` may itself contain `${XDG_DATA_HOME:-…}` / `~` (the
|
|
// workspace-default does); resolve it before threading it into the
|
|
// `{data_dir}` substitution of `runs_dir`.
|
|
let resolved_data_dir = expand_path(&cfg.storage.data_dir, "");
|
|
let runs_dir = expand_path(&cfg.storage.runs_dir, &resolved_data_dir.to_string_lossy());
|
|
let run_dir = runs_dir.join(run_id);
|
|
std::fs::create_dir_all(&run_dir)
|
|
.with_context(|| format!("create run dir {}", run_dir.display()))?;
|
|
let path = run_dir.join("per_query.jsonl");
|
|
let file = File::create(&path)
|
|
.with_context(|| format!("create per_query.jsonl at {}", path.display()))?;
|
|
let mut w = BufWriter::new(file);
|
|
for qr in per_query {
|
|
serde_json::to_writer(&mut w, qr)
|
|
.with_context(|| format!("serialize QueryResult for {}", qr.query_id))?;
|
|
w.write_all(b"\n")
|
|
.context("write newline separator in per_query.jsonl")?;
|
|
}
|
|
w.flush().context("flush per_query.jsonl")?;
|
|
Ok(())
|
|
}
|