From d93b757cf1429d8477c279beede7f24a5cca7337 Mon Sep 17 00:00:00 2001 From: altair823 Date: Fri, 29 May 2026 03:42:40 +0000 Subject: [PATCH] fix(cli): thread --config through kebab eval run/aggregate/compare (facade-rule) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cmd::Eval now loads Config via cli.config (same pattern as all other subcommands) before dispatching to the inner match. The Run and Aggregate arms now call the *_with_config variants: run_eval(&opts) → run_eval_with_config(&cfg, &opts) compute_aggregate(run_id) → compute_aggregate_with_config(&cfg, run_id) store_aggregate(run_id, ..) → store_aggregate_with_config(&cfg, run_id, ..) Compare already called compare_runs_with_config but sourced cfg from Config::load(None), which ignored --config — that per-arm load is removed; cfg comes from the shared binding above. Fixes the same facade-rule regression pattern as P3-5 / P4-3: previously `kebab --config /build/dogfood/config.toml eval run` silently evaluated the XDG-default (empty) KB instead of the dogfood KB. Also fixes the runner.rs test that still hardcoded rag-v2 after commit 5719969 bumped the default prompt_template_version to rag-v3. 
Co-Authored-By: Claude Sonnet 4.6 --- crates/kebab-cli/src/main.rs | 171 +++++++++++++++--------------- crates/kebab-eval/tests/runner.rs | 2 +- 2 files changed, 88 insertions(+), 85 deletions(-) diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 7e9d5a0..1d802d9 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -1337,93 +1337,96 @@ fn run(cli: &Cli) -> anyhow::Result<()> { app.run() } - Cmd::Eval { what } => match what { - EvalWhat::Run { - suite, - mode, - k, - with_rag, - temperature, - seed, - } => { - let opts = kebab_eval::EvalRunOpts { - suite: suite.clone(), - mode: (*mode).into(), - with_rag: *with_rag, - k: *k, - temperature: *temperature, - seed: *seed, - }; - let run = kebab_eval::run_eval(&opts)?; - if cli.json { - println!("{}", serde_json::to_string_pretty(&run)?); - } else { - println!("run_id: {}", run.run_id); - println!("queries: {}", run.per_query.len()); - let failed = run.per_query.iter().filter(|q| q.error.is_some()).count(); - println!("failed: {failed}"); - } - Ok(()) - } - - EvalWhat::Aggregate { run_id } => { - let agg = kebab_eval::compute_aggregate(run_id)?; - kebab_eval::store_aggregate(run_id, &agg)?; - if cli.json { - println!("{}", serde_json::to_string_pretty(&agg)?); - } else { - println!("run_id: {run_id}"); - println!( - "queries: {} ({} failed)", - agg.total_queries, agg.failed_queries - ); - println!( - "hit@1: {:.4}", - agg.hit_at_k.get(&1).copied().unwrap_or(0.0) - ); - println!( - "hit@5: {:.4}", - agg.hit_at_k.get(&5).copied().unwrap_or(0.0) - ); - println!("MRR: {:.4}", agg.mrr); - } - Ok(()) - } - - EvalWhat::Compare { - run_a, - run_b, - strict_chunker_version, - write_report, - } => { - let cfg = kebab_config::Config::load(None)?; - let opts = kebab_eval::CompareOpts { - strict_chunker_version: *strict_chunker_version, - }; - let report = kebab_eval::compare_runs_with_config(&cfg, run_a, run_b, &opts)?; - let md = kebab_eval::render_report_md(&report); - if 
cli.json { - println!("{}", serde_json::to_string_pretty(&report)?); - } else { - print!("{md}"); - } - if *write_report { - let resolved_data_dir = kebab_config::expand_path(&cfg.storage.data_dir, ""); - let runs_dir = kebab_config::expand_path( - &cfg.storage.runs_dir, - &resolved_data_dir.to_string_lossy(), - ); - let dir = runs_dir.join(run_b); - std::fs::create_dir_all(&dir)?; - let path = dir.join("report.md"); - std::fs::write(&path, &md)?; - if !cli.json { - eprintln!("wrote {}", path.display()); + Cmd::Eval { what } => { + let cfg = kebab_config::Config::load(cli.config.as_deref())?; + match what { + EvalWhat::Run { + suite, + mode, + k, + with_rag, + temperature, + seed, + } => { + let opts = kebab_eval::EvalRunOpts { + suite: suite.clone(), + mode: (*mode).into(), + with_rag: *with_rag, + k: *k, + temperature: *temperature, + seed: *seed, + }; + let run = kebab_eval::run_eval_with_config(&cfg, &opts)?; + if cli.json { + println!("{}", serde_json::to_string_pretty(&run)?); + } else { + println!("run_id: {}", run.run_id); + println!("queries: {}", run.per_query.len()); + let failed = run.per_query.iter().filter(|q| q.error.is_some()).count(); + println!("failed: {failed}"); } + Ok(()) + } + + EvalWhat::Aggregate { run_id } => { + let agg = kebab_eval::compute_aggregate_with_config(&cfg, run_id)?; + kebab_eval::store_aggregate_with_config(&cfg, run_id, &agg)?; + if cli.json { + println!("{}", serde_json::to_string_pretty(&agg)?); + } else { + println!("run_id: {run_id}"); + println!( + "queries: {} ({} failed)", + agg.total_queries, agg.failed_queries + ); + println!( + "hit@1: {:.4}", + agg.hit_at_k.get(&1).copied().unwrap_or(0.0) + ); + println!( + "hit@5: {:.4}", + agg.hit_at_k.get(&5).copied().unwrap_or(0.0) + ); + println!("MRR: {:.4}", agg.mrr); + } + Ok(()) + } + + EvalWhat::Compare { + run_a, + run_b, + strict_chunker_version, + write_report, + } => { + let opts = kebab_eval::CompareOpts { + strict_chunker_version: *strict_chunker_version, + }; + 
let report = kebab_eval::compare_runs_with_config(&cfg, run_a, run_b, &opts)?; + let md = kebab_eval::render_report_md(&report); + if cli.json { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + print!("{md}"); + } + if *write_report { + let resolved_data_dir = + kebab_config::expand_path(&cfg.storage.data_dir, ""); + let runs_dir = kebab_config::expand_path( + &cfg.storage.runs_dir, + &resolved_data_dir.to_string_lossy(), + ); + let dir = runs_dir.join(run_b); + std::fs::create_dir_all(&dir)?; + let path = dir.join("report.md"); + std::fs::write(&path, &md)?; + if !cli.json { + eprintln!("wrote {}", path.display()); + } + } + Ok(()) } - Ok(()) } - }, + } Cmd::IngestFile { path } => { let cfg = kebab_config::Config::load(cli.config.as_deref())?; diff --git a/crates/kebab-eval/tests/runner.rs b/crates/kebab-eval/tests/runner.rs index 637b8ce6..406e71f 100644 --- a/crates/kebab-eval/tests/runner.rs +++ b/crates/kebab-eval/tests/runner.rs @@ -215,7 +215,7 @@ fn runner_records_config_snapshot_with_versions() { assert!(snap.pointer("/llm/model_id").is_some()); assert_eq!( snap.pointer("/prompt_template_version"), - Some(&serde_json::Value::String("rag-v2".to_string())), + Some(&serde_json::Value::String("rag-v3".to_string())), ); assert!(snap.pointer("/score_gate").is_some()); assert!(snap.pointer("/rrf_k").is_some());