Files
kebab/crates/kebab-eval/tests/runner.rs
2026-05-10 19:15:15 +09:00

420 lines
15 KiB
Rust

//! Runner integration tests for `kb-eval` (P5-1).
//!
//! Drives [`kebab_eval::run_eval_with_config`] end-to-end against a
//! TempDir-backed config:
//!
//! - tiny seeded SQLite corpus (3 docs / 3 chunks) used as the
//! workspace's source-of-truth,
//! - lexical-only retrieval (`SearchMode::Lexical`) so no embedder is
//! required (`models.embedding.provider = "none"`),
//! - golden YAML pointed at via `KEBAB_EVAL_GOLDEN`.
//!
//! Determinism: lexical-only with a fixed seed corpus produces
//! byte-identical `per_query.jsonl` content (modulo `run_id` /
//! `created_at`, which we strip when comparing).
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use kebab_config::Config;
use kebab_core::SearchMode;
use kebab_eval::{EvalRunOpts, QueryResult, run_eval_with_config};
use kebab_store_sqlite::SqliteStore;
use rusqlite::params;
use tempfile::TempDir;
/// `KEBAB_EVAL_GOLDEN` is process-global state. Tests touching it must
/// serialize so they don't trample each other when `cargo test`
/// runs them in parallel.
static GOLDEN_ENV_LOCK: Mutex<()> = Mutex::new(());
// ── shared scaffolding ───────────────────────────────────────────────────────
struct RunEnv {
temp: TempDir,
config: Config,
}
impl RunEnv {
fn new() -> Self {
let temp = tempfile::tempdir().unwrap();
let mut config = Config::defaults();
config.storage.data_dir = temp.path().to_string_lossy().into_owned();
// Force lexical-only behavior so the runner never tries to
// load fastembed during integration tests.
config.models.embedding.provider = "none".to_string();
config.models.embedding.dimensions = 0;
// Pin search defaults so test asserts are stable.
config.search.default_k = 5;
let store = SqliteStore::open(&config).unwrap();
store.run_migrations().unwrap();
seed_corpus(&store);
Self { temp, config }
}
fn data_dir(&self) -> PathBuf {
self.temp.path().to_path_buf()
}
}
/// Seed three (asset, document, chunk) triples with text the test
/// queries can match against the FTS5 lexical index.
fn seed_corpus(store: &SqliteStore) {
let conn = store.read_conn();
for (i, text) in [
"Rust ownership and borrow checker basics.",
"Cargo workspace members are listed in workspace.members.",
"Markdown chunking respects heading boundaries.",
]
.iter()
.enumerate()
{
let doc_id = format!("doc{i:032}");
let chunk_id = format!("chunk{i:030}");
let asset_id = format!("asset{i:030}");
let path = format!("notes/{i}.md");
conn.execute(
"INSERT INTO assets (
asset_id, source_uri, workspace_path, media_type, byte_len,
checksum, storage_kind, storage_path, discovered_at
) VALUES (?, ?, ?, '\"markdown\"', 0,
'deadbeefdeadbeefdeadbeefdeadbeef',
'reference', ?, '1970-01-01T00:00:00Z')",
params![asset_id, format!("file:///{path}"), path, path],
)
.unwrap();
conn.execute(
"INSERT INTO documents (
doc_id, asset_id, workspace_path, title, lang, source_type,
trust_level, parser_version, doc_version, schema_version,
metadata_json, provenance_json, created_at, updated_at
) VALUES (?, ?, ?, NULL, 'en', 'markdown', 'primary', 'v1', 1, 1,
'{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')",
params![doc_id, asset_id, path],
)
.unwrap();
conn.execute(
"INSERT INTO chunks (
chunk_id, doc_id, text, heading_path_json, section_label,
source_spans_json, token_estimate, chunker_version,
policy_hash, block_ids_json, created_at
) VALUES (?, ?, ?, '[]', NULL,
'[{\"kind\":\"line\",\"start\":1,\"end\":3}]',
1, 'md-heading-v1', 'h', '[]', '1970-01-01T00:00:00Z')",
params![chunk_id, doc_id, text],
)
.unwrap();
}
// Build the FTS index so lexical search returns hits. Reuses the
// same connection guard rather than reopening — the SAVEPOINT
// protocol nests correctly under the existing read_conn lock.
kebab_store_sqlite::rebuild_chunks_fts(&conn).unwrap();
drop(conn);
}
fn write_golden(dir: &Path, body: &str) -> PathBuf {
let path = dir.join("golden.yaml");
fs::write(&path, body).unwrap();
path
}
/// Bind a fresh ephemeral port, then release it. The returned URL
/// points at a port that was just freed; very likely still unbound
/// when the test issues its outbound connection a moment later, in
/// which case `connect()` fails fast with `ECONNREFUSED`. Beats
/// hard-coding port 1 which can timeout slowly on hardened hosts.
fn unreachable_endpoint() -> String {
let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
let port = listener.local_addr().unwrap().port();
drop(listener);
format!("http://127.0.0.1:{port}")
}
fn lexical_opts() -> EvalRunOpts {
EvalRunOpts {
suite: "test".to_string(),
mode: SearchMode::Lexical,
with_rag: false,
k: 5,
temperature: Some(0.0),
seed: Some(0),
}
}
/// Run the eval after pointing `KEBAB_EVAL_GOLDEN` at `yaml`. The env
/// guard must outlive the call so concurrent tests don't reset the
/// var mid-run.
fn run_with_golden<F: FnOnce() -> R, R>(yaml: &Path, f: F) -> R {
let _g = GOLDEN_ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
// SAFETY: `KEBAB_EVAL_GOLDEN` is a benign env var; the GOLDEN_ENV_LOCK
// serializes mutations so concurrent tests don't race.
unsafe {
std::env::set_var("KEBAB_EVAL_GOLDEN", yaml);
}
let out = f();
unsafe {
std::env::remove_var("KEBAB_EVAL_GOLDEN");
}
out
}
// ── 1. elapsed_ms recorded for every query ──────────────────────────────────
#[test]
fn runner_records_elapsed_for_every_query() {
let env = RunEnv::new();
let yaml = write_golden(
env.data_dir().as_path(),
"- id: q1\n query: ownership\n- id: q2\n query: heading\n- id: q3\n query: workspace\n",
);
let run = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
assert_eq!(run.per_query.len(), 3);
for qr in &run.per_query {
assert_eq!(qr.mode, SearchMode::Lexical);
// `elapsed_ms` is `u32`; the assertion that it's a valid
// unsigned value is implicit. We additionally bound it well
// below the 4G ceiling to detect a stuck/overflow path.
assert!(
qr.elapsed_ms < 60_000,
"elapsed_ms suspicious: {}",
qr.elapsed_ms
);
}
// The id-list round-trips into the per-query records.
let ids: Vec<&str> = run.per_query.iter().map(|q| q.query_id.as_str()).collect();
assert_eq!(ids, vec!["q1", "q2", "q3"]);
}
// ── 2. config snapshot carries the documented version fields ────────────────
#[test]
fn runner_records_config_snapshot_with_versions() {
let env = RunEnv::new();
let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n query: ownership\n");
let run = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
let snap = &run.config_snapshot_json;
assert!(snap.get("config").is_some(), "config field missing");
assert_eq!(
snap.pointer("/chunker_version"),
Some(&serde_json::Value::String("md-heading-v1".to_string())),
);
assert!(snap.pointer("/embedding/model").is_some());
assert!(snap.pointer("/embedding/dimensions").is_some());
assert!(snap.pointer("/llm/model_id").is_some());
assert_eq!(
snap.pointer("/prompt_template_version"),
Some(&serde_json::Value::String("rag-v2".to_string())),
);
assert!(snap.pointer("/score_gate").is_some());
assert!(snap.pointer("/rrf_k").is_some());
}
// ── 3. failing query (ask path with no Ollama) records an error ─────────────
#[test]
fn runner_captures_per_query_error_when_rag_unreachable() {
let env = RunEnv::new();
// Point Ollama at an unbound port so `ask_with_config` surfaces a
// connection error per query. We use bind-then-release rather than
// a hard-coded `:1` because port 1 is reserved-but-not-guaranteed-
// unbound (some hardened systems answer with ICMP unreachable
// instantly, others timeout slowly). TOCTOU race is theoretically
// possible but rare in practice and faster-failing than `:1`.
let mut config = env.config.clone();
config.models.llm.endpoint = unreachable_endpoint();
let yaml = write_golden(env.data_dir().as_path(), "- id: q1\n query: ownership\n");
let opts = EvalRunOpts {
with_rag: true,
..lexical_opts()
};
let run = run_with_golden(&yaml, || run_eval_with_config(&config, &opts).unwrap());
let qr = &run.per_query[0];
// hits_top_k still populated by lexical search before the RAG attempt.
assert!(
!qr.hits_top_k.is_empty(),
"lexical hits should populate before RAG attempt"
);
assert!(qr.answer.is_none(), "no answer when RAG fails");
assert!(qr.error.is_some(), "error must be recorded");
}
// ── 4. eval_runs + eval_query_results rows persisted ────────────────────────
#[test]
fn runner_persists_eval_run_and_query_result_rows() {
let env = RunEnv::new();
let yaml = write_golden(
env.data_dir().as_path(),
"- id: q1\n query: ownership\n- id: q2\n query: heading\n",
);
let run = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
// Reopen the same SQLite file with a new store handle and read
// the rows back. We use the inherent `read_conn` helper rather
// than rusqlite directly because the latter would require kb-eval
// to add a runtime rusqlite dep (forbidden by the spec).
let store = SqliteStore::open(&env.config).unwrap();
let conn = store.read_conn();
let n_runs: i64 = conn
.query_row(
"SELECT COUNT(*) FROM eval_runs WHERE run_id = ?",
params![run.run_id],
|r| r.get(0),
)
.unwrap();
assert_eq!(n_runs, 1);
let n_results: i64 = conn
.query_row(
"SELECT COUNT(*) FROM eval_query_results WHERE run_id = ?",
params![run.run_id],
|r| r.get(0),
)
.unwrap();
assert_eq!(n_results, 2);
}
// ── 5. per_query.jsonl mirror exists and round-trips ────────────────────────
#[test]
fn runner_writes_per_query_jsonl_mirror() {
let env = RunEnv::new();
let yaml = write_golden(
env.data_dir().as_path(),
"- id: q1\n query: ownership\n- id: q2\n query: heading\n",
);
let run = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
let mirror = env
.data_dir()
.join("runs")
.join(&run.run_id)
.join("per_query.jsonl");
assert!(
mirror.exists(),
"per_query.jsonl missing at {}",
mirror.display()
);
let body = fs::read_to_string(&mirror).unwrap();
let lines: Vec<&str> = body.lines().collect();
assert_eq!(lines.len(), 2);
let parsed: Vec<QueryResult> = lines
.iter()
.map(|l| serde_json::from_str::<QueryResult>(l).expect("valid JSONL line"))
.collect();
assert_eq!(parsed[0].query_id, "q1");
assert_eq!(parsed[1].query_id, "q2");
}
// ── 6. determinism — repeating the run produces byte-identical per_query JSON ─
#[test]
fn runner_lexical_is_deterministic_per_query_payload() {
let env = RunEnv::new();
let yaml = write_golden(
env.data_dir().as_path(),
"- id: q1\n query: ownership\n- id: q2\n query: heading\n",
);
let run_a = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
let run_b = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
// Run-level fields (`run_id`, `created_at`) intentionally diverge;
// the per-query payload (which is what the snapshot fixture pins)
// must be byte-identical.
let a_json = serde_json::to_string(&run_a.per_query).unwrap();
let b_json = serde_json::to_string(&run_b.per_query).unwrap();
assert_eq!(
a_json, b_json,
"lexical-only per_query payload must be byte-identical across runs"
);
}
// ── 7. snapshot — per_query JSON pinned to fixtures/eval/run-1.json ─────────
#[test]
fn runner_per_query_snapshot_matches_fixture() {
let env = RunEnv::new();
let yaml = write_golden(
env.data_dir().as_path(),
"- id: q1\n query: ownership\n- id: q2\n query: heading\n",
);
let run = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
// Fixture pins the *shape* of the per-query payload, including the
// first hit's stable scalar fields (chunk_id, doc_id, heading_path,
// fusion_score). FTS scores depend on the SQLite version, so the
// fusion_score is captured into the fixture from one passing run
// and must remain stable across re-runs against the same seeded
// corpus. Timing-sensitive fields (`elapsed_ms`, raw `Instant`
// byproducts) are excluded. Verifying byte stability is the
// determinism test (#6); this test verifies the field set +
// ordering is stable.
let projection: Vec<_> = run
.per_query
.iter()
.map(|qr| {
let first_hit = qr.hits_top_k.first().map(|h| {
serde_json::json!({
"chunk_id": h.chunk_id,
"doc_id": h.doc_id,
"heading_path": h.heading_path,
"score": h.retrieval.fusion_score,
})
});
serde_json::json!({
"query_id": qr.query_id,
"query": qr.query,
"mode": qr.mode,
"hits_count": qr.hits_top_k.len(),
"first_hit": first_hit,
"has_answer": qr.answer.is_some(),
"error": qr.error,
})
})
.collect();
let actual = serde_json::to_string_pretty(&projection).unwrap();
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/eval/run-1.json");
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
fs::create_dir_all(fixture_path.parent().unwrap()).unwrap();
fs::write(&fixture_path, &actual).unwrap();
}
let expected = fs::read_to_string(&fixture_path)
.unwrap_or_else(|e| panic!("read snapshot {}: {e}", fixture_path.display()));
assert_eq!(
actual.trim(),
expected.trim(),
"snapshot drift — re-run with UPDATE_SNAPSHOTS=1 to refresh"
);
}