test(eval): normalize elapsed_ms before determinism comparison (flake fix) #141

Merged
altair823 merged 1 commits from fix/eval-runner-timing-flake into main 2026-05-20 00:08:42 +00:00

View File

@@ -336,21 +336,29 @@ fn runner_lexical_is_deterministic_per_query_payload() {
"- id: q1\n query: ownership\n- id: q2\n query: heading\n",
);
let run_a = run_with_golden(&yaml, || {
let mut run_a = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
let run_b = run_with_golden(&yaml, || {
let mut run_b = run_with_golden(&yaml, || {
run_eval_with_config(&env.config, &lexical_opts()).unwrap()
});
// Run-level fields (`run_id`, `created_at`) intentionally diverge;
// the per-query payload (which is what the snapshot fixture pins)
// must be byte-identical.
// must be byte-identical EXCEPT for `elapsed_ms`. Timing-sensitive
// fields aren't determinism signals — they're µs-scale wall-clock
// jitter and would otherwise make this assertion a flaky one (a 0
// vs 1 ms divergence was observed under contended-CI load). Normalize
// before comparing; see test #7 for the same exclusion done via a
// projection.
for qr in run_a.per_query.iter_mut().chain(run_b.per_query.iter_mut()) {
qr.elapsed_ms = 0;
}
let a_json = serde_json::to_string(&run_a.per_query).unwrap();
let b_json = serde_json::to_string(&run_b.per_query).unwrap();
assert_eq!(
a_json, b_json,
"lexical-only per_query payload must be byte-identical across runs"
"lexical-only per_query payload must be byte-identical across runs (timing normalized)"
);
}