diff --git a/crates/kebab-eval/tests/runner.rs b/crates/kebab-eval/tests/runner.rs index 8b8b21e..29b3830 100644 --- a/crates/kebab-eval/tests/runner.rs +++ b/crates/kebab-eval/tests/runner.rs @@ -336,21 +336,29 @@ fn runner_lexical_is_deterministic_per_query_payload() { "- id: q1\n query: ownership\n- id: q2\n query: heading\n", ); - let run_a = run_with_golden(&yaml, || { + let mut run_a = run_with_golden(&yaml, || { run_eval_with_config(&env.config, &lexical_opts()).unwrap() }); - let run_b = run_with_golden(&yaml, || { + let mut run_b = run_with_golden(&yaml, || { run_eval_with_config(&env.config, &lexical_opts()).unwrap() }); // Run-level fields (`run_id`, `created_at`) intentionally diverge; // the per-query payload (which is what the snapshot fixture pins) - // must be byte-identical. + // must be byte-identical EXCEPT for `elapsed_ms`. Timing-sensitive + // fields aren't determinism signals — they're µs-scale wall-clock + // jitter and would otherwise make this assertion a flaky one (a 0 + // vs 1 ms divergence was observed under contended-CI load). Normalize + // before comparing; see test #7 for the same exclusion done via a + // projection. + for qr in run_a.per_query.iter_mut().chain(run_b.per_query.iter_mut()) { + qr.elapsed_ms = 0; + } let a_json = serde_json::to_string(&run_a.per_query).unwrap(); let b_json = serde_json::to_string(&run_b.per_query).unwrap(); assert_eq!( a_json, b_json, - "lexical-only per_query payload must be byte-identical across runs" + "lexical-only per_query payload must be byte-identical across runs (timing normalized)" ); }