From d71563192895c36e51183448a9eb0c8850d6e048 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 00:01:41 +0000 Subject: [PATCH] test(eval): normalize elapsed_ms before determinism comparison (flake fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `runner_lexical_is_deterministic_per_query_payload` 가 full-suite 첫 실행에서 간헐적으로 `elapsed_ms: 0` vs `elapsed_ms: 1` 차이로 깨지는 timing flake 가 있었음 (PR #140 회차 0 의 full-suite 실행에서 관찰). 원인: per_query 전체 JSON 을 byte-identical 비교하는데 QueryResult.elapsed_ms 가 timing 기반이라 µs-scale wall-clock jitter 가 그대로 비교에 들어감. 의도는 "timing 외에 byte-identical" — 인접 snapshot test #7 은 projection 으로 timing 을 명시적으로 제외하지만 #6 은 누락. Fix: 비교 직전 양쪽 run 의 elapsed_ms 를 0 으로 normalize. 의도 그대로 표현하고 다른 field 의 결정성 검증은 보존. 50회 반복 stress 통과 (이전: 간헐 실패). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-eval/tests/runner.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crates/kebab-eval/tests/runner.rs b/crates/kebab-eval/tests/runner.rs index 8b8b21e..29b3830 100644 --- a/crates/kebab-eval/tests/runner.rs +++ b/crates/kebab-eval/tests/runner.rs @@ -336,21 +336,29 @@ fn runner_lexical_is_deterministic_per_query_payload() { "- id: q1\n query: ownership\n- id: q2\n query: heading\n", ); - let run_a = run_with_golden(&yaml, || { + let mut run_a = run_with_golden(&yaml, || { run_eval_with_config(&env.config, &lexical_opts()).unwrap() }); - let run_b = run_with_golden(&yaml, || { + let mut run_b = run_with_golden(&yaml, || { run_eval_with_config(&env.config, &lexical_opts()).unwrap() }); // Run-level fields (`run_id`, `created_at`) intentionally diverge; // the per-query payload (which is what the snapshot fixture pins) - // must be byte-identical. + // must be byte-identical EXCEPT for `elapsed_ms`. Timing-sensitive + // fields aren't determinism signals — they're µs-scale wall-clock + // jitter and would otherwise make this assertion a flaky one (a 0 + // vs 1 ms divergence was observed under contended-CI load). Normalize + // before comparing; see test #7 for the same exclusion done via a + // projection. + for qr in run_a.per_query.iter_mut().chain(run_b.per_query.iter_mut()) { + qr.elapsed_ms = 0; + } let a_json = serde_json::to_string(&run_a.per_query).unwrap(); let b_json = serde_json::to_string(&run_b.per_query).unwrap(); assert_eq!( a_json, b_json, - "lexical-only per_query payload must be byte-identical across runs" + "lexical-only per_query payload must be byte-identical across runs (timing normalized)" ); } -- 2.49.1