Files
kebab/crates/kebab-nli/tests/inference.rs
altair823 685007789a style: cargo fmt --all (round 4 ingest log feature follow-up)
Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이
test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all
적용. 모든 import alphabetical reorder + line wrapping 정합.

추가 untracked artifact 동시 commit:
- docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT)
- docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT)

workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 04:18:40 +00:00

201 lines
8.0 KiB
Rust

//! Integration tests for `OnnxNliVerifier` against the real
//! mDeBERTa-v3 XNLI model. Every test is `#[ignore]` — plain
//! `cargo test -p kebab-nli` skips them; run explicitly with
//! `cargo test -p kebab-nli --test inference -- --ignored` to
//! exercise the (slow + network-bound on first run) inference path.
//!
//! Whichever test runs first (the test harness does not follow file
//! order) triggers the ~280 MB ONNX + ~16 MB tokenizer download into
//! `config.storage.model_dir/nli/...`; subsequent tests hit the
//! OnceLock cache for free.
use kebab_config::Config;
use kebab_nli::{NliVerifier, OnnxNliVerifier};
/// Test 1: scoring an English sentence against an identical copy of
/// itself must yield an entailment score above 0.8.
///
/// The three channel scores are echoed to stderr so they can be pasted
/// into the PR description's `## 검증` section as smoke evidence.
#[test]
#[ignore]
fn en_self_entailment_high_score() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    // Premise and hypothesis are deliberately the same sentence.
    let (premise, hypothesis) = ("Caffeine is a stimulant.", "Caffeine is a stimulant.");
    let s = verifier
        .score(premise, hypothesis)
        .expect("score should succeed");

    let (entailment, neutral, contradiction) = (s.entailment, s.neutral, s.contradiction);
    eprintln!(
        "[test1 en_self_entailment_high_score] premise={premise:?} hypothesis={hypothesis:?} \
         scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
        entailment, neutral, contradiction
    );

    assert!(
        entailment > 0.8,
        "expected entailment > 0.8, got {:.4} (full scores: {:?})",
        entailment,
        s
    );
}
/// Test 2: a true-but-unrelated chemistry fact must not be entailed by
/// the premise. The check is that the neutral channel strictly beats
/// both the entailment and the contradiction channels.
#[test]
#[ignore]
fn en_unrelated_low_entailment() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    let premise = "Caffeine is a stimulant.";
    let hypothesis = "The chemical formula of caffeine is C8H10N4O2.";
    let s = verifier
        .score(premise, hypothesis)
        .expect("score should succeed");

    let (entailment, neutral, contradiction) = (s.entailment, s.neutral, s.contradiction);
    eprintln!(
        "[test2 en_unrelated_low_entailment] \
         scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
        entailment, neutral, contradiction
    );

    // spec §3 PR-9b: the *spirit* of "entailment is low — neutral /
    // contradiction is the winning channel" is that *neutral is the max*.
    // Per the original author's measurement, mDeBERTa noise gives roughly
    // entailment≈0.42, neutral≈0.53, contradiction≈0.05: both sentences are
    // *facts* about caffeine, so entailment does not fall below 0.3, yet
    // neutral still wins. That is natural behavior for multilingual NLI, so
    // the assertion is "neutral wins" rather than a fixed entailment cap.
    let neutral_wins = neutral > entailment && neutral > contradiction;
    assert!(
        neutral_wins,
        "expected neutral to win (no entailment, no contradiction), got {s:?}"
    );
}
/// Test 3: entailment on a Korean premise/hypothesis pair.
///
/// The bar is only `> 0.5` — looser than the English self-entailment
/// test — because cross-lingual XNLI is noisier than English-only.
#[test]
#[ignore]
fn ko_entailment_high_score() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    // "Apples are red." should entail "Apples have a color."
    let (premise, hypothesis) = ("사과는 빨갛다.", "사과는 색이 있다.");
    let s = verifier
        .score(premise, hypothesis)
        .expect("score should succeed");

    let (entailment, neutral, contradiction) = (s.entailment, s.neutral, s.contradiction);
    eprintln!(
        "[test3 ko_entailment_high_score] \
         scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
        entailment, neutral, contradiction
    );

    assert!(
        entailment > 0.5,
        "expected entailment > 0.5, got {:.4} (full scores: {:?})",
        entailment,
        s
    );
}
/// Test 4: a 24 000-character premise must score successfully rather
/// than panic or error.
///
/// Per the original note, mDeBERTa-v3 is trained at 512 tokens and the
/// `OnlyFirst` truncation strategy trims the premise side so the input
/// stays under the positional-embedding cap. Beyond "no panic", the test
/// checks that every channel is finite and that the three channels sum
/// to approximately 1 (softmax invariant).
#[test]
#[ignore]
fn long_premise_truncates_without_panic() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    // 12-byte unit repeated 2000 times = 24 000 chars.
    let premise = "foo bar baz ".repeat(2000);
    let hypothesis = "foo";
    let s = verifier
        .score(&premise, hypothesis)
        .expect("score should succeed on long premise");

    let (entailment, neutral, contradiction) = (s.entailment, s.neutral, s.contradiction);
    eprintln!(
        "[test4 long_premise_truncates_without_panic] premise_len={} \
         scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
        premise.len(),
        entailment,
        neutral,
        contradiction
    );

    // Every channel must be a real number: no NaN, no +/- infinity.
    let channels = [
        ("entailment", entailment),
        ("neutral", neutral),
        ("contradiction", contradiction),
    ];
    for (name, x) in channels {
        assert!(
            x.is_finite(),
            "channel {name} non-finite: {x} (full scores: {s:?})"
        );
    }

    // Softmax invariant: the channels add up to ~1 (tolerance 1e-3).
    let sum = entailment + neutral + contradiction;
    let drift = (sum - 1.0).abs();
    assert!(
        drift < 1e-3,
        "softmax channels must sum to ~1, got {sum:.6}"
    );
}
/// Test 5: scoring against an empty hypothesis must return an error
/// whose message contains `"empty hypothesis"`.
///
/// NOTE(review): the original comment describes this as a
/// defense-in-depth bail that fires BEFORE the tokenizer runs and needs
/// no network. That depends on `OnnxNliVerifier::new` not downloading
/// the model eagerly, which is not visible from this file — confirm.
#[test]
#[ignore]
fn empty_hypothesis_returns_err() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    // Render the error once; the assertion only inspects its text.
    let msg = verifier
        .score("anything", "")
        .expect_err("empty hypothesis must error")
        .to_string();

    let mentions_empty = msg.contains("empty hypothesis");
    assert!(
        mentions_empty,
        "expected 'empty hypothesis' in error, got: {msg}"
    );
}
/// Test 6 (S3 follow-up 2026-05-26): a long English hypothesis that by
/// itself exceeds `max_length` must make `score` return an error.
///
/// Per the original note, the `OnlyFirst` strategy can only trim the
/// premise, so without pipeline-side truncation an oversized hypothesis
/// dead-ends in the tokenizer. This test pins that raw nli-crate
/// behavior so a future regression in the pipeline-side budget surfaces
/// as a clear nli-level error.
///
/// The error text must contain either `"Truncation error"` or
/// `"too short to respect"`.
#[test]
#[ignore]
fn score_long_en_hypothesis_returns_err_without_pipeline_truncation() {
    let cfg = Config::defaults();
    let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
    let premise = "short premise";
    let hypothesis = "lorem ipsum ".repeat(500); // 6 000 chars / >>512 tokens

    // `expect_err` (same style as the empty-hypothesis test) replaces the
    // `is_err()` + `err().unwrap()` pair: if scoring unexpectedly succeeds,
    // the panic message includes the returned scores for diagnosis.
    let msg = v
        .score(premise, &hypothesis)
        .expect_err("long hypothesis should err under OnlyFirst")
        .to_string();

    assert!(
        msg.contains("Truncation error") || msg.contains("too short to respect"),
        "expected tokenizer truncation err, got: {msg}"
    );
}
/// Test 7 (S3 follow-up 2026-05-26): `hypothesis_token_count` helper —
/// a pure tokenizer probe, called here through `&dyn NliVerifier`.
///
/// **Vtable dispatch check** (RC1-residual pin). Per the original note:
/// a call on the concrete type prefers an inherent method and therefore
/// cannot catch the RC1-residual bug; only dispatch through the trait
/// object verifies that the method is registered in the vtable. If the
/// real implementation sits in an inherent-only block, the trait's
/// default returns `Ok(0)` and the `> 0` bound below fails; if it sits
/// in the trait impl block, the real tokenizer answers and the test
/// passes. This pins the accuracy of the API the pipeline uses to
/// decide its retry budget.
#[test]
#[ignore]
fn hypothesis_token_count_dispatches_correctly_via_dyn_trait() {
    let config = Config::defaults();
    let verifier = OnnxNliVerifier::new(&config).expect("verifier construction");

    // Go through the trait object on purpose: calling on the concrete
    // type would not exercise the vtable entry this test is pinning.
    let as_trait_object: &dyn NliVerifier = &verifier;

    // Short EN — estimated at ~4 chars/token (27 chars / 4 = ~6 tokens).
    let en_count = as_trait_object
        .hypothesis_token_count("short english test sentence")
        .expect("EN dyn dispatch must reach real tokenizer (vtable check)");
    let en_plausible = (1..20).contains(&en_count);
    assert!(
        en_plausible,
        "EN ~6 tokens expected via vtable dispatch, got {en_count} \
         (Ok(0) signals inherent-only placement bug — RC1-residual)"
    );

    // Short KR — estimated at 1-2 chars/token (16 chars / 1.5 = ~10 tokens).
    let kr_count = as_trait_object
        .hypothesis_token_count("짧은 한국어 테스트 문장입니다")
        .expect("KR dyn dispatch must reach real tokenizer");
    let kr_plausible = (1..30).contains(&kr_count);
    assert!(
        kr_plausible,
        "KR ~10 tokens expected, got {kr_count}"
    );
}