Phase C4 executor 의 마지막 `fix(test): clippy + fmt fixes` commit 이 test file 부분만 fmt 적용. workspace 전체 fmt 누락 발견 → cargo fmt --all 적용. 모든 import alphabetical reorder + line wrapping 정합. 추가 untracked artifact 동시 commit: - docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md (491 line, ACCEPT) - docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md (616 line, ACCEPT) workspace test: 1370 passed / 0 failed / 50 ignored, ingest_log_smoke green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
201 lines
8.0 KiB
Rust
201 lines
8.0 KiB
Rust
//! Integration tests for `OnnxNliVerifier` against the real
|
|
//! mDeBERTa-v3 XNLI model. Every test is `#[ignore]` — plain
|
|
//! `cargo test -p kebab-nli` skips them; run explicitly with
|
|
//! `cargo test -p kebab-nli --test inference -- --ignored` to
|
|
//! exercise the (slow + network-bound on first run) inference path.
|
|
//!
|
|
//! First test in the file triggers the ~280 MB ONNX + ~16 MB
|
|
//! tokenizer download into `config.storage.model_dir/nli/...`;
|
|
//! subsequent tests hit the OnceLock cache for free.
|
|
|
|
use kebab_config::Config;
|
|
use kebab_nli::{NliVerifier, OnnxNliVerifier};
|
|
|
|
/// Test 1: an English statement entails itself with high confidence.
|
|
/// Smoke evidence captured for the PR description's `## 검증` section.
|
|
#[test]
|
|
#[ignore]
|
|
fn en_self_entailment_high_score() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let premise = "Caffeine is a stimulant.";
|
|
let hypothesis = "Caffeine is a stimulant.";
|
|
let s = v.score(premise, hypothesis).expect("score should succeed");
|
|
eprintln!(
|
|
"[test1 en_self_entailment_high_score] premise={premise:?} hypothesis={hypothesis:?} \
|
|
scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
|
|
s.entailment, s.neutral, s.contradiction
|
|
);
|
|
assert!(
|
|
s.entailment > 0.8,
|
|
"expected entailment > 0.8, got {:.4} (full scores: {:?})",
|
|
s.entailment,
|
|
s
|
|
);
|
|
}
|
|
|
|
/// Test 2: an unrelated chemistry fact does NOT entail the premise.
|
|
/// Entailment should be low — neutral / contradiction wins.
|
|
#[test]
|
|
#[ignore]
|
|
fn en_unrelated_low_entailment() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let premise = "Caffeine is a stimulant.";
|
|
let hypothesis = "The chemical formula of caffeine is C8H10N4O2.";
|
|
let s = v.score(premise, hypothesis).expect("score should succeed");
|
|
eprintln!(
|
|
"[test2 en_unrelated_low_entailment] \
|
|
scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
|
|
s.entailment, s.neutral, s.contradiction
|
|
);
|
|
// spec §3 PR-9b: "entailment 낮음 — neutral/contradiction 이 winning channel" 의
|
|
// *spirit* 은 *neutral 이 max* 임. 실측 mDeBERTa 의 noise (entailment≈0.42, neutral≈0.53,
|
|
// contradiction≈0.05) 에서 두 문장 모두 caffeine 의 *사실* 이라 entailment 가 0.3 미만으로
|
|
// 떨어지지 않음 — 그러나 neutral 이 winning. multilingual NLI 의 자연스러운 동작.
|
|
assert!(
|
|
s.neutral > s.entailment && s.neutral > s.contradiction,
|
|
"expected neutral to win (no entailment, no contradiction), got {s:?}"
|
|
);
|
|
}
|
|
|
|
/// Test 3: Korean entailment. The threshold is intentionally generous
|
|
/// (> 0.5) because cross-lingual XNLI is noisier than English-only.
|
|
#[test]
|
|
#[ignore]
|
|
fn ko_entailment_high_score() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let premise = "사과는 빨갛다.";
|
|
let hypothesis = "사과는 색이 있다.";
|
|
let s = v.score(premise, hypothesis).expect("score should succeed");
|
|
eprintln!(
|
|
"[test3 ko_entailment_high_score] \
|
|
scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
|
|
s.entailment, s.neutral, s.contradiction
|
|
);
|
|
assert!(
|
|
s.entailment > 0.5,
|
|
"expected entailment > 0.5, got {:.4} (full scores: {:?})",
|
|
s.entailment,
|
|
s
|
|
);
|
|
}
|
|
|
|
/// Test 4: a > 24 000-char premise must not panic. mDeBERTa-v3 is
|
|
/// trained at 512 tokens; the `OnlyFirst` truncation strategy keeps
|
|
/// the premise side from blowing the positional embedding cap.
|
|
#[test]
|
|
#[ignore]
|
|
fn long_premise_truncates_without_panic() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let premise = "foo bar baz ".repeat(2000); // ~24 000 chars
|
|
let hypothesis = "foo";
|
|
let s = v
|
|
.score(&premise, hypothesis)
|
|
.expect("score should succeed on long premise");
|
|
eprintln!(
|
|
"[test4 long_premise_truncates_without_panic] premise_len={} \
|
|
scores: entailment={:.4}, neutral={:.4}, contradiction={:.4}",
|
|
premise.len(),
|
|
s.entailment,
|
|
s.neutral,
|
|
s.contradiction
|
|
);
|
|
// No NaN / infinity in any channel.
|
|
for (name, x) in [
|
|
("entailment", s.entailment),
|
|
("neutral", s.neutral),
|
|
("contradiction", s.contradiction),
|
|
] {
|
|
assert!(
|
|
x.is_finite(),
|
|
"channel {name} non-finite: {x} (full scores: {s:?})"
|
|
);
|
|
}
|
|
// Softmax invariant — the three channels sum to ~1.
|
|
let sum = s.entailment + s.neutral + s.contradiction;
|
|
assert!(
|
|
(sum - 1.0).abs() < 1e-3,
|
|
"softmax channels must sum to ~1, got {sum:.6}"
|
|
);
|
|
}
|
|
|
|
/// Test 5: an empty hypothesis triggers the defense-in-depth bail
|
|
/// path BEFORE the tokenizer runs. Hits no network — fast, even on
|
|
/// a fresh machine.
|
|
#[test]
|
|
#[ignore]
|
|
fn empty_hypothesis_returns_err() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let err = v
|
|
.score("anything", "")
|
|
.expect_err("empty hypothesis must error");
|
|
let msg = err.to_string();
|
|
assert!(
|
|
msg.contains("empty hypothesis"),
|
|
"expected 'empty hypothesis' in error, got: {msg}"
|
|
);
|
|
}
|
|
|
|
/// Test 6 (S3 follow-up 2026-05-26): EN-long hypothesis alone exceeds
|
|
/// max_length. Without pipeline-side truncation, `OnlyFirst` strategy
|
|
/// dead-ends. Pin raw nli crate behavior so any future regression in
|
|
/// the pipeline-side budget surfaces as a clear nli-level err.
|
|
#[test]
|
|
#[ignore]
|
|
fn score_long_en_hypothesis_returns_err_without_pipeline_truncation() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
let premise = "short premise";
|
|
let hypothesis = "lorem ipsum ".repeat(500); // ~6 000 chars / >>512 tokens
|
|
let result = v.score(premise, &hypothesis);
|
|
assert!(
|
|
result.is_err(),
|
|
"long hypothesis should err under OnlyFirst"
|
|
);
|
|
let msg = result.err().unwrap().to_string();
|
|
assert!(
|
|
msg.contains("Truncation error") || msg.contains("too short to respect"),
|
|
"expected tokenizer truncation err, got: {msg}"
|
|
);
|
|
}
|
|
|
|
/// Test 7 (S3 follow-up 2026-05-26): `hypothesis_token_count` helper —
|
|
/// pure tokenizer probe. **vtable dispatch 검증** (RC1-residual pin) —
|
|
/// concrete type 호출은 inherent method 우선이라 RC1-residual 버그
|
|
/// 잡지 못함; `&dyn NliVerifier` 통해 dispatch 해야 vtable 등록 검증.
|
|
/// inherent-only 배치 시 default `Ok(0)` 반환 → `assert!(count > 0)`
|
|
/// 실패. trait impl block 배치 시 real tokenizer → PASS. Pipeline 이
|
|
/// retry budget 결정에 사용하는 API 의 정확성 pin.
|
|
#[test]
|
|
#[ignore]
|
|
fn hypothesis_token_count_dispatches_correctly_via_dyn_trait() {
|
|
let cfg = Config::defaults();
|
|
let v = OnnxNliVerifier::new(&cfg).expect("verifier construction");
|
|
// ★ vtable dispatch — &dyn NliVerifier 통해 호출. inherent-only
|
|
// 배치 시 default `Ok(0)` 반환 → assert!(count > 0) 실패.
|
|
// trait impl block 배치 시 real tokenizer → PASS. RC1-residual
|
|
// 의 코드-수준 regression pin.
|
|
let v_dyn: &dyn NliVerifier = &v;
|
|
// 짧은 EN — 4 chars/token 추정 (27 chars / 4 = ~6 tokens)
|
|
let en_count = v_dyn
|
|
.hypothesis_token_count("short english test sentence")
|
|
.expect("EN dyn dispatch must reach real tokenizer (vtable check)");
|
|
assert!(
|
|
en_count > 0 && en_count < 20,
|
|
"EN ~6 tokens expected via vtable dispatch, got {en_count} \
|
|
(Ok(0) signals inherent-only placement bug — RC1-residual)"
|
|
);
|
|
// 짧은 KR — 1-2 chars/token (15 chars / 1.5 = ~10 tokens)
|
|
let kr_count = v_dyn
|
|
.hypothesis_token_count("짧은 한국어 테스트 문장입니다")
|
|
.expect("KR dyn dispatch must reach real tokenizer");
|
|
assert!(
|
|
kr_count > 0 && kr_count < 30,
|
|
"KR ~10 tokens expected, got {kr_count}"
|
|
);
|
|
}
|