feat(nli): fb-41 PR-9a — kebab-nli crate skeleton + workspace deps

- 신규 crate kebab-nli (trait + impl 동일 crate, v0.18 scope = ONNX adapter 1개).
- NliVerifier trait + NliScores struct (XNLI 3-channel: entailment/neutral/contradiction).
- private softmax3 (log-sum-exp 안전).
- OnnxNliVerifier placeholder (PR-9b 가 ONNX inference + model download 추가).
- workspace.dependencies 추가: ort 2.0-rc.9, tokenizers 0.21 (default-features=false, onig), hf-hub 0.4, ndarray 0.16.

Pre-flight (PR-9 design contract 의 gate):
- HF Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 model.onnx + tokenizer.json → HTTP/2 302 (HF S3 routing, file 존재).
- tokenizers --no-default-features -F onig 의 standalone repro: SentencePiece mDeBERTa tokenizer.json 로드 OK (KR 9 tokens / EN 11 tokens 정상 encode).
- Cargo features 결정 trace: tokenizers = { default-features = false, features = ["onig"] } lock.

Tests: 6 unit (softmax3 정규화 + 불변성 + XNLI logits 변환 + faithfulness + new + score stub) — 통과.
Verification: cargo test -p kebab-nli -j 1 (6/6) + cargo clippy -p kebab-nli --all-targets -j 1 -- -D warnings clean.
Workspace: cargo test --workspace -j 1 — pre-existing kebab-mcp::tools_call_ask_multi_hop 1 fail (main baseline 동일 fail, PR-9a 무관 — ingest fixture/Ollama 의존 flaky).

Wire 영향: 없음 (crate 도입만).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-25 21:22:38 +00:00
parent 44fbffff26
commit 1eb0bbecb3
5 changed files with 216 additions and 0 deletions

10
Cargo.lock generated
View File

@@ -4327,6 +4327,16 @@ dependencies = [
"tracing",
]
[[package]]
name = "kebab-nli"
version = "0.17.2"
dependencies = [
"anyhow",
"kebab-config",
"serde",
"tempfile",
]
[[package]]
name = "kebab-normalize"
version = "0.17.2"

View File

@@ -24,6 +24,7 @@ members = [
"crates/kebab-tui",
"crates/kebab-mcp",
"crates/kebab-parse-code",
"crates/kebab-nli",
]
[workspace.package]
@@ -102,6 +103,19 @@ tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23;
# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
tree-sitter-c = "0.24.2"
tree-sitter-cpp = "0.23.4"
# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
# stays single-versioned across the workspace. ort `default-features=false`
# drops the bundled binary downloader (fastembed already provides one);
# tokenizers `default-features=false, onig` drops the default `esaxx_fast`
# (C++ suffix-array) feature while keeping the `onig` regex backend, so the
# build doesn't need libstdc++ headers (verified
# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
# pure-Rust TLS stack.
ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
tokenizers = { version = "0.21", default-features = false, features = ["onig"] }
hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
ndarray = "0.16"
# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
# behavior are unchanged — only DWARF debug info is reduced (line

View File

@@ -0,0 +1,20 @@
[package]
name = "kebab-nli"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "fb-41: NLI-based post-synthesis verification (XNLI mDeBERTa-v3). PR-9a = trait + scaffolding; ONNX inference lands in PR-9b."
[dependencies]
# PR-9a scope: kebab-config for the OnnxNliVerifier::new(&Config) signature
# the rag crate will call once PR-9d wires verification into ask_multi_hop.
# ort / tokenizers / hf-hub / ndarray are intentionally NOT depended on here
# — they sit in workspace.dependencies until PR-9b adds the real adapter.
kebab-config = { path = "../kebab-config" }
anyhow = { workspace = true }
serde = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }

114
crates/kebab-nli/src/lib.rs Normal file
View File

@@ -0,0 +1,114 @@
//! `kebab-nli` — NLI-based post-synthesis verification for multi-hop RAG.
//!
//! fb-41 introduces an mDeBERTa-v3 XNLI verifier that runs on
//! `(packed_chunks, generated_answer)` after the synthesize step. If
//! `NliScores::faithfulness()` < threshold, the rag crate refuses the answer
//! with `NliVerificationFailed`. PR-9a (this file) is the trait surface +
//! scaffolding only — `OnnxNliVerifier::score` returns a stub error until
//! PR-9b adds the real ONNX inference path.
use serde::{Deserialize, Serialize};
pub mod onnx;
pub use onnx::OnnxNliVerifier;
/// Three-channel XNLI output. Channel order matches the standard XNLI
/// `id2label` mapping `[entailment, neutral, contradiction]` shipped with
/// the Xenova mDeBERTa-v3 model.
///
/// The fields are public and unvalidated: the struct itself does not
/// enforce that they sum to 1. Values built through
/// `NliScores::from_xnli_logits` are softmax outputs; values built with a
/// struct literal are whatever the caller supplied. `Default` yields all
/// three channels at `0.0`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct NliScores {
/// Score of the "entailment" channel (index 0 of the logits array).
/// This is the value `faithfulness()` returns.
pub entailment: f32,
/// Score of the "neutral" channel (index 1 of the logits array).
pub neutral: f32,
/// Score of the "contradiction" channel (index 2 of the logits array).
pub contradiction: f32,
}
impl NliScores {
    /// Returns the faithfulness score, which is simply the entailment
    /// channel passed through unchanged. The rag crate compares this value
    /// against `rag.nli_faithfulness_min` to decide whether to refuse.
    pub fn faithfulness(&self) -> f32 {
        // `NliScores` is `Copy`, so destructuring a copy is free.
        let Self { entailment, .. } = *self;
        entailment
    }

    /// Builds an `NliScores` from raw XNLI logits given in
    /// `[entailment, neutral, contradiction]` order.
    ///
    /// The logits are pushed through `softmax3`, and the three resulting
    /// values are assigned to the fields in the same order.
    pub fn from_xnli_logits(logits: [f32; 3]) -> Self {
        let [entailment, neutral, contradiction] = softmax3(logits);
        Self {
            entailment,
            neutral,
            contradiction,
        }
    }
}
/// Abstract NLI verifier. `score` is called with `(premise = packed chunks,
/// hypothesis = generated answer)` — the standard NLI direction (premise
/// entails hypothesis ⇒ answer is grounded in retrieved evidence).
///
/// Implementors must be `Send + Sync`, so a verifier can be shared across
/// threads behind a shared reference.
pub trait NliVerifier: Send + Sync {
/// Scores how `hypothesis` relates to `premise`.
///
/// Takes `&self`, so implementations that need mutable state have to use
/// interior mutability.
///
/// # Errors
///
/// Returns an `anyhow::Error` when the implementation cannot produce
/// scores; the concrete failure conditions are implementation-defined.
fn score(&self, premise: &str, hypothesis: &str) -> anyhow::Result<NliScores>;
}
/// Numerically stable 3-way softmax.
/// Private — call sites should go through `NliScores::from_xnli_logits`.
///
/// Finite logits take the usual path: subtract the maximum before
/// exponentiating so `exp` never overflows, then normalise by the sum.
///
/// Non-finite inputs are handled explicitly, because the plain formula turns
/// them into NaN on every channel (`inf - inf` is NaN) and a NaN score
/// compares as "not below" any threshold:
///
/// * If the maximum is `+inf`, or all three logits are `-inf`, the
///   probability mass is split evenly across the channels tied at the
///   maximum and every other channel gets `0.0`. This is the limiting value
///   of softmax as those logits grow without bound.
/// * If any logit is NaN, all three outputs are NaN. Callers that gate on
///   the result should treat NaN as a failed verification.
///
/// Returns the probabilities in the same channel order as `logits`.
fn softmax3(logits: [f32; 3]) -> [f32; 3] {
    // `f32::max` silently ignores NaN operands, so check for NaN up front
    // rather than letting it leak through `exp` and the division below.
    if logits.iter().any(|x| x.is_nan()) {
        return [f32::NAN; 3];
    }

    let max = logits[0].max(logits[1]).max(logits[2]);

    if max.is_infinite() {
        // Every channel equal to `max` would evaluate `inf - inf = NaN`.
        // At least one channel always ties with `max`, so `count >= 1`.
        let tied = logits.map(|x| if x == max { 1.0_f32 } else { 0.0 });
        let count = tied[0] + tied[1] + tied[2];
        return tied.map(|t| t / count);
    }

    // Finite maximum: every exponent is <= 0, so `exp` stays in (0, 1] and
    // the channel holding the maximum contributes exactly 1.0 to the sum.
    let exps = logits.map(|x| (x - max).exp());
    let sum = exps[0] + exps[1] + exps[2];
    exps.map(|e| e / sum)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `a` and `b` differ by at most `eps`.
    fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
        let gap = (a - b).abs();
        gap <= eps
    }

    #[test]
    fn softmax3_normalises_to_unit() {
        let [low, mid, high] = softmax3([1.0, 2.0, 3.0]);
        // Every channel receives strictly positive mass.
        assert!(low > 0.0 && mid > 0.0 && high > 0.0);
        // The three channels together form a probability distribution.
        assert!(approx_eq(low + mid + high, 1.0, 1e-6));
        // Ordering of the logits is preserved by the probabilities.
        assert!(low < mid);
        assert!(mid < high);
    }

    #[test]
    fn softmax3_is_invariant_to_constant_shift() {
        let a = softmax3([1.0, 2.0, 3.0]);
        let b = softmax3([101.0, 102.0, 103.0]);
        // Adding the same constant to every logit must not move any channel.
        for (i, (base, shifted)) in a.iter().zip(b.iter()).enumerate() {
            assert!(
                approx_eq(*base, *shifted, 1e-6),
                "channel {i} drifted: a={a:?} b={b:?}"
            );
        }
    }

    #[test]
    fn nli_scores_from_xnli_logits_orders_correctly() {
        // The entailment logit is the largest, so its probability must be too.
        let NliScores {
            entailment,
            neutral,
            contradiction,
        } = NliScores::from_xnli_logits([5.0, 1.0, 0.5]);
        assert!(entailment > neutral);
        assert!(entailment > contradiction);
        let total = entailment + neutral + contradiction;
        assert!(approx_eq(total, 1.0, 1e-6));
    }

    #[test]
    fn faithfulness_returns_entailment_channel() {
        let scores = NliScores {
            entailment: 0.7,
            neutral: 0.2,
            contradiction: 0.1,
        };
        let got = scores.faithfulness();
        assert!(approx_eq(got, 0.7, f32::EPSILON));
    }
}

View File

@@ -0,0 +1,58 @@
//! ONNX-backed `NliVerifier` adapter (mDeBERTa-v3 XNLI).
//!
//! PR-9a: scaffolding only. `new` succeeds against the default `Config`
//! and `score` returns an explicit `"PR-9a stub"` error so any caller that
//! wires this up before PR-9b lands gets a loud failure instead of silent
//! all-zero scores. PR-9b will add ort `Session` + `Tokenizer` lazy init
//! and real inference.
use crate::{NliScores, NliVerifier};
/// ONNX-runtime mDeBERTa-v3 XNLI verifier.
///
/// PR-9a scaffolding holds no state — fields land in PR-9b
/// (`model_id`, `cache_dir`, `session: OnceLock<ort::Session>`,
/// `tokenizer: OnceLock<tokenizers::Tokenizer>`).
pub struct OnnxNliVerifier {
// Zero-sized, non-`pub` field: code outside this module cannot build the
// struct with a literal and has to go through `OnnxNliVerifier::new`.
_private: (),
}
impl OnnxNliVerifier {
    /// Creates a verifier for the given user `Config`.
    ///
    /// In the PR-9a skeleton the config is accepted but not read, and the
    /// call cannot fail: it always yields `Ok`. Downloading the real model
    /// and tokenizer is deferred to PR-9b's first `score` call.
    pub fn new(_config: &kebab_config::Config) -> anyhow::Result<Self> {
        let verifier = Self { _private: () };
        Ok(verifier)
    }
}
impl NliVerifier for OnnxNliVerifier {
    /// Skeleton implementation: both inputs are ignored and the call always
    /// fails with the fixed "PR-9a stub" error, so a caller wired up before
    /// PR-9b lands gets a visible failure instead of placeholder scores.
    fn score(&self, _premise: &str, _hypothesis: &str) -> anyhow::Result<NliScores> {
        Err(anyhow::anyhow!(
            "PR-9a stub — ONNX inference lands in PR-9b"
        ))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_config::Config;

    #[test]
    fn new_succeeds_on_default_config() {
        // The underscore-prefixed binding keeps the constructed value alive
        // to the end of the test without tripping the unused-variable lint.
        let _verifier = OnnxNliVerifier::new(&Config::defaults())
            .expect("new should succeed on default config");
    }

    #[test]
    fn score_returns_err_in_skeleton() {
        let verifier = OnnxNliVerifier::new(&Config::defaults()).unwrap();
        let err = verifier
            .score("a", "b")
            .expect_err("PR-9a stub must error");
        // The stub has to identify itself in the error text.
        let message = err.to_string();
        assert!(
            message.contains("PR-9a stub"),
            "unexpected error message: {err}"
        );
    }
}