feat(nli): fb-41 PR-9a — kebab-nli crate skeleton + workspace deps

- 신규 crate kebab-nli (trait + impl 동일 crate, v0.18 scope = ONNX adapter 1개). - NliVerifier trait + NliScores struct (XNLI 3-channel: entailment/neutral/contradiction). - private softmax3 (log-sum-exp 안전). - OnnxNliVerifier placeholder (PR-9b 가 ONNX inference + model download 추가). - workspace.dependencies 추가: ort 2.0-rc.9, tokenizers 0.21 (default-features=false, onig), hf-hub 0.4, ndarray 0.16. Pre-flight (PR-9 design contract 의 gate): - HF Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 model.onnx + tokenizer.json → HTTP/2 302 (HF S3 routing, file 존재). - tokenizers --no-default-features -F onig 의 standalone repro: SentencePiece mDeBERTa tokenizer.json 로드 OK (KR 9 tokens / EN 11 tokens 정상 encode). - Cargo features 결정 trace: tokenizers = { default-features = false, features = ["onig"] } lock. Tests: 6 unit (softmax3 정규화 + 불변성 + XNLI logits 변환 + faithfulness + new + score stub) — 통과. Verification: cargo test -p kebab-nli -j 1 (6/6) + cargo clippy -p kebab-nli --all-targets -j 1 -- -D warnings clean. Workspace: cargo test --workspace -j 1 — pre-existing kebab-mcp::tools_call_ask_multi_hop 1 fail (main baseline 동일 fail, PR-9a 무관 — ingest fixture/Ollama 의존 flaky). Wire 영향: 없음 (crate 도입만). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 21:22:38 +00:00
parent 44fbffff26
commit 1eb0bbecb3
5 changed files with 216 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4327,6 +4327,16 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kebab-nli"
+version = "0.17.2"
+dependencies = [
+ "anyhow",
+ "kebab-config",
+ "serde",
+ "tempfile",
+]
+
 [[package]]
 name = "kebab-normalize"
 version = "0.17.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ members = [
    "crates/kebab-tui",
    "crates/kebab-mcp",
    "crates/kebab-parse-code",
+    "crates/kebab-nli",
 ]

 [workspace.package]
@@ -102,6 +103,19 @@ tree-sitter-kotlin-ng  = "1.1.0"   # bare tree-sitter-kotlin requires ts <0.23;
 # C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
 tree-sitter-c         = "0.24.2"
 tree-sitter-cpp       = "0.23.4"
+# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
+# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
+# stays single-versioned across the workspace. ort `default-features=false`
+# drops the bundled binary downloader (fastembed already provides one);
+# tokenizers `default-features=false, onig` keeps the (default) `onig` regex
+# backend but drops the default `esaxx_fast` (C++ esaxx) and `progressbar` features so the build doesn't need libstdc++ headers (verified
+# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
+# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
+# pure-Rust TLS stack.
+ort          = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
+tokenizers   = { version = "0.21", default-features = false, features = ["onig"] }
+hf-hub       = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
+ndarray      = "0.16"

 # Disk-footprint trim for dev / test builds. Codegen, opt-level, and
 # behavior are unchanged — only DWARF debug info is reduced (line
--- a/crates/kebab-nli/Cargo.toml
+++ b/crates/kebab-nli/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "kebab-nli"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "fb-41: NLI-based post-synthesis verification (XNLI mDeBERTa-v3). PR-9a = trait + scaffolding; ONNX inference lands in PR-9b."
+
+[dependencies]
+# PR-9a scope: kebab-config for the OnnxNliVerifier::new(&Config) signature
+# the rag crate will call once PR-9d wires verification into ask_multi_hop.
+# ort / tokenizers / hf-hub / ndarray are intentionally NOT depended on here
+# — they sit in workspace.dependencies until PR-9b adds the real adapter.
+kebab-config = { path = "../kebab-config" }
+anyhow       = { workspace = true }
+serde        = { workspace = true }
+
+[dev-dependencies]
+tempfile     = { workspace = true }
--- a/crates/kebab-nli/src/lib.rs
+++ b/crates/kebab-nli/src/lib.rs
@@ -0,0 +1,114 @@
+//! `kebab-nli` — NLI-based post-synthesis verification for multi-hop RAG.
+//!
+//! fb-41 introduces an mDeBERTa-v3 XNLI verifier that runs on
+//! `(packed_chunks, generated_answer)` after the synthesize step. If
+//! `NliScores::faithfulness()` < threshold the rag crate refuses the answer
+//! with `NliVerificationFailed`. PR-9a (this file) is the trait surface +
+//! scaffolding only — `OnnxNliVerifier::score` returns a stub error until
+//! PR-9b adds the real ONNX inference path.
+
+use serde::{Deserialize, Serialize};
+
+pub mod onnx;
+
+pub use onnx::OnnxNliVerifier;
+
+/// Three-channel XNLI output. Field order follows the standard XNLI `id2label` mapping
+/// `[entailment, neutral, contradiction]` shipped with the Xenova mDeBERTa-v3 model.
+/// The derived `Default` is all zeros, so a defaulted value does not sum to 1.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct NliScores {
+    pub entailment: f32,
+    pub neutral: f32,
+    pub contradiction: f32,
+}
+
+impl NliScores {
+    /// Faithfulness score: the `entailment` channel, returned unchanged. The rag crate is
+    /// expected to compare it against `rag.nli_faithfulness_min` to decide whether to refuse.
+    pub fn faithfulness(&self) -> f32 {
+        self.entailment
+    }
+
+    /// Convert raw XNLI logits (`[entailment, neutral, contradiction]`) into normalised
+    /// probabilities via `softmax3`. A NaN or `+inf` logit turns every channel into NaN.
+    pub fn from_xnli_logits(logits: [f32; 3]) -> Self {
+        let probs = softmax3(logits);
+        Self {
+            entailment: probs[0],
+            neutral: probs[1],
+            contradiction: probs[2],
+        }
+    }
+}
+
+/// Abstract NLI verifier; the `Send + Sync` bound lets one instance be shared across threads.
+/// `score` takes `(premise = packed chunks, hypothesis = generated answer)` — the standard NLI
+/// direction (premise entails hypothesis ⇒ answer is grounded in retrieved evidence).
+pub trait NliVerifier: Send + Sync {
+    fn score(&self, premise: &str, hypothesis: &str) -> anyhow::Result<NliScores>;
+}
+
+/// Numerically stable 3-way softmax: the max logit is subtracted before `exp` so large finite
+/// inputs cannot overflow. Private — call `NliScores::from_xnli_logits`. NaN/`+inf` logits ⇒ NaN.
+fn softmax3(logits: [f32; 3]) -> [f32; 3] {
+    let max = logits[0].max(logits[1]).max(logits[2]);
+    let e0 = (logits[0] - max).exp();
+    let e1 = (logits[1] - max).exp();
+    let e2 = (logits[2] - max).exp();
+    let sum = e0 + e1 + e2;
+    [e0 / sum, e1 / sum, e2 / sum]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
+        (a - b).abs() <= eps
+    }
+
+    #[test]
+    fn softmax3_normalises_to_unit() {
+        let p = softmax3([1.0, 2.0, 3.0]);
+        assert!(p.iter().all(|x| *x > 0.0));
+        assert!(approx_eq(p[0] + p[1] + p[2], 1.0, 1e-6));
+        // Softmax is order-preserving: a larger logit must map to a larger probability.
+        assert!(p[0] < p[1] && p[1] < p[2]);
+    }
+
+    #[test]
+    fn softmax3_is_invariant_to_constant_shift() {
+        let a = softmax3([1.0, 2.0, 3.0]);
+        let b = softmax3([101.0, 102.0, 103.0]);
+        for i in 0..3 {
+            assert!(
+                approx_eq(a[i], b[i], 1e-6),
+                "channel {i} drifted: a={a:?} b={b:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn nli_scores_from_xnli_logits_orders_correctly() {
+        // Index 0 carries the largest logit, so `entailment` must be the largest probability.
+        let s = NliScores::from_xnli_logits([5.0, 1.0, 0.5]);
+        assert!(s.entailment > s.neutral);
+        assert!(s.entailment > s.contradiction);
+        assert!(approx_eq(
+            s.entailment + s.neutral + s.contradiction,
+            1.0,
+            1e-6
+        ));
+    }
+
+    #[test]
+    fn faithfulness_returns_entailment_channel() {
+        let s = NliScores {
+            entailment: 0.7,
+            neutral: 0.2,
+            contradiction: 0.1,
+        };
+        assert!(approx_eq(s.faithfulness(), 0.7, f32::EPSILON));
+    }
+}
--- a/crates/kebab-nli/src/onnx.rs
+++ b/crates/kebab-nli/src/onnx.rs
@@ -0,0 +1,58 @@
+//! ONNX-backed `NliVerifier` adapter (mDeBERTa-v3 XNLI).
+//!
+//! PR-9a: scaffolding only. `new` succeeds against the default `Config`
+//! and `score` returns an explicit `"PR-9a stub"` error so any caller that
+//! wires this up before PR-9b lands gets a loud failure instead of silent
+//! all-zero scores. PR-9b will add ort `Session` + `Tokenizer` lazy init
+//! and real inference.
+
+use crate::{NliScores, NliVerifier};
+
+/// ONNX-runtime mDeBERTa-v3 XNLI verifier.
+///
+/// PR-9a scaffolding holds no state — fields land in PR-9b
+/// (`model_id`, `cache_dir`, `session: OnceLock<ort::Session>`,
+/// `tokenizer: OnceLock<tokenizers::Tokenizer>`).
+pub struct OnnxNliVerifier {
+    _private: (), // private unit field: outside this module the struct can only be built via `new`
+}
+
+impl OnnxNliVerifier {
+    /// Construct a verifier from the user's `Config`. PR-9a ignores `_config` and always
+    /// returns `Ok`; the real model + tokenizer download is deferred to PR-9b's first
+    /// `score` call.
+    pub fn new(_config: &kebab_config::Config) -> anyhow::Result<Self> {
+        Ok(Self { _private: () })
+    }
+}
+
+impl NliVerifier for OnnxNliVerifier {
+    fn score(&self, _premise: &str, _hypothesis: &str) -> anyhow::Result<NliScores> {
+        anyhow::bail!("PR-9a stub — ONNX inference lands in PR-9b") // fail loudly; both inputs are ignored
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kebab_config::Config;
+
+    #[test]
+    fn new_succeeds_on_default_config() {
+        let cfg = Config::defaults();
+        let v = OnnxNliVerifier::new(&cfg).expect("new should succeed on default config");
+        // The `expect` above is the assertion; this borrow only marks `v` as used.
+        let _ = &v;
+    }
+
+    #[test]
+    fn score_returns_err_in_skeleton() {
+        let cfg = Config::defaults();
+        let v = OnnxNliVerifier::new(&cfg).unwrap();
+        let err = v.score("a", "b").expect_err("PR-9a stub must error");
+        assert!(
+            err.to_string().contains("PR-9a stub"),
+            "unexpected error message: {err}"
+        );
+    }
+}