diff --git a/Cargo.lock b/Cargo.lock index 997f3cc..af56201 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4451,6 +4451,7 @@ dependencies = [ "kebab-config", "kebab-core", "kebab-llm", + "kebab-nli", "kebab-search", "kebab-store-sqlite", "regex", diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index 935ff81..5bc2139 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -1653,6 +1653,7 @@ mod tests { conversation_id: None, turn_index: None, hops: None, + verification: None, } } diff --git a/crates/kebab-cli/tests/wire_ask_multi_hop.rs b/crates/kebab-cli/tests/wire_ask_multi_hop.rs index ab65443..49fce55 100644 --- a/crates/kebab-cli/tests/wire_ask_multi_hop.rs +++ b/crates/kebab-cli/tests/wire_ask_multi_hop.rs @@ -144,3 +144,111 @@ fn error_schema_code_enum_includes_multi_hop_decompose_failed() { ); } } + +// ── p9-fb-41 PR-9c-1: NLI verification surface pins ───────────────────── + +/// answer.v1 must declare a `verification` property AND a +/// `$defs.VerificationSummary` entry with all three required fields. +/// Guards against accidental schema deletion / typo in future edits. +#[test] +fn answer_schema_declares_verification_field_and_defs() { + let schema = parse_schema("answer.schema.json"); + assert!( + schema["properties"]["verification"].is_object(), + "`verification` property must be declared on answer.v1" + ); + // `verification` allows object-or-null (multi-hop with threshold>0 + // emits an object; everything else omits the field). 
+ let v_any_of = schema["properties"]["verification"]["anyOf"] + .as_array() + .expect("verification must declare anyOf (object | null)"); + assert!( + v_any_of.iter().any(|v| v["type"] == "null"), + "verification anyOf must include null (single-pass / disabled gate omits the field)" + ); + assert!( + v_any_of + .iter() + .any(|v| v["$ref"].as_str() == Some("#/$defs/VerificationSummary")), + "verification anyOf must $ref VerificationSummary" + ); + + // VerificationSummary $defs entry + required fields. + let vs = &schema["$defs"]["VerificationSummary"]; + assert!( + vs.is_object(), + "$defs.VerificationSummary must be declared so verification.anyOf can $ref it" + ); + let required: Vec<&str> = vs["required"] + .as_array() + .expect("VerificationSummary.required must be an array") + .iter() + .filter_map(|v| v.as_str()) + .collect(); + for needed in ["nli_score", "nli_threshold", "nli_passed"] { + assert!( + required.contains(&needed), + "VerificationSummary.required must include {needed:?}, got {required:?}" + ); + } +} + +#[test] +fn answer_schema_refusal_reason_enum_includes_nli_verification_failed() { + let schema = parse_schema("answer.schema.json"); + let refusal_any_of = schema["properties"]["refusal_reason"]["anyOf"] + .as_array() + .expect("refusal_reason must declare anyOf"); + let enum_arr = refusal_any_of + .iter() + .find_map(|v| v["enum"].as_array()) + .expect("one of refusal_reason.anyOf entries must declare an enum"); + let values: Vec<&str> = enum_arr.iter().filter_map(|v| v.as_str()).collect(); + assert!( + values.contains(&"nli_verification_failed"), + "refusal_reason enum must include `nli_verification_failed`, got {values:?}" + ); +} + +#[test] +fn answer_schema_refusal_reason_enum_includes_nli_model_unavailable() { + let schema = parse_schema("answer.schema.json"); + let refusal_any_of = schema["properties"]["refusal_reason"]["anyOf"] + .as_array() + .expect("refusal_reason must declare anyOf"); + let enum_arr = refusal_any_of + .iter() + 
.find_map(|v| v["enum"].as_array()) + .expect("one of refusal_reason.anyOf entries must declare an enum"); + let values: Vec<&str> = enum_arr.iter().filter_map(|v| v.as_str()).collect(); + assert!( + values.contains(&"nli_model_unavailable"), + "refusal_reason enum must include `nli_model_unavailable`, got {values:?}" + ); +} + +#[test] +fn error_schema_code_enum_includes_nli_verification_failed() { + let schema = parse_schema("error.schema.json"); + let code_enum = schema["properties"]["code"]["enum"] + .as_array() + .expect("error.v1 must declare code.enum"); + let values: Vec<&str> = code_enum.iter().filter_map(|v| v.as_str()).collect(); + assert!( + values.contains(&"nli_verification_failed"), + "error.v1 code enum must include forward-looking `nli_verification_failed`, got {values:?}" + ); +} + +#[test] +fn error_schema_code_enum_includes_nli_model_unavailable() { + let schema = parse_schema("error.schema.json"); + let code_enum = schema["properties"]["code"]["enum"] + .as_array() + .expect("error.v1 must declare code.enum"); + let values: Vec<&str> = code_enum.iter().filter_map(|v| v.as_str()).collect(); + assert!( + values.contains(&"nli_model_unavailable"), + "error.v1 code enum must include forward-looking `nli_model_unavailable`, got {values:?}" + ); +} diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index 1cd5f81..5a4215e 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -103,6 +103,34 @@ pub struct ChunkingCfg { pub struct ModelsCfg { pub embedding: EmbeddingModelCfg, pub llm: LlmCfg, + /// p9-fb-41 PR-9c-1: NLI verifier model + provider knob. + /// `#[serde(default)]` so pre-v0.18 config files that predate the + /// `[models.nli]` section still load with built-in defaults + /// (`Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` / `onnx`). 
+ /// The verifier itself is gated by `[rag].nli_threshold` — even + /// with a model configured here, threshold `0.0` (the default) + /// skips the verification step entirely. + #[serde(default = "NliCfg::defaults")] + pub nli: NliCfg, +} + +/// p9-fb-41 PR-9c-1: NLI verifier configuration. The model id flows to +/// `OnnxNliVerifier::new` via `kebab-nli` (PR-9c-2 wiring); the provider +/// is reserved for future verifier swap-in (currently only `"onnx"` is +/// recognized — anything else falls back to the same path). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct NliCfg { + pub model: String, + pub provider: String, +} + +impl NliCfg { + pub fn defaults() -> Self { + Self { + model: "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7".to_string(), + provider: "onnx".to_string(), + } + } } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -213,6 +241,22 @@ pub struct RagCfg { /// cross-doc reasoning over ~5 chunks per iter. #[serde(default = "default_multi_hop_max_pool_chunks")] pub multi_hop_max_pool_chunks: u32, + /// p9-fb-41 PR-9c-1: minimum NLI entailment score required for the + /// multi-hop synthesize answer to be returned as `grounded=true` + /// (spec §2.6 single gate). When the post-synthesize NLI verifier + /// returns `NliScores::faithfulness() < nli_threshold` the + /// pipeline refuses with `RefusalReason::NliVerificationFailed`. + /// + /// Default `0.0` = verification disabled — no NLI call, multi-hop + /// matches its PR-3b behavior exactly. Set to e.g. `0.5` to + /// activate the gate. Knob lives on `[rag]` (the gate is a RAG + /// policy, not a model property); the model itself comes from + /// `[models.nli].model`. + /// + /// Single-pass `ask` ignores this knob entirely — only multi-hop + /// runs through the verification step (PR-9c-2 wires it). 
+ #[serde(default = "default_nli_threshold")] + pub nli_threshold: f32, } fn default_multi_hop_max_depth() -> u32 { @@ -227,6 +271,13 @@ fn default_multi_hop_max_pool_chunks() -> u32 { 15 } +/// p9-fb-41 PR-9c-1: NLI gate disabled by default per spec §2.6 +/// (verification opt-in — users explicitly raise the threshold once +/// they're ready to trade refusal-rate for groundedness). +fn default_nli_threshold() -> f32 { + 0.0 +} + /// Settings for the image ingest pipeline (P6). `ocr` controls OCR /// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3). #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -464,6 +515,7 @@ impl Config { seed: 0, request_timeout_secs: default_llm_request_timeout_secs(), }, + nli: NliCfg::defaults(), }, search: SearchCfg { default_k: 10, @@ -482,6 +534,7 @@ impl Config { multi_hop_max_sub_queries_per_iter: default_multi_hop_max_sub_queries_per_iter(), multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(), + nli_threshold: default_nli_threshold(), }, image: ImageCfg::defaults(), ui: UiCfg::defaults(), @@ -725,6 +778,10 @@ impl Config { } } + // models.nli (p9-fb-41 PR-9c-1) + "KEBAB_MODELS_NLI_MODEL" => self.models.nli.model = v.clone(), + "KEBAB_MODELS_NLI_PROVIDER" => self.models.nli.provider = v.clone(), + // search "KEBAB_SEARCH_DEFAULT_K" => { if let Ok(n) = v.parse::() { @@ -780,6 +837,24 @@ impl Config { self.rag.multi_hop_max_pool_chunks = n; } } + // p9-fb-41 PR-9c-1: NLI gate threshold. Parse failure + // emits a `tracing::warn!` (not silent like the other + // numeric env overrides) because this knob gates the + // NLI verification entirely — a malformed env value + // would silently disable a security-flavored gate the + // user thought they enabled, which is the failure mode + // most worth surfacing. The default (`0.0`) survives + // on parse failure so behaviour stays well-defined. 
+                "KEBAB_RAG_NLI_THRESHOLD" => match v.parse::<f32>().map_err(|e| e.to_string()) {
+                    Ok(f) if f.is_finite() => self.rag.nli_threshold = f, // NaN/inf parse as Ok
+                    bad => tracing::warn!(
+                        target: "kebab-config",
+                        env_key = "KEBAB_RAG_NLI_THRESHOLD",
+                        env_value = %v,
+                        error = %bad.err().unwrap_or_else(|| "not a finite number".to_string()),
+                        "invalid KEBAB_RAG_NLI_THRESHOLD; keeping prior value (0.0 = NLI gate disabled)"
+                    ),
+                },
 
                 // image.ocr
                 "KEBAB_IMAGE_OCR_ENABLED" => {
@@ -1214,6 +1289,100 @@ theme = "dark"
         assert_eq!(c.rag.multi_hop_max_pool_chunks, 15);
     }
 
+    // ── p9-fb-41 PR-9c-1: NLI verification knobs ─────────────────────────
+
+    #[test]
+    fn default_nli_threshold_is_zero() {
+        // Spec §2.6: NLI gate disabled by default — verification is
+        // opt-in. `0.0` keeps multi-hop behavior identical to PR-3b.
+        assert_eq!(Config::defaults().rag.nli_threshold, 0.0);
+    }
+
+    #[test]
+    fn default_nli_model_is_xenova_mdeberta() {
+        // Pin the default model id so a refactor that touches NliCfg
+        // can't silently flip to a different verifier model.
+        assert_eq!(
+            Config::defaults().models.nli.model,
+            "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+        );
+        assert_eq!(Config::defaults().models.nli.provider, "onnx");
+    }
+
+    /// A config file written before the `[models.nli]` / `nli_threshold`
+    /// keys existed must still parse and fall back to the documented
+    /// defaults. Fixture shared via [`LEGACY_PRE_TIMEOUT_TOML`] (predates
+    /// all PR-9c-1 fields).
+    #[test]
+    fn legacy_config_without_nli_uses_defaults() {
+        let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML)
+            .expect("parse legacy config");
+        assert_eq!(c.rag.nli_threshold, 0.0);
+        assert_eq!(
+            c.models.nli.model,
+            "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+        );
+        assert_eq!(c.models.nli.provider, "onnx");
+    }
+
+    #[test]
+    fn env_override_nli_threshold() {
+        let mut env = HashMap::new();
+        env.insert("KEBAB_RAG_NLI_THRESHOLD".to_string(), "0.5".to_string());
+        let c = Config::defaults().apply_env(&env);
+        assert!((c.rag.nli_threshold - 0.5).abs() < 1e-6);
+    }
+
+    #[test]
+    fn env_override_nli_model_and_provider() {
+        let mut env = HashMap::new();
+        env.insert(
+            "KEBAB_MODELS_NLI_MODEL".to_string(),
+            "user/custom-nli-model".to_string(),
+        );
+        env.insert(
+            "KEBAB_MODELS_NLI_PROVIDER".to_string(),
+            "candle".to_string(),
+        );
+        let c = Config::defaults().apply_env(&env);
+        assert_eq!(c.models.nli.model, "user/custom-nli-model");
+        assert_eq!(c.models.nli.provider, "candle");
+    }
+
+    /// Malformed `KEBAB_RAG_NLI_THRESHOLD` keeps the prior value (does
+    /// NOT silently disable nor crash). The `tracing::warn!` surface
+    /// is observable only when the user has tracing wired; the
+    /// behavior contract is "default survives".
+    #[test]
+    fn env_malformed_nli_threshold_keeps_prior_value() {
+        let mut env = HashMap::new();
+        env.insert(
+            "KEBAB_RAG_NLI_THRESHOLD".to_string(),
+            "not-a-float".to_string(),
+        );
+        let c = Config::defaults().apply_env(&env);
+        assert_eq!(
+            c.rag.nli_threshold, 0.0,
+            "malformed env value must keep the default unchanged"
+        );
+    }
+
+    /// `"NaN"` / `"inf"` parse successfully as `f32`, so they bypass the
+    /// parse-error arm. They are still unusable thresholds (every
+    /// comparison against NaN is false), so `apply_env` must reject
+    /// them and keep the prior value, same as any other malformed input.
+    #[test]
+    fn env_non_finite_nli_threshold_keeps_prior_value() {
+        for raw in ["NaN", "inf", "-inf"] {
+            let mut env = HashMap::new();
+            env.insert("KEBAB_RAG_NLI_THRESHOLD".to_string(), raw.to_string());
+            let c = Config::defaults().apply_env(&env);
+            assert_eq!(
+                c.rag.nli_threshold, 0.0,
+                "non-finite env value {raw:?} must keep the default unchanged"
+            );
+        }
+    }
+
     #[test]
     fn image_ocr_env_overrides() {
         let mut env = HashMap::new();
diff --git a/crates/kebab-core/src/answer.rs b/crates/kebab-core/src/answer.rs
index 78afa91..2629102 100644
--- a/crates/kebab-core/src/answer.rs
+++ b/crates/kebab-core/src/answer.rs
@@ -37,6 +37,29 @@ pub struct Answer {
     /// reading older single-pass answers see `hops: None` (or absent).
#[serde(default, skip_serializing_if = "Option::is_none")] pub hops: Option>, + /// p9-fb-41 PR-9c-1: NLI-based post-synthesis verification summary. + /// `None` for single-pass asks and for multi-hop runs with + /// `[rag].nli_threshold == 0` (verification disabled — the default). + /// Present only when the multi-hop pipeline reached the post- + /// synthesize verification step (PR-9c-2 wires step 8.5). Wire- + /// additive: `answer.v1` schema_version unchanged; consumers + /// reading pre-v0.18 answers see `verification: None` (or absent). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub verification: Option, +} + +/// p9-fb-41 PR-9c-1: post-synthesize NLI verification summary stamped +/// onto [`Answer::verification`] when multi-hop runs reach step 8.5 +/// (NLI gate). Three required fields ride together on every wire emit: +/// `nli_score` is the entailment channel of the XNLI verifier, +/// `nli_threshold` mirrors `[rag].nli_threshold` for audit, and +/// `nli_passed` is `nli_score >= nli_threshold`. The whole struct is +/// omitted (serde skip) when no verification ran. +#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct VerificationSummary { + pub nli_score: f32, + pub nli_threshold: f32, + pub nli_passed: bool, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -153,6 +176,18 @@ pub enum RefusalReason { /// 단계 진입 못 함. CLI / MCP / TUI 가 받는 wire error code /// = `"multi_hop_decompose_failed"` (PR-4 의 error_wire 매핑). MultiHopDecomposeFailed, + /// p9-fb-41 PR-9c-1: post-synthesize NLI verification gate fired — + /// `NliScores::faithfulness()` (entailment channel) fell below + /// `[rag].nli_threshold`. Wire string = `"nli_verification_failed"` + /// (single source of truth: also the matching `error.v1.code`). + /// Multi-hop only; behavior wiring lands in PR-9c-2. 
+ NliVerificationFailed, + /// p9-fb-41 PR-9c-1: NLI verifier was configured (threshold > 0) + /// but the model / runtime is unavailable (download failure, + /// missing tokenizer, ONNX session init error). Treated as a soft + /// refusal — the user sees an unverified-answer outcome rather + /// than crashing the ask. Wire string = `"nli_model_unavailable"`. + NliModelUnavailable, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -190,6 +225,81 @@ mod tests { use crate::citation::Citation; use time::macros::datetime; + /// p9-fb-41 PR-9c-1: pin the wire-side spelling of the new + /// `RefusalReason` variants. The strings here must match + /// `answer.schema.json::refusal_reason.enum` AND + /// `error.schema.json::code.enum` byte-for-byte (single source of + /// truth per spec §2.4). + #[test] + fn refusal_reason_nli_variants_serialize_to_snake_case() { + assert_eq!( + serde_json::to_string(&RefusalReason::NliVerificationFailed).unwrap(), + "\"nli_verification_failed\"" + ); + assert_eq!( + serde_json::to_string(&RefusalReason::NliModelUnavailable).unwrap(), + "\"nli_model_unavailable\"" + ); + } + + /// p9-fb-41 PR-9c-1: `Answer.verification` is `Option<...>` with + /// `skip_serializing_if = None`. A `verification: None` answer + /// must NOT emit a `"verification"` key on the wire — the field + /// is additive and pre-v0.18 readers see no new key. 
+ #[test] + fn answer_omits_verification_field_when_none() { + let ans = Answer { + answer: "x".into(), + citations: vec![], + grounded: true, + refusal_reason: None, + model: ModelRef { + id: "m".into(), + provider: "p".into(), + dimensions: None, + }, + embedding: None, + prompt_template_version: PromptTemplateVersion("rag-v2".into()), + retrieval: AnswerRetrievalSummary { + trace_id: TraceId("t".into()), + mode: crate::SearchMode::Lexical, + k: 1, + score_gate: 0.0, + top_score: 0.0, + chunks_returned: 0, + chunks_used: 0, + }, + usage: TokenUsage { + prompt_tokens: 0, + completion_tokens: 0, + latency_ms: 0, + }, + created_at: datetime!(2026-05-09 12:00:00 UTC), + conversation_id: None, + turn_index: None, + hops: None, + verification: None, + }; + let v = serde_json::to_value(&ans).unwrap(); + assert!( + v.get("verification").is_none(), + "verification: None must be omitted from wire output, got: {v}" + ); + } + + #[test] + fn verification_summary_serializes_all_three_required_fields() { + let vs = VerificationSummary { + nli_score: 0.87, + nli_threshold: 0.5, + nli_passed: true, + }; + let v = serde_json::to_value(vs).unwrap(); + assert!((v["nli_score"].as_f64().unwrap() - 0.87).abs() < 1e-5); + assert!((v["nli_threshold"].as_f64().unwrap() - 0.5).abs() < 1e-5); + assert_eq!(v["nli_passed"], true); + } + #[test] fn answer_citation_serializes_indexed_at_and_stale() { let ac = AnswerCitation { diff --git a/crates/kebab-core/src/lib.rs b/crates/kebab-core/src/lib.rs index b534bf0..d8838c0 100644 --- a/crates/kebab-core/src/lib.rs +++ b/crates/kebab-core/src/lib.rs @@ -57,7 +57,7 @@ pub use search::{ }; pub use answer::{ Answer, AnswerCitation, AnswerRetrievalSummary, HopKind, HopRecord, ModelRef, - RefusalReason, TokenUsage, TraceId, Turn, + RefusalReason, TokenUsage, TraceId, Turn, VerificationSummary, }; pub use ingest::{IngestItem, IngestItemKind, IngestReport, SkipExamples}; pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; diff --git 
a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs index 15d5cd6..d9565a8 100644 --- a/crates/kebab-eval/src/metrics.rs +++ b/crates/kebab-eval/src/metrics.rs @@ -533,6 +533,7 @@ mod tests { conversation_id: None, turn_index: None, hops: None, + verification: None, } } diff --git a/crates/kebab-rag/Cargo.toml b/crates/kebab-rag/Cargo.toml index 7afcfad..c0c3baf 100644 --- a/crates/kebab-rag/Cargo.toml +++ b/crates/kebab-rag/Cargo.toml @@ -12,6 +12,7 @@ kebab-core = { path = "../kebab-core" } kebab-config = { path = "../kebab-config" } kebab-search = { path = "../kebab-search" } kebab-llm = { path = "../kebab-llm" } +kebab-nli = { path = "../kebab-nli" } kebab-store-sqlite = { path = "../kebab-store-sqlite" } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs index 50230d0..8d77eec 100644 --- a/crates/kebab-rag/src/pipeline.rs +++ b/crates/kebab-rag/src/pipeline.rs @@ -197,6 +197,14 @@ pub struct RagPipeline { retriever: Arc, llm: Arc, docs: Arc, + /// p9-fb-41 PR-9c-1: optional NLI verifier injected via + /// [`Self::with_verifier`]. Not yet read — PR-9c-2 wires the + /// `ask_multi_hop` step 8.5 (post-synthesize gate) that consumes + /// it. Until then the field is `#[allow(dead_code)]`; the + /// attribute is removed in the PR-9c-2 commit that adds the + /// read site so leftover dead code can never sneak in. + #[allow(dead_code)] + verifier: Option>, } impl RagPipeline { @@ -204,6 +212,10 @@ impl RagPipeline { /// validated here — callers are expected to pass already-built /// `Arc`'d trait objects (kb-app builds them from config; tests /// inject mocks). + /// + /// The NLI verifier is NOT a constructor arg — it threads in via + /// the [`Self::with_verifier`] builder so the historical 4-arg + /// signature stays stable across the PR-9c-1 surface bump. 
pub fn new( config: kebab_config::Config, retriever: Arc, @@ -215,9 +227,26 @@ impl RagPipeline { retriever, llm, docs, + verifier: None, } } + /// p9-fb-41 PR-9c-1: inject the post-synthesize NLI verifier. + /// Caller (kebab-app facade, PR-9c-2) builds an + /// `Arc` from `cfg.models.nli` when + /// `cfg.rag.nli_threshold > 0`, then chains + /// `RagPipeline::new(...).with_verifier(v)`. + /// + /// Currently unused — PR-9c-2 wires the read site (step 8.5 of + /// `ask_multi_hop`). `#[allow(dead_code)]` survives only until + /// that PR's commit, which removes it together with adding the + /// hook that reads `self.verifier`. + #[allow(dead_code)] + pub fn with_verifier(mut self, v: Arc) -> Self { + self.verifier = Some(v); + self + } + /// p9-fb-15: convenience for multi-turn ask. Stuffs `history`, /// `conversation_id`, `turn_index` into a fresh `AskOpts` (built /// from `opts.mode` + carried-through knobs) and forwards to @@ -537,6 +566,10 @@ impl RagPipeline { // only the multi-hop happy path will set `Some(...)` in // Step 5 once the decide loop populates a hop trace. hops: None, + // p9-fb-41 PR-9c-1: surface-only field — single-pass + // never verifies (multi-hop step 8.5 is the only path + // that stamps `Some(...)`, wired in PR-9c-2). + verification: None, }; // Drop the moved `finish_reason` early into a tracing breadcrumb; the @@ -1068,6 +1101,11 @@ impl RagPipeline { // currently lose the trace (cleanup deferred — would // require widening helper signatures, PR-3b-ii / follow-up). hops: Some(hops), + // p9-fb-41 PR-9c-1: surface-only field — PR-9c-2 wires + // step 8.5 between citation-validate and Answer-build to + // stamp this with the actual NLI score when + // `cfg.rag.nli_threshold > 0`. Until then, stays None. + verification: None, }; tracing::debug!( @@ -1276,6 +1314,9 @@ impl RagPipeline { // only the multi-hop happy path will set `Some(...)` in // Step 5 once the decide loop populates a hop trace. 
hops: None, + // p9-fb-41 PR-9c-1: surface-only field — decompose-failure + // refusal never reaches the NLI gate. + verification: None, }; if let Some(sink) = &opts.stream_sink { let _ = sink.send(StreamEvent::Final { @@ -1411,6 +1452,9 @@ impl RagPipeline { // stays `skip_serializing_if = None`, so single-pass // wire output is unchanged. hops, + // p9-fb-41 PR-9c-1: NoChunks refusal never reaches the + // synthesize / NLI gate. + verification: None, }; if let Err(e) = self.docs.put_answer(&answer, query, None) { tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (NoChunks) failed"); @@ -1501,6 +1545,9 @@ impl RagPipeline { turn_index: opts.turn_index, // p9-fb-41 PR-3b-ii: see refuse_no_chunks' identical comment. hops, + // p9-fb-41 PR-9c-1: ScoreGate refusal never reaches the + // synthesize / NLI gate. + verification: None, }; if let Err(e) = self.docs.put_answer(&answer, query, None) { tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (ScoreGate) failed"); @@ -2134,6 +2181,7 @@ mod stream_event_serde_tests { conversation_id: None, turn_index: None, hops: None, + verification: None, }; let ev = StreamEvent::Final { answer }; let v = serde_json::to_value(&ev).unwrap(); diff --git a/crates/kebab-store-sqlite/src/answers.rs b/crates/kebab-store-sqlite/src/answers.rs index 6efe244..9d3eec8 100644 --- a/crates/kebab-store-sqlite/src/answers.rs +++ b/crates/kebab-store-sqlite/src/answers.rs @@ -100,6 +100,12 @@ fn refusal_reason_label(r: &RefusalReason) -> &'static str { RefusalReason::NoChunks => "no_chunks", RefusalReason::LlmStreamAborted => "llm_stream_aborted", RefusalReason::MultiHopDecomposeFailed => "multi_hop_decompose_failed", + // p9-fb-41 PR-9c-1: mirror the serde(rename_all="snake_case") + // wire form. PR-9c-2 surfaces these on actual answers when + // `[rag].nli_threshold > 0`; the labels exist now so the + // match stays exhaustive without `_ => unreachable!()`. 
+ RefusalReason::NliVerificationFailed => "nli_verification_failed", + RefusalReason::NliModelUnavailable => "nli_model_unavailable", } } diff --git a/crates/kebab-tui/src/ask.rs b/crates/kebab-tui/src/ask.rs index f9c4707..6a6dccf 100644 --- a/crates/kebab-tui/src/ask.rs +++ b/crates/kebab-tui/src/ask.rs @@ -261,6 +261,16 @@ fn render_status(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme:: Some(RefusalReason::MultiHopDecomposeFailed) => { " refusal=multi_hop_decompose_failed" } + // p9-fb-41 PR-9c-1: NLI refusals don't yet appear on + // live answers (PR-9c-2 wires the gate), but the + // match must stay exhaustive so the new variants + // compile without `_ => unreachable!()`. + Some(RefusalReason::NliVerificationFailed) => { + " refusal=nli_verification_failed" + } + Some(RefusalReason::NliModelUnavailable) => { + " refusal=nli_model_unavailable" + } None => "", }; let mut lines = vec![ diff --git a/crates/kebab-tui/tests/ask.rs b/crates/kebab-tui/tests/ask.rs index 47e7ba4..066ff7e 100644 --- a/crates/kebab-tui/tests/ask.rs +++ b/crates/kebab-tui/tests/ask.rs @@ -78,6 +78,7 @@ fn make_answer(grounded: bool, refusal: Option, body: &str) -> An conversation_id: None, turn_index: None, hops: None, + verification: None, } } diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index dfa38db..79a68d2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -68,6 +68,7 @@ flowchart TB llmlocal["kebab-llm-local
<br/>(Ollama)"] search["kebab-search"] rag["kebab-rag"] + nli["kebab-nli<br/>
(NLI verifier, fb-41)"] end eval["kebab-eval"] config["kebab-config"] @@ -106,6 +107,9 @@ flowchart TB rag --> search rag --> llm rag --> sqlite + rag --> nli + app --> nli + nli --> config search --> sqlite search --> vector search --> embed @@ -181,6 +185,7 @@ kebab/ │ ├── kebab-store-vector/ # LanceDB VectorStore (P3-3, P7-3 follow-up) │ ├── kebab-llm/ kebab-llm-local/ # LanguageModel trait + Ollama adapter (P4-1, P4-2) │ ├── kebab-rag/ # RAG pipeline (P4-3) +│ ├── kebab-nli/ # NLI verifier (mDeBERTa-v3 XNLI, fb-41 PR-9a/9b/9c-1) │ ├── kebab-eval/ # golden query runner + metrics (P5-1, P5-2) │ ├── kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6) │ ├── kebab-parse-pdf/ # lopdf per-page text extractor (P7-1) diff --git a/docs/wire-schema/v1/answer.schema.json b/docs/wire-schema/v1/answer.schema.json index da8ac27..8d9c0c3 100644 --- a/docs/wire-schema/v1/answer.schema.json +++ b/docs/wire-schema/v1/answer.schema.json @@ -30,12 +30,14 @@ "no_index", "no_chunks", "llm_stream_aborted", - "multi_hop_decompose_failed" + "multi_hop_decompose_failed", + "nli_verification_failed", + "nli_model_unavailable" ] }, { "type": "null" } ], - "description": "p9-fb-41: `multi_hop_decompose_failed` added in PR-2 alongside the multi-hop pipeline skeleton (only emitted when AskOpts.multi_hop = true and the decompose LLM call fails to parse). Other variants are unchanged from earlier phases." + "description": "p9-fb-41: `multi_hop_decompose_failed` added in PR-2 alongside the multi-hop pipeline skeleton (only emitted when AskOpts.multi_hop = true and the decompose LLM call fails to parse). `nli_verification_failed` + `nli_model_unavailable` added in PR-9c-1 — both emitted only on the multi-hop path when `[rag].nli_threshold > 0`; surface declared in PR-9c-1, behavior wired in PR-9c-2." }, "model": { "type": "object" }, "embedding": { "type": ["object", "null"] }, @@ -61,6 +63,13 @@ { "type": "null" } ], "description": "p9-fb-41 multi-hop trace. 
Present (non-null array) only when the ask routed through the multi-hop pipeline (`AskOpts.multi_hop = true`); single-pass answers omit the field entirely (serde `skip_serializing_if = None`). Each entry records one LLM hop — decompose / decide / synthesize — with sub-queries, retrieval count, and per-hop latency. Wire-additive: pre-fb-41 readers tolerate the missing field; new readers branch on its presence to render the per-hop trace." + }, + "verification": { + "anyOf": [ + { "$ref": "#/$defs/VerificationSummary" }, + { "type": "null" } + ], + "description": "p9-fb-41 PR-9c-1: NLI-based groundedness verification summary. Present only when `[rag].nli_threshold > 0` and multi-hop ask reached step 8.5 (single-pass ask never verifies). Surface declared in PR-9c-1; the actual stamp lands in PR-9c-2. Wire-additive: pre-v0.18 readers tolerate the missing field." } }, "$defs": { @@ -97,6 +106,24 @@ "description": "Wall-clock latency of the LLM call for this hop. `0` is overloaded — means 'no LLM call happened' when (a) the Decide hop was skipped due to forced_stop or (b) the pool was empty before any decide could run. Treat 0 as absent or instantaneous." } } + }, + "VerificationSummary": { + "type": "object", + "required": ["nli_score", "nli_threshold", "nli_passed"], + "properties": { + "nli_score": { + "type": "number", + "description": "p9-fb-41 PR-9c-1: NLI entailment channel score (faithfulness) — `NliScores::faithfulness()` of `(premise = packed chunks, hypothesis = answer)`." + }, + "nli_threshold": { + "type": "number", + "description": "p9-fb-41 PR-9c-1: mirror of `[rag].nli_threshold` at the time the verification ran (audit field — same value the pipeline gates on)." + }, + "nli_passed": { + "type": "boolean", + "description": "p9-fb-41 PR-9c-1: `nli_score >= nli_threshold`. When false, the matching wire emit also carries `refusal_reason = \"nli_verification_failed\"`." 
+ } + } } } } diff --git a/docs/wire-schema/v1/error.schema.json b/docs/wire-schema/v1/error.schema.json index d113b65..04bfdf6 100644 --- a/docs/wire-schema/v1/error.schema.json +++ b/docs/wire-schema/v1/error.schema.json @@ -17,14 +17,16 @@ "timeout", "io_error", "generic", - "multi_hop_decompose_failed" + "multi_hop_decompose_failed", + "nli_verification_failed", + "nli_model_unavailable" ] }, "message": { "type": "string" }, "details": { "type": "object", "additionalProperties": true, - "description": "Per-code free-form context. config_invalid: { path, cause }. not_indexed: { expected, found }. model_unreachable: { endpoint, source }. model_not_pulled: { model }. timeout: { source }. io_error: { kind }. generic: { chain (when --verbose) }. multi_hop_decompose_failed: {} (reserved — currently emitted as Answer.refusal_reason on stdout, NOT as error.v1 on stderr; the enum member is forward-looking for a future RefusalReason → error_wire promotion)." + "description": "Per-code free-form context. config_invalid: { path, cause }. not_indexed: { expected, found }. model_unreachable: { endpoint, source }. model_not_pulled: { model }. timeout: { source }. io_error: { kind }. generic: { chain (when --verbose) }. multi_hop_decompose_failed: {} (reserved — currently emitted as Answer.refusal_reason on stdout, NOT as error.v1 on stderr; the enum member is forward-looking for a future RefusalReason → error_wire promotion). nli_verification_failed: {} (p9-fb-41 PR-9c-1 reserved — currently emitted only as Answer.refusal_reason on stdout; forward-looking for future RefusalReason → error_wire promotion). nli_model_unavailable: {} (p9-fb-41 PR-9c-1 reserved — same pattern as nli_verification_failed)." }, "hint": { "anyOf": [