feat(rag): fb-41 PR-9c-1 — core types + wire scaffolding (NLI verification)
Surface-only PR (no behavior wiring — that's PR-9c-2):
- kebab-core: RefusalReason::NliVerificationFailed + NliModelUnavailable (serde rename_all="snake_case", wire = identical strings).
- kebab-core: Answer.verification: Option<VerificationSummary> field (additive minor wire — pre-v0.18 reader 무영향).
- kebab-core: VerificationSummary { nli_score: f32, nli_threshold: f32, nli_passed: bool } struct + lib.rs 재-export.
- kebab-config: NliCfg { model, provider } + ModelsCfg.nli (default Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7).
- kebab-config: RagCfg.nli_threshold: f32 (default 0.0 = disabled, spec §2.6 single gate).
- kebab-config: env override KEBAB_MODELS_NLI_MODEL/PROVIDER + KEBAB_RAG_NLI_THRESHOLD (parse 실패 시 tracing::warn + default 유지).
- kebab-rag: RagPipeline.verifier: Option<Arc<dyn NliVerifier>> field + with_verifier builder (모두 #[allow(dead_code)] — PR-9c-2 의 step 8.5 hook 가 활성화 시 제거). RagPipeline::new signature 유지 (round-2 NEW-M1 Option B).
- kebab-rag: Cargo.toml 에 kebab-nli path 의존 추가.
- kebab-store-sqlite + kebab-tui: 두 신규 RefusalReason variant 에 대한 exhaustive match arm 추가 (snake_case label / 표시 문구).
- 모든 Answer 구축 site (rag 6 + cli/tui/eval 3 fixture) 에 verification: None 추가.
- wire schemas: answer.schema.json verification field + $defs.VerificationSummary + refusal_reason.enum 2 추가. error.schema.json code.enum + details.description 2 추가 (forward-looking reserved).
- docs/ARCHITECTURE.md: Mermaid Adapters subgraph 의 nli 노드 + rag→nli + app→nli (forward-looking) + nli→config edges. nli→core edge 는 skip (kebab-nli/Cargo.toml direct dep 가 config 만, ARCHITECTURE 컨벤션 = direct deps only). 디렉토리 트리에 crates/kebab-nli/ 추가.
Tests: kebab-core 3 (serde rename + verification skip + struct shape) + kebab-config 6 (defaults + legacy + env + malformed env) + kebab-cli wire 5 (schema verification + enum 검증).
검증: cargo test --workspace -j 1 회귀 0 (pre-existing kebab-mcp::tools_call_ask_multi_hop flaky 1개 동일 — spec 에 명시된 known-flaky). cargo clippy --workspace --all-targets -D warnings clean.
Wire 영향: additive minor — answer.v1 의 verification optional + refusal_reason.enum 확장 + error.v1.code 확장.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -4451,6 +4451,7 @@ dependencies = [
|
||||
"kebab-config",
|
||||
"kebab-core",
|
||||
"kebab-llm",
|
||||
"kebab-nli",
|
||||
"kebab-search",
|
||||
"kebab-store-sqlite",
|
||||
"regex",
|
||||
|
||||
@@ -1653,6 +1653,7 @@ mod tests {
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
hops: None,
|
||||
verification: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -144,3 +144,111 @@ fn error_schema_code_enum_includes_multi_hop_decompose_failed() {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ── p9-fb-41 PR-9c-1: NLI verification surface pins ─────────────────────
|
||||
|
||||
/// Pins the NLI verification surface of `answer.v1`: the schema has to
/// expose a `verification` property (a `VerificationSummary` reference or
/// null) together with a `$defs.VerificationSummary` entry that lists all
/// three fields as required. Protects against accidental deletion or a
/// typo sneaking into later schema edits.
#[test]
fn answer_schema_declares_verification_field_and_defs() {
    let schema = parse_schema("answer.schema.json");
    let verification = &schema["properties"]["verification"];
    assert!(
        verification.is_object(),
        "`verification` property must be declared on answer.v1"
    );

    // The property is object-or-null: a multi-hop ask with threshold > 0
    // emits the summary object, every other path omits the field.
    let variants = verification["anyOf"]
        .as_array()
        .expect("verification must declare anyOf (object | null)");
    let allows_null = variants.iter().any(|variant| variant["type"] == "null");
    assert!(
        allows_null,
        "verification anyOf must include null (single-pass / disabled gate omits the field)"
    );
    let references_summary = variants
        .iter()
        .any(|variant| variant["$ref"].as_str() == Some("#/$defs/VerificationSummary"));
    assert!(
        references_summary,
        "verification anyOf must $ref VerificationSummary"
    );

    // The referenced definition must exist and require every field.
    let summary_def = &schema["$defs"]["VerificationSummary"];
    assert!(
        summary_def.is_object(),
        "$defs.VerificationSummary must be declared so verification.anyOf can $ref it"
    );
    let required: Vec<&str> = summary_def["required"]
        .as_array()
        .expect("VerificationSummary.required must be an array")
        .iter()
        .filter_map(|entry| entry.as_str())
        .collect();
    for needed in ["nli_score", "nli_threshold", "nli_passed"] {
        assert!(
            required.contains(&needed),
            "VerificationSummary.required must include {needed:?}, got {required:?}"
        );
    }
}
|
||||
|
||||
/// `answer.v1` must list `nli_verification_failed` among the
/// `refusal_reason` enum members (the enum lives in one of the `anyOf`
/// alternatives next to the null alternative).
#[test]
fn answer_schema_refusal_reason_enum_includes_nli_verification_failed() {
    let schema = parse_schema("answer.schema.json");
    let alternatives = schema["properties"]["refusal_reason"]["anyOf"]
        .as_array()
        .expect("refusal_reason must declare anyOf");
    // First alternative that carries an `enum` array holds the wire strings.
    let values: Vec<&str> = alternatives
        .iter()
        .find_map(|alternative| alternative["enum"].as_array())
        .expect("one of refusal_reason.anyOf entries must declare an enum")
        .iter()
        .filter_map(|member| member.as_str())
        .collect();
    let wanted = "nli_verification_failed";
    assert!(
        values.contains(&wanted),
        "refusal_reason enum must include `nli_verification_failed`, got {values:?}"
    );
}
|
||||
|
||||
/// `answer.v1` must list `nli_model_unavailable` among the
/// `refusal_reason` enum members (the enum lives in one of the `anyOf`
/// alternatives next to the null alternative).
#[test]
fn answer_schema_refusal_reason_enum_includes_nli_model_unavailable() {
    let schema = parse_schema("answer.schema.json");
    let alternatives = schema["properties"]["refusal_reason"]["anyOf"]
        .as_array()
        .expect("refusal_reason must declare anyOf");
    // First alternative that carries an `enum` array holds the wire strings.
    let values: Vec<&str> = alternatives
        .iter()
        .find_map(|alternative| alternative["enum"].as_array())
        .expect("one of refusal_reason.anyOf entries must declare an enum")
        .iter()
        .filter_map(|member| member.as_str())
        .collect();
    let wanted = "nli_model_unavailable";
    assert!(
        values.contains(&wanted),
        "refusal_reason enum must include `nli_model_unavailable`, got {values:?}"
    );
}
|
||||
|
||||
/// `error.v1` must reserve `nli_verification_failed` in its `code` enum.
#[test]
fn error_schema_code_enum_includes_nli_verification_failed() {
    let schema = parse_schema("error.schema.json");
    let values: Vec<&str> = schema["properties"]["code"]["enum"]
        .as_array()
        .expect("error.v1 must declare code.enum")
        .iter()
        .filter_map(|member| member.as_str())
        .collect();
    let wanted = "nli_verification_failed";
    assert!(
        values.contains(&wanted),
        "error.v1 code enum must include forward-looking `nli_verification_failed`, got {values:?}"
    );
}
|
||||
|
||||
/// `error.v1` must reserve `nli_model_unavailable` in its `code` enum.
#[test]
fn error_schema_code_enum_includes_nli_model_unavailable() {
    let schema = parse_schema("error.schema.json");
    let values: Vec<&str> = schema["properties"]["code"]["enum"]
        .as_array()
        .expect("error.v1 must declare code.enum")
        .iter()
        .filter_map(|member| member.as_str())
        .collect();
    let wanted = "nli_model_unavailable";
    assert!(
        values.contains(&wanted),
        "error.v1 code enum must include forward-looking `nli_model_unavailable`, got {values:?}"
    );
}
|
||||
|
||||
@@ -103,6 +103,34 @@ pub struct ChunkingCfg {
|
||||
pub struct ModelsCfg {
|
||||
pub embedding: EmbeddingModelCfg,
|
||||
pub llm: LlmCfg,
|
||||
/// p9-fb-41 PR-9c-1: NLI verifier model + provider knob.
|
||||
/// `#[serde(default)]` so pre-v0.18 config files that predate the
|
||||
/// `[models.nli]` section still load with built-in defaults
|
||||
/// (`Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` / `onnx`).
|
||||
/// The verifier itself is gated by `[rag].nli_threshold` — even
|
||||
/// with a model configured here, threshold `0.0` (the default)
|
||||
/// skips the verification step entirely.
|
||||
#[serde(default = "NliCfg::defaults")]
|
||||
pub nli: NliCfg,
|
||||
}
|
||||
|
||||
/// p9-fb-41 PR-9c-1: NLI verifier configuration. The model id flows to
|
||||
/// `OnnxNliVerifier::new` via `kebab-nli` (PR-9c-2 wiring); the provider
|
||||
/// is reserved for future verifier swap-in (currently only `"onnx"` is
|
||||
/// recognized — anything else falls back to the same path).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct NliCfg {
|
||||
pub model: String,
|
||||
pub provider: String,
|
||||
}
|
||||
|
||||
impl NliCfg {
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
model: "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7".to_string(),
|
||||
provider: "onnx".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -213,6 +241,22 @@ pub struct RagCfg {
|
||||
/// cross-doc reasoning over ~5 chunks per iter.
|
||||
#[serde(default = "default_multi_hop_max_pool_chunks")]
|
||||
pub multi_hop_max_pool_chunks: u32,
|
||||
/// p9-fb-41 PR-9c-1: minimum NLI entailment score required for the
|
||||
/// multi-hop synthesize answer to be returned as `grounded=true`
|
||||
/// (spec §2.6 single gate). When the post-synthesize NLI verifier
|
||||
/// returns `NliScores::faithfulness() < nli_threshold` the
|
||||
/// pipeline refuses with `RefusalReason::NliVerificationFailed`.
|
||||
///
|
||||
/// Default `0.0` = verification disabled — no NLI call, multi-hop
|
||||
/// matches its PR-3b behavior exactly. Set to e.g. `0.5` to
|
||||
/// activate the gate. Knob lives on `[rag]` (the gate is a RAG
|
||||
/// policy, not a model property); the model itself comes from
|
||||
/// `[models.nli].model`.
|
||||
///
|
||||
/// Single-pass `ask` ignores this knob entirely — only multi-hop
|
||||
/// runs through the verification step (PR-9c-2 wires it).
|
||||
#[serde(default = "default_nli_threshold")]
|
||||
pub nli_threshold: f32,
|
||||
}
|
||||
|
||||
fn default_multi_hop_max_depth() -> u32 {
|
||||
@@ -227,6 +271,13 @@ fn default_multi_hop_max_pool_chunks() -> u32 {
|
||||
15
|
||||
}
|
||||
|
||||
/// Serde / `Config::defaults` fallback for `[rag].nli_threshold`; always
/// returns `0.0`.
///
/// p9-fb-41 PR-9c-1: NLI gate disabled by default per spec §2.6
|
||||
/// (verification opt-in — users explicitly raise the threshold once
|
||||
/// they're ready to trade refusal-rate for groundedness).
|
||||
fn default_nli_threshold() -> f32 {
|
||||
0.0
|
||||
}
|
||||
|
||||
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
|
||||
/// behaviour (P6-2); `caption` controls vision-LM captioning (P6-3).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -464,6 +515,7 @@ impl Config {
|
||||
seed: 0,
|
||||
request_timeout_secs: default_llm_request_timeout_secs(),
|
||||
},
|
||||
nli: NliCfg::defaults(),
|
||||
},
|
||||
search: SearchCfg {
|
||||
default_k: 10,
|
||||
@@ -482,6 +534,7 @@ impl Config {
|
||||
multi_hop_max_sub_queries_per_iter:
|
||||
default_multi_hop_max_sub_queries_per_iter(),
|
||||
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
|
||||
nli_threshold: default_nli_threshold(),
|
||||
},
|
||||
image: ImageCfg::defaults(),
|
||||
ui: UiCfg::defaults(),
|
||||
@@ -725,6 +778,10 @@ impl Config {
|
||||
}
|
||||
}
|
||||
|
||||
// models.nli (p9-fb-41 PR-9c-1)
|
||||
"KEBAB_MODELS_NLI_MODEL" => self.models.nli.model = v.clone(),
|
||||
"KEBAB_MODELS_NLI_PROVIDER" => self.models.nli.provider = v.clone(),
|
||||
|
||||
// search
|
||||
"KEBAB_SEARCH_DEFAULT_K" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
@@ -780,6 +837,24 @@ impl Config {
|
||||
self.rag.multi_hop_max_pool_chunks = n;
|
||||
}
|
||||
}
|
||||
// p9-fb-41 PR-9c-1: NLI gate threshold. Parse failure
|
||||
// emits a `tracing::warn!` (not silent like the other
|
||||
// numeric env overrides) because this knob gates the
|
||||
// NLI verification entirely — a malformed env value
|
||||
// would silently disable a security-flavored gate the
|
||||
// user thought they enabled, which is the failure mode
|
||||
// most worth surfacing. The prior value (`0.0` by default) survives
|
||||
// on parse failure so behaviour stays well-defined.
|
||||
"KEBAB_RAG_NLI_THRESHOLD" => match v.parse::<f32>() {
|
||||
Ok(f) => self.rag.nli_threshold = f,
|
||||
Err(e) => tracing::warn!(
|
||||
target: "kebab-config",
|
||||
env_key = "KEBAB_RAG_NLI_THRESHOLD",
|
||||
env_value = %v,
|
||||
error = %e,
|
||||
"invalid KEBAB_RAG_NLI_THRESHOLD; keeping prior value (0.0 = NLI gate disabled)"
|
||||
),
|
||||
},
|
||||
|
||||
// image.ocr
|
||||
"KEBAB_IMAGE_OCR_ENABLED" => {
|
||||
@@ -1214,6 +1289,84 @@ theme = "dark"
|
||||
assert_eq!(c.rag.multi_hop_max_pool_chunks, 15);
|
||||
}
|
||||
|
||||
// ── p9-fb-41 PR-9c-1: NLI verification knobs ─────────────────────────
|
||||
|
||||
#[test]
fn default_nli_threshold_is_zero() {
    // Per spec §2.6 the NLI gate ships disabled: verification is opt-in
    // and a `0.0` threshold leaves multi-hop behaving exactly like PR-3b.
    let cfg = Config::defaults();
    assert_eq!(cfg.rag.nli_threshold, 0.0);
}
|
||||
|
||||
#[test]
fn default_nli_model_is_xenova_mdeberta() {
    // The default model id is pinned here so that a refactor touching
    // NliCfg cannot quietly swap in a different verifier model.
    let cfg = Config::defaults();
    let nli = &cfg.models.nli;
    assert_eq!(
        nli.model,
        "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    );
    assert_eq!(nli.provider, "onnx");
}
|
||||
|
||||
/// Config files that predate the `[models.nli]` section and the
/// `nli_threshold` key must keep parsing and resolve to the documented
/// defaults. The fixture is the shared [`LEGACY_PRE_TIMEOUT_TOML`], which
/// is older than every PR-9c-1 field.
#[test]
fn legacy_config_without_nli_uses_defaults() {
    let parsed: Result<Config, _> = toml::from_str(LEGACY_PRE_TIMEOUT_TOML);
    let c = parsed.expect("parse legacy config");
    assert_eq!(c.rag.nli_threshold, 0.0);
    let nli = &c.models.nli;
    assert_eq!(
        nli.model,
        "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    );
    assert_eq!(nli.provider, "onnx");
}
|
||||
|
||||
/// A well-formed `KEBAB_RAG_NLI_THRESHOLD` replaces the configured
/// threshold.
#[test]
fn env_override_nli_threshold() {
    let env = HashMap::from([(
        "KEBAB_RAG_NLI_THRESHOLD".to_string(),
        "0.5".to_string(),
    )]);
    let c = Config::defaults().apply_env(&env);
    let delta = (c.rag.nli_threshold - 0.5).abs();
    assert!(delta < 1e-6);
}
|
||||
|
||||
/// `KEBAB_MODELS_NLI_MODEL` / `KEBAB_MODELS_NLI_PROVIDER` replace the
/// corresponding `[models.nli]` values verbatim.
#[test]
fn env_override_nli_model_and_provider() {
    let env = HashMap::from([
        (
            "KEBAB_MODELS_NLI_MODEL".to_string(),
            "user/custom-nli-model".to_string(),
        ),
        (
            "KEBAB_MODELS_NLI_PROVIDER".to_string(),
            "candle".to_string(),
        ),
    ]);
    let c = Config::defaults().apply_env(&env);
    let nli = &c.models.nli;
    assert_eq!(nli.model, "user/custom-nli-model");
    assert_eq!(nli.provider, "candle");
}
|
||||
|
||||
/// Malformed `KEBAB_RAG_NLI_THRESHOLD` keeps the prior value (does
/// NOT silently disable nor crash). The `tracing::warn!` surface
/// is observable only when the user has tracing wired; the
/// behavior contract is "prior value survives".
///
/// Two starting points are exercised: the built-in default (`0.0`) and a
/// non-default threshold. The second case is what actually distinguishes
/// "keep the prior value" from "reset to the default" — with only the
/// default as the starting point both implementations would pass.
#[test]
fn env_malformed_nli_threshold_keeps_prior_value() {
    let mut env = HashMap::new();
    env.insert(
        "KEBAB_RAG_NLI_THRESHOLD".to_string(),
        "not-a-float".to_string(),
    );

    // Starting from the defaults: the gate stays disabled.
    let c = Config::defaults().apply_env(&env);
    assert_eq!(
        c.rag.nli_threshold, 0.0,
        "malformed env value must keep the default unchanged"
    );

    // Starting from an already-enabled gate: the configured threshold
    // must not be clobbered by the unparsable override.
    let mut enabled = Config::defaults();
    enabled.rag.nli_threshold = 0.5;
    let c = enabled.apply_env(&env);
    assert_eq!(
        c.rag.nli_threshold, 0.5,
        "malformed env value must keep a non-default prior threshold unchanged"
    );
}
|
||||
|
||||
#[test]
|
||||
fn image_ocr_env_overrides() {
|
||||
let mut env = HashMap::new();
|
||||
|
||||
@@ -37,6 +37,29 @@ pub struct Answer {
|
||||
/// reading older single-pass answers see `hops: None` (or absent).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub hops: Option<Vec<HopRecord>>,
|
||||
/// p9-fb-41 PR-9c-1: NLI-based post-synthesis verification summary.
|
||||
/// `None` for single-pass asks and for multi-hop runs with
|
||||
/// `[rag].nli_threshold == 0` (verification disabled — the default).
|
||||
/// Present only when the multi-hop pipeline reached the post-
|
||||
/// synthesize verification step (PR-9c-2 wires step 8.5). Wire-
|
||||
/// additive: `answer.v1` schema_version unchanged; consumers
|
||||
/// reading pre-v0.18 answers see `verification: None` (or absent).
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub verification: Option<VerificationSummary>,
|
||||
}
|
||||
|
||||
/// p9-fb-41 PR-9c-1: post-synthesize NLI verification summary stamped
|
||||
/// onto [`Answer::verification`] when multi-hop runs reach step 8.5
|
||||
/// (NLI gate). Three required fields ride together on every wire emit:
|
||||
/// `nli_score` is the entailment channel of the XNLI verifier,
|
||||
/// `nli_threshold` mirrors `[rag].nli_threshold` for audit, and
|
||||
/// `nli_passed` is `nli_score >= nli_threshold`. The whole struct is
|
||||
/// omitted (serde skip) when no verification ran.
///
/// NOTE(review): the derived `Default` yields `nli_score = 0.0`,
/// `nli_threshold = 0.0` and `nli_passed = false`, which does not satisfy
/// the `nli_passed == (nli_score >= nli_threshold)` relation documented
/// above — confirm nothing treats `VerificationSummary::default()` as a
/// real verification outcome.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct VerificationSummary {
|
||||
/// Entailment score reported by the verifier.
pub nli_score: f32,
|
||||
/// Threshold the score was compared against.
pub nli_threshold: f32,
|
||||
/// Whether the score met the threshold.
pub nli_passed: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -153,6 +176,18 @@ pub enum RefusalReason {
|
||||
/// 단계 진입 못 함. CLI / MCP / TUI 가 받는 wire error code
|
||||
/// = `"multi_hop_decompose_failed"` (PR-4 의 error_wire 매핑).
|
||||
MultiHopDecomposeFailed,
|
||||
/// p9-fb-41 PR-9c-1: post-synthesize NLI verification gate fired —
|
||||
/// `NliScores::faithfulness()` (entailment channel) fell below
|
||||
/// `[rag].nli_threshold`. Wire string = `"nli_verification_failed"`
|
||||
/// (single source of truth: also the matching `error.v1.code`).
|
||||
/// Multi-hop only; behavior wiring lands in PR-9c-2.
|
||||
NliVerificationFailed,
|
||||
/// p9-fb-41 PR-9c-1: NLI verifier was configured (threshold > 0)
|
||||
/// but the model / runtime is unavailable (download failure,
|
||||
/// missing tokenizer, ONNX session init error). Treated as a soft
|
||||
/// refusal — the user sees an unverified-answer outcome rather
|
||||
/// than crashing the ask. Wire string = `"nli_model_unavailable"`.
|
||||
NliModelUnavailable,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -190,6 +225,81 @@ mod tests {
|
||||
use crate::citation::Citation;
|
||||
use time::macros::datetime;
|
||||
|
||||
/// p9-fb-41 PR-9c-1: locks down how the two NLI `RefusalReason` variants
/// are spelled on the wire. These strings have to stay byte-for-byte
/// equal to the members of `answer.schema.json::refusal_reason.enum` and
/// `error.schema.json::code.enum` (single source of truth per spec §2.4).
#[test]
fn refusal_reason_nli_variants_serialize_to_snake_case() {
    let expectations = [
        (
            RefusalReason::NliVerificationFailed,
            "\"nli_verification_failed\"",
        ),
        (
            RefusalReason::NliModelUnavailable,
            "\"nli_model_unavailable\"",
        ),
    ];
    for (variant, wire) in expectations {
        let encoded = serde_json::to_string(&variant).unwrap();
        assert_eq!(encoded, wire);
    }
}
|
||||
|
||||
/// p9-fb-41 PR-9c-1: `Answer.verification` is an `Option` marked
/// `skip_serializing_if = None`, so an answer built with
/// `verification: None` must serialize without any `"verification"` key.
/// That is what makes the field additive: pre-v0.18 readers never see a
/// new key.
#[test]
fn answer_omits_verification_field_when_none() {
    // Minimal sub-structures; only `verification` matters for this test.
    let model = ModelRef {
        id: "m".into(),
        provider: "p".into(),
        dimensions: None,
    };
    let retrieval = AnswerRetrievalSummary {
        trace_id: TraceId("t".into()),
        mode: crate::SearchMode::Lexical,
        k: 1,
        score_gate: 0.0,
        top_score: 0.0,
        chunks_returned: 0,
        chunks_used: 0,
    };
    let usage = TokenUsage {
        prompt_tokens: 0,
        completion_tokens: 0,
        latency_ms: 0,
    };
    let answer = Answer {
        answer: "x".into(),
        citations: vec![],
        grounded: true,
        refusal_reason: None,
        model,
        embedding: None,
        prompt_template_version: PromptTemplateVersion("rag-v2".into()),
        retrieval,
        usage,
        created_at: datetime!(2026-05-09 12:00:00 UTC),
        conversation_id: None,
        turn_index: None,
        hops: None,
        verification: None,
    };

    let v = serde_json::to_value(&answer).unwrap();
    let key_present = v.get("verification").is_some();
    assert!(
        !key_present,
        "verification: None must be omitted from wire output, got: {v}"
    );
}
|
||||
|
||||
/// A populated `VerificationSummary` serializes all three fields under
/// their wire names. Floats are compared with a tolerance because the
/// `f32` values widen to `f64` in the JSON value.
#[test]
fn verification_summary_serializes_all_three_required_fields() {
    let summary = VerificationSummary {
        nli_score: 0.87,
        nli_threshold: 0.5,
        nli_passed: true,
    };
    let wire = serde_json::to_value(summary).unwrap();

    let score = wire["nli_score"].as_f64().unwrap();
    assert!((score - 0.87).abs() < 1e-5);
    let threshold = wire["nli_threshold"].as_f64().unwrap();
    assert!((threshold - 0.5).abs() < 1e-5);
    assert_eq!(wire["nli_passed"], true);
}
|
||||
|
||||
#[test]
|
||||
fn answer_citation_serializes_indexed_at_and_stale() {
|
||||
let ac = AnswerCitation {
|
||||
|
||||
@@ -57,7 +57,7 @@ pub use search::{
|
||||
};
|
||||
pub use answer::{
|
||||
Answer, AnswerCitation, AnswerRetrievalSummary, HopKind, HopRecord, ModelRef,
|
||||
RefusalReason, TokenUsage, TraceId, Turn,
|
||||
RefusalReason, TokenUsage, TraceId, Turn, VerificationSummary,
|
||||
};
|
||||
pub use ingest::{IngestItem, IngestItemKind, IngestReport, SkipExamples};
|
||||
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
|
||||
@@ -533,6 +533,7 @@ mod tests {
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
hops: None,
|
||||
verification: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-search = { path = "../kebab-search" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-nli = { path = "../kebab-nli" }
|
||||
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
||||
@@ -197,6 +197,14 @@ pub struct RagPipeline {
|
||||
retriever: Arc<dyn Retriever>,
|
||||
llm: Arc<dyn LanguageModel>,
|
||||
docs: Arc<SqliteStore>,
|
||||
/// p9-fb-41 PR-9c-1: optional NLI verifier injected via
|
||||
/// [`Self::with_verifier`]. Not yet read — PR-9c-2 wires the
|
||||
/// `ask_multi_hop` step 8.5 (post-synthesize gate) that consumes
|
||||
/// it. Until then the field is `#[allow(dead_code)]`; the
|
||||
/// attribute is removed in the PR-9c-2 commit that adds the
|
||||
/// read site so leftover dead code can never sneak in.
|
||||
#[allow(dead_code)]
|
||||
verifier: Option<Arc<dyn kebab_nli::NliVerifier>>,
|
||||
}
|
||||
|
||||
impl RagPipeline {
|
||||
@@ -204,6 +212,10 @@ impl RagPipeline {
|
||||
/// validated here — callers are expected to pass already-built
|
||||
/// `Arc`'d trait objects (kb-app builds them from config; tests
|
||||
/// inject mocks).
|
||||
///
|
||||
/// The NLI verifier is NOT a constructor arg — it threads in via
|
||||
/// the [`Self::with_verifier`] builder so the historical 4-arg
|
||||
/// signature stays stable across the PR-9c-1 surface bump.
|
||||
pub fn new(
|
||||
config: kebab_config::Config,
|
||||
retriever: Arc<dyn Retriever>,
|
||||
@@ -215,9 +227,26 @@ impl RagPipeline {
|
||||
retriever,
|
||||
llm,
|
||||
docs,
|
||||
verifier: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-41 PR-9c-1: inject the post-synthesize NLI verifier.
|
||||
/// Caller (kebab-app facade, PR-9c-2) builds an
|
||||
/// `Arc<OnnxNliVerifier>` from `cfg.models.nli` when
|
||||
/// `cfg.rag.nli_threshold > 0`, then chains
|
||||
/// `RagPipeline::new(...).with_verifier(v)`.
|
||||
///
|
||||
/// Currently unused — PR-9c-2 wires the read site (step 8.5 of
|
||||
/// `ask_multi_hop`). `#[allow(dead_code)]` survives only until
|
||||
/// that PR's commit, which removes it together with adding the
|
||||
/// hook that reads `self.verifier`.
|
||||
#[allow(dead_code)]
|
||||
pub fn with_verifier(mut self, v: Arc<dyn kebab_nli::NliVerifier>) -> Self {
|
||||
self.verifier = Some(v);
|
||||
self
|
||||
}
|
||||
|
||||
/// p9-fb-15: convenience for multi-turn ask. Stuffs `history`,
|
||||
/// `conversation_id`, `turn_index` into a fresh `AskOpts` (built
|
||||
/// from `opts.mode` + carried-through knobs) and forwards to
|
||||
@@ -537,6 +566,10 @@ impl RagPipeline {
|
||||
// only the multi-hop happy path will set `Some(...)` in
|
||||
// Step 5 once the decide loop populates a hop trace.
|
||||
hops: None,
|
||||
// p9-fb-41 PR-9c-1: surface-only field — single-pass
|
||||
// never verifies (multi-hop step 8.5 is the only path
|
||||
// that stamps `Some(...)`, wired in PR-9c-2).
|
||||
verification: None,
|
||||
};
|
||||
|
||||
// Drop the moved `finish_reason` early into a tracing breadcrumb; the
|
||||
@@ -1068,6 +1101,11 @@ impl RagPipeline {
|
||||
// currently lose the trace (cleanup deferred — would
|
||||
// require widening helper signatures, PR-3b-ii / follow-up).
|
||||
hops: Some(hops),
|
||||
// p9-fb-41 PR-9c-1: surface-only field — PR-9c-2 wires
|
||||
// step 8.5 between citation-validate and Answer-build to
|
||||
// stamp this with the actual NLI score when
|
||||
// `cfg.rag.nli_threshold > 0`. Until then, stays None.
|
||||
verification: None,
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
@@ -1276,6 +1314,9 @@ impl RagPipeline {
|
||||
// only the multi-hop happy path will set `Some(...)` in
|
||||
// Step 5 once the decide loop populates a hop trace.
|
||||
hops: None,
|
||||
// p9-fb-41 PR-9c-1: surface-only field — decompose-failure
|
||||
// refusal never reaches the NLI gate.
|
||||
verification: None,
|
||||
};
|
||||
if let Some(sink) = &opts.stream_sink {
|
||||
let _ = sink.send(StreamEvent::Final {
|
||||
@@ -1411,6 +1452,9 @@ impl RagPipeline {
|
||||
// stays `skip_serializing_if = None`, so single-pass
|
||||
// wire output is unchanged.
|
||||
hops,
|
||||
// p9-fb-41 PR-9c-1: NoChunks refusal never reaches the
|
||||
// synthesize / NLI gate.
|
||||
verification: None,
|
||||
};
|
||||
if let Err(e) = self.docs.put_answer(&answer, query, None) {
|
||||
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (NoChunks) failed");
|
||||
@@ -1501,6 +1545,9 @@ impl RagPipeline {
|
||||
turn_index: opts.turn_index,
|
||||
// p9-fb-41 PR-3b-ii: see refuse_no_chunks' identical comment.
|
||||
hops,
|
||||
// p9-fb-41 PR-9c-1: ScoreGate refusal never reaches the
|
||||
// synthesize / NLI gate.
|
||||
verification: None,
|
||||
};
|
||||
if let Err(e) = self.docs.put_answer(&answer, query, None) {
|
||||
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (ScoreGate) failed");
|
||||
@@ -2134,6 +2181,7 @@ mod stream_event_serde_tests {
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
hops: None,
|
||||
verification: None,
|
||||
};
|
||||
let ev = StreamEvent::Final { answer };
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
|
||||
@@ -100,6 +100,12 @@ fn refusal_reason_label(r: &RefusalReason) -> &'static str {
|
||||
RefusalReason::NoChunks => "no_chunks",
|
||||
RefusalReason::LlmStreamAborted => "llm_stream_aborted",
|
||||
RefusalReason::MultiHopDecomposeFailed => "multi_hop_decompose_failed",
|
||||
// p9-fb-41 PR-9c-1: mirror the serde(rename_all="snake_case")
|
||||
// wire form. PR-9c-2 surfaces these on actual answers when
|
||||
// `[rag].nli_threshold > 0`; the labels exist now so the
|
||||
// match stays exhaustive without `_ => unreachable!()`.
|
||||
RefusalReason::NliVerificationFailed => "nli_verification_failed",
|
||||
RefusalReason::NliModelUnavailable => "nli_model_unavailable",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -261,6 +261,16 @@ fn render_status(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme::
|
||||
Some(RefusalReason::MultiHopDecomposeFailed) => {
|
||||
" refusal=multi_hop_decompose_failed"
|
||||
}
|
||||
// p9-fb-41 PR-9c-1: NLI refusals don't yet appear on
|
||||
// live answers (PR-9c-2 wires the gate), but the
|
||||
// match must stay exhaustive so the new variants
|
||||
// compile without `_ => unreachable!()`.
|
||||
Some(RefusalReason::NliVerificationFailed) => {
|
||||
" refusal=nli_verification_failed"
|
||||
}
|
||||
Some(RefusalReason::NliModelUnavailable) => {
|
||||
" refusal=nli_model_unavailable"
|
||||
}
|
||||
None => "",
|
||||
};
|
||||
let mut lines = vec![
|
||||
|
||||
@@ -78,6 +78,7 @@ fn make_answer(grounded: bool, refusal: Option<RefusalReason>, body: &str) -> An
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
hops: None,
|
||||
verification: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ flowchart TB
|
||||
llmlocal["kebab-llm-local<br/>(Ollama)"]
|
||||
search["kebab-search"]
|
||||
rag["kebab-rag"]
|
||||
nli["kebab-nli<br/>(NLI verifier, fb-41)"]
|
||||
end
|
||||
eval["kebab-eval"]
|
||||
config["kebab-config"]
|
||||
@@ -106,6 +107,9 @@ flowchart TB
|
||||
rag --> search
|
||||
rag --> llm
|
||||
rag --> sqlite
|
||||
rag --> nli
|
||||
app --> nli
|
||||
nli --> config
|
||||
search --> sqlite
|
||||
search --> vector
|
||||
search --> embed
|
||||
@@ -181,6 +185,7 @@ kebab/
|
||||
│ ├── kebab-store-vector/ # LanceDB VectorStore (P3-3, P7-3 follow-up)
|
||||
│ ├── kebab-llm/ kebab-llm-local/ # LanguageModel trait + Ollama adapter (P4-1, P4-2)
|
||||
│ ├── kebab-rag/ # RAG pipeline (P4-3)
|
||||
│ ├── kebab-nli/ # NLI verifier (mDeBERTa-v3 XNLI, fb-41 PR-9a/9b/9c-1)
|
||||
│ ├── kebab-eval/ # golden query runner + metrics (P5-1, P5-2)
|
||||
│ ├── kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6)
|
||||
│ ├── kebab-parse-pdf/ # lopdf per-page text extractor (P7-1)
|
||||
|
||||
@@ -30,12 +30,14 @@
|
||||
"no_index",
|
||||
"no_chunks",
|
||||
"llm_stream_aborted",
|
||||
"multi_hop_decompose_failed"
|
||||
"multi_hop_decompose_failed",
|
||||
"nli_verification_failed",
|
||||
"nli_model_unavailable"
|
||||
]
|
||||
},
|
||||
{ "type": "null" }
|
||||
],
|
||||
"description": "p9-fb-41: `multi_hop_decompose_failed` added in PR-2 alongside the multi-hop pipeline skeleton (only emitted when AskOpts.multi_hop = true and the decompose LLM call fails to parse). Other variants are unchanged from earlier phases."
|
||||
"description": "p9-fb-41: `multi_hop_decompose_failed` added in PR-2 alongside the multi-hop pipeline skeleton (only emitted when AskOpts.multi_hop = true and the decompose LLM call fails to parse). `nli_verification_failed` + `nli_model_unavailable` added in PR-9c-1 — both emitted only on the multi-hop path when `[rag].nli_threshold > 0`; surface declared in PR-9c-1, behavior wired in PR-9c-2."
|
||||
},
|
||||
"model": { "type": "object" },
|
||||
"embedding": { "type": ["object", "null"] },
|
||||
@@ -61,6 +63,13 @@
|
||||
{ "type": "null" }
|
||||
],
|
||||
"description": "p9-fb-41 multi-hop trace. Present (non-null array) only when the ask routed through the multi-hop pipeline (`AskOpts.multi_hop = true`); single-pass answers omit the field entirely (serde `skip_serializing_if = None`). Each entry records one LLM hop — decompose / decide / synthesize — with sub-queries, retrieval count, and per-hop latency. Wire-additive: pre-fb-41 readers tolerate the missing field; new readers branch on its presence to render the per-hop trace."
|
||||
},
|
||||
"verification": {
|
||||
"anyOf": [
|
||||
{ "$ref": "#/$defs/VerificationSummary" },
|
||||
{ "type": "null" }
|
||||
],
|
||||
"description": "p9-fb-41 PR-9c-1: NLI-based groundedness verification summary. Present only when `[rag].nli_threshold > 0` and multi-hop ask reached step 8.5 (single-pass ask never verifies). Surface declared in PR-9c-1; the actual stamp lands in PR-9c-2. Wire-additive: pre-v0.18 readers tolerate the missing field."
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
@@ -97,6 +106,24 @@
|
||||
"description": "Wall-clock latency of the LLM call for this hop. `0` is overloaded — means 'no LLM call happened' when (a) the Decide hop was skipped due to forced_stop or (b) the pool was empty before any decide could run. Treat 0 as absent or instantaneous."
|
||||
}
|
||||
}
|
||||
},
|
||||
"VerificationSummary": {
|
||||
"type": "object",
|
||||
"required": ["nli_score", "nli_threshold", "nli_passed"],
|
||||
"properties": {
|
||||
"nli_score": {
|
||||
"type": "number",
|
||||
"description": "p9-fb-41 PR-9c-1: NLI entailment channel score (faithfulness) — `NliScores::faithfulness()` of `(premise = packed chunks, hypothesis = answer)`."
|
||||
},
|
||||
"nli_threshold": {
|
||||
"type": "number",
|
||||
"description": "p9-fb-41 PR-9c-1: mirror of `[rag].nli_threshold` at the time the verification ran (audit field — same value the pipeline gates on)."
|
||||
},
|
||||
"nli_passed": {
|
||||
"type": "boolean",
|
||||
"description": "p9-fb-41 PR-9c-1: `nli_score >= nli_threshold`. When false, the matching wire emit also carries `refusal_reason = \"nli_verification_failed\"`."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,14 +17,16 @@
|
||||
"timeout",
|
||||
"io_error",
|
||||
"generic",
|
||||
"multi_hop_decompose_failed"
|
||||
"multi_hop_decompose_failed",
|
||||
"nli_verification_failed",
|
||||
"nli_model_unavailable"
|
||||
]
|
||||
},
|
||||
"message": { "type": "string" },
|
||||
"details": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "Per-code free-form context. config_invalid: { path, cause }. not_indexed: { expected, found }. model_unreachable: { endpoint, source }. model_not_pulled: { model }. timeout: { source }. io_error: { kind }. generic: { chain (when --verbose) }. multi_hop_decompose_failed: {} (reserved — currently emitted as Answer.refusal_reason on stdout, NOT as error.v1 on stderr; the enum member is forward-looking for a future RefusalReason → error_wire promotion)."
|
||||
"description": "Per-code free-form context. config_invalid: { path, cause }. not_indexed: { expected, found }. model_unreachable: { endpoint, source }. model_not_pulled: { model }. timeout: { source }. io_error: { kind }. generic: { chain (when --verbose) }. multi_hop_decompose_failed: {} (reserved — currently emitted as Answer.refusal_reason on stdout, NOT as error.v1 on stderr; the enum member is forward-looking for a future RefusalReason → error_wire promotion). nli_verification_failed: {} (p9-fb-41 PR-9c-1 reserved — currently emitted only as Answer.refusal_reason on stdout; forward-looking for future RefusalReason → error_wire promotion). nli_model_unavailable: {} (p9-fb-41 PR-9c-1 reserved — same pattern as nli_verification_failed)."
|
||||
},
|
||||
"hint": {
|
||||
"anyOf": [
|
||||
|
||||
Reference in New Issue
Block a user