Surface-only PR (no behavior wiring — that's PR-9c-2):
- kebab-core: RefusalReason::NliVerificationFailed + NliModelUnavailable (serde rename_all="snake_case", wire = identical strings).
- kebab-core: Answer.verification: Option<VerificationSummary> field (additive minor wire — pre-v0.18 reader 무영향).
- kebab-core: VerificationSummary { nli_score: f32, nli_threshold: f32, nli_passed: bool } struct + lib.rs 재-export.
- kebab-config: NliCfg { model, provider } + ModelsCfg.nli (default Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7).
- kebab-config: RagCfg.nli_threshold: f32 (default 0.0 = disabled, spec §2.6 single gate).
- kebab-config: env override KEBAB_MODELS_NLI_MODEL/PROVIDER + KEBAB_RAG_NLI_THRESHOLD (parse 실패 시 tracing::warn + default 유지).
- kebab-rag: RagPipeline.verifier: Option<Arc<dyn NliVerifier>> field + with_verifier builder (모두 #[allow(dead_code)] — PR-9c-2 의 step 8.5 hook 가 활성화 시 제거). RagPipeline::new signature 유지 (round-2 NEW-M1 Option B).
- kebab-rag: Cargo.toml 에 kebab-nli path 의존 추가.
- kebab-store-sqlite + kebab-tui: 두 신규 RefusalReason variant 에 대한 exhaustive match arm 추가 (snake_case label / 표시 문구).
- 모든 Answer 구축 site (rag 6 + cli/tui/eval 3 fixture) 에 verification: None 추가.
- wire schemas: answer.schema.json verification field + $defs.VerificationSummary + refusal_reason.enum 2 추가. error.schema.json code.enum + details.description 2 추가 (forward-looking reserved).
- docs/ARCHITECTURE.md: Mermaid Adapters subgraph 의 nli 노드 + rag→nli + app→nli (forward-looking) + nli→config edges. nli→core edge 는 skip (kebab-nli/Cargo.toml direct dep 가 config 만, ARCHITECTURE 컨벤션 = direct deps only). 디렉토리 트리에 crates/kebab-nli/ 추가.
Tests: kebab-core 3 (serde rename + verification skip + struct shape) + kebab-config 6 (defaults + legacy + env + malformed env) + kebab-cli wire 5 (schema verification + enum 검증).
검증: cargo test --workspace -j 1 회귀 0 (pre-existing kebab-mcp::tools_call_ask_multi_hop flaky 1개 동일 — spec 에 명시된 known-flaky). cargo clippy --workspace --all-targets -D warnings clean.
Wire 영향: additive minor — answer.v1 의 verification optional + refusal_reason.enum 확장 + error.v1.code 확장.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
321 lines
13 KiB
Rust
321 lines
13 KiB
Rust
//! Answer + RAG types (§3.8).
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use time::OffsetDateTime;
|
|
|
|
use crate::citation::Citation;
|
|
use crate::search::SearchMode;
|
|
use crate::versions::PromptTemplateVersion;
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct Answer {
|
|
pub answer: String,
|
|
pub citations: Vec<AnswerCitation>,
|
|
pub grounded: bool,
|
|
pub refusal_reason: Option<RefusalReason>,
|
|
pub model: ModelRef,
|
|
pub embedding: Option<ModelRef>,
|
|
pub prompt_template_version: PromptTemplateVersion,
|
|
pub retrieval: AnswerRetrievalSummary,
|
|
pub usage: TokenUsage,
|
|
#[serde(with = "time::serde::rfc3339")]
|
|
pub created_at: OffsetDateTime,
|
|
/// p9-fb-15: same conversation 의 turn 들이 공유. CLI single-shot
|
|
/// (history 없음) / TUI 첫 turn 은 None. blake3 해시 또는 사용자
|
|
/// 명시 (`kebab ask --session <id>`, p9-fb-18).
|
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
pub conversation_id: Option<String>,
|
|
/// p9-fb-15: 같은 conversation 안 0-based 순서. 첫 turn = 0. None
|
|
/// 이면 single-shot.
|
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
pub turn_index: Option<u32>,
|
|
/// p9-fb-41: multi-hop hop trace. `None` for single-pass asks.
|
|
/// Each entry records one hop (`decompose` / `decide` / `synthesize`)
|
|
/// — the LLM call category, the sub-queries emitted, retrieval
|
|
/// counts, and a `forced_stop` flag for cap-driven termination.
|
|
/// Wire-additive: `answer.v1` schema_version unchanged; consumers
|
|
/// reading older single-pass answers see `hops: None` (or absent).
|
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
pub hops: Option<Vec<HopRecord>>,
|
|
/// p9-fb-41 PR-9c-1: NLI-based post-synthesis verification summary.
|
|
/// `None` for single-pass asks and for multi-hop runs with
|
|
/// `[rag].nli_threshold == 0` (verification disabled — the default).
|
|
/// Present only when the multi-hop pipeline reached the post-
|
|
/// synthesize verification step (PR-9c-2 wires step 8.5). Wire-
|
|
/// additive: `answer.v1` schema_version unchanged; consumers
|
|
/// reading pre-v0.18 answers see `verification: None` (or absent).
|
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
pub verification: Option<VerificationSummary>,
|
|
}
|
|
|
|
/// p9-fb-41 PR-9c-1: post-synthesize NLI verification summary stamped
|
|
/// onto [`Answer::verification`] when multi-hop runs reach step 8.5
|
|
/// (NLI gate). Three required fields ride together on every wire emit:
|
|
/// `nli_score` is the entailment channel of the XNLI verifier,
|
|
/// `nli_threshold` mirrors `[rag].nli_threshold` for audit, and
|
|
/// `nli_passed` is `nli_score >= nli_threshold`. The whole struct is
|
|
/// omitted (serde skip) when no verification ran.
|
|
#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
|
|
pub struct VerificationSummary {
|
|
pub nli_score: f32,
|
|
pub nli_threshold: f32,
|
|
pub nli_passed: bool,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct AnswerCitation {
|
|
pub marker: Option<String>,
|
|
pub citation: Citation,
|
|
/// p9-fb-32: cited doc's `documents.updated_at`.
|
|
#[serde(with = "time::serde::rfc3339")]
|
|
pub indexed_at: OffsetDateTime,
|
|
/// p9-fb-32: server-computed staleness flag per config threshold.
|
|
pub stale: bool,
|
|
}
|
|
|
|
/// p9-fb-15: history 가 prompt 에 들어갈 때의 한 turn. RAG facade 가
|
|
/// `Vec<Turn>` 받아 system + history + retrieval + new question 으로
|
|
/// prompt 빌드. token budget 안에 fit 안 되면 oldest turn 부터 drop
|
|
/// (newest 우선 보존).
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct Turn {
|
|
pub question: String,
|
|
pub answer: String,
|
|
pub citations: Vec<AnswerCitation>,
|
|
#[serde(with = "time::serde::rfc3339")]
|
|
pub created_at: OffsetDateTime,
|
|
}
|
|
|
|
/// p9-fb-41: one entry in [`Answer::hops`] — the per-iteration trace
|
|
/// of a multi-hop ask. The pipeline appends a `HopRecord` per LLM
|
|
/// call (decompose / decide / synthesize) so a `--multi-hop` user
|
|
/// can see what sub-queries the LLM emitted, how many chunks each
|
|
/// hop contributed, whether the iter stopped on the model's own
|
|
/// signal or hit a cap, and the per-hop LLM latency.
|
|
///
|
|
/// Wire-additive — every field uses `#[serde(default)]` where it
|
|
/// could plausibly be omitted by a future schema reader.
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct HopRecord {
|
|
/// 0-based hop index within this ask. `iter=0` is always the
|
|
/// initial decompose call; subsequent iters are decide calls;
|
|
/// the final iter is the synthesize call.
|
|
pub iter: u32,
|
|
pub kind: HopKind,
|
|
/// Sub-queries associated with this hop. The meaning depends on
|
|
/// `kind`:
|
|
///
|
|
/// - [`HopKind::Decompose`]: the initial sub-queries the LLM
|
|
/// broke the original user query into. These drive the
|
|
/// `iter=1` retrieval round.
|
|
/// - [`HopKind::Decide`]: the *new* sub-queries the LLM
|
|
/// emitted to drive the next retrieval round. Empty when the
|
|
/// LLM signalled stop OR when `forced_stop = true` (cap hit
|
|
/// or parse-degraded).
|
|
/// - [`HopKind::Synthesize`]: always empty — the final hop
|
|
/// produces the user-visible answer, not more sub-queries.
|
|
#[serde(default)]
|
|
pub sub_queries: Vec<String>,
|
|
/// Number of *new* chunks the retrieval round contributed to the
|
|
/// pool (dedup'd by `chunk_id` — repeated hits from a previous
|
|
/// iter do not count). `0` for the decompose hop (no retrieval
|
|
/// yet) and the synthesize hop.
|
|
pub context_chunks_added: u32,
|
|
/// `true` when the pipeline cut the iter loop short because a
|
|
/// safety cap fired (`max_depth` / `max_total_sub_queries` /
|
|
/// `max_pool_chunks`) rather than because the LLM signalled
|
|
/// stop. The user-visible answer still reflects all chunks
|
|
/// accumulated up to that point — `forced_stop` is a tracing
|
|
/// signal, not a refusal.
|
|
pub forced_stop: bool,
|
|
/// Wall-clock latency of the LLM call for this hop, in
|
|
/// milliseconds. Useful for cost / latency analysis when a
|
|
/// `kebab eval` run records `Answer.hops`.
|
|
///
|
|
/// `0` is overloaded: it means "no LLM call happened at this
|
|
/// hop" when (a) the hop was a Decide skipped due to
|
|
/// `forced_stop` (depth-cap or pool-cap fired before the LLM
|
|
/// was asked) or (b) the pool was empty before any decide
|
|
/// could run. Treat `0` as "absent or instantaneous" rather
|
|
/// than as a genuine measurement.
|
|
pub llm_call_ms: u32,
|
|
}
|
|
|
|
/// p9-fb-41: which stage of the multi-hop pipeline a [`HopRecord`]
|
|
/// describes. The serde tag matches the wire shape so agents /
|
|
/// CLIs can branch on the snake_case string without referencing
|
|
/// the Rust enum.
|
|
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum HopKind {
|
|
/// First hop — LLM decomposed the user query into sub-queries.
|
|
Decompose,
|
|
/// Subsequent hop — LLM was asked whether more retrieval is
|
|
/// needed and either emitted new sub-queries (`continue`) or
|
|
/// returned an empty array (`stop`).
|
|
Decide,
|
|
/// Terminal hop — LLM produced the final user-visible answer
|
|
/// over the accumulated chunk pool.
|
|
Synthesize,
|
|
}
|
|
|
|
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum RefusalReason {
|
|
ScoreGate,
|
|
LlmSelfJudge,
|
|
NoIndex,
|
|
NoChunks,
|
|
/// p9-fb-15: ask 가 LLM 토큰 stream 도중 cancel 됨. partial answer
|
|
/// 가 채워져 있을 수 있음 (사용자가 본 부분까지). RAG retrieval
|
|
/// 자체는 정상 — 모델 generation 단계에서만 중단.
|
|
LlmStreamAborted,
|
|
/// p9-fb-41: multi-hop pipeline 의 decompose LLM call 이 JSON
|
|
/// parse 가능한 sub-question array 를 반환하지 못함 (parse
|
|
/// error, 빈 응답, 또는 잘못된 형식). retrieval / synthesize
|
|
/// 단계 진입 못 함. CLI / MCP / TUI 가 받는 wire error code
|
|
/// = `"multi_hop_decompose_failed"` (PR-4 의 error_wire 매핑).
|
|
MultiHopDecomposeFailed,
|
|
/// p9-fb-41 PR-9c-1: post-synthesize NLI verification gate fired —
|
|
/// `NliScores::faithfulness()` (entailment channel) fell below
|
|
/// `[rag].nli_threshold`. Wire string = `"nli_verification_failed"`
|
|
/// (single source of truth: also the matching `error.v1.code`).
|
|
/// Multi-hop only; behavior wiring lands in PR-9c-2.
|
|
NliVerificationFailed,
|
|
/// p9-fb-41 PR-9c-1: NLI verifier was configured (threshold > 0)
|
|
/// but the model / runtime is unavailable (download failure,
|
|
/// missing tokenizer, ONNX session init error). Treated as a soft
|
|
/// refusal — the user sees an unverified-answer outcome rather
|
|
/// than crashing the ask. Wire string = `"nli_model_unavailable"`.
|
|
NliModelUnavailable,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct ModelRef {
|
|
pub id: String,
|
|
pub provider: String,
|
|
pub dimensions: Option<usize>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct AnswerRetrievalSummary {
|
|
pub trace_id: TraceId,
|
|
pub mode: SearchMode,
|
|
pub k: usize,
|
|
pub score_gate: f32,
|
|
pub top_score: f32,
|
|
pub chunks_returned: u32,
|
|
pub chunks_used: u32,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct TokenUsage {
|
|
pub prompt_tokens: u32,
|
|
pub completion_tokens: u32,
|
|
pub latency_ms: u32,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
|
pub struct TraceId(pub String);
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::asset::WorkspacePath;
    use crate::citation::Citation;
    use time::macros::datetime;

    /// Minimal single-pass [`Answer`] fixture: every wire-additive
    /// optional field (`conversation_id`, `turn_index`, `hops`,
    /// `verification`) is `None`.
    fn bare_answer() -> Answer {
        Answer {
            answer: "x".into(),
            citations: vec![],
            grounded: true,
            refusal_reason: None,
            model: ModelRef {
                id: "m".into(),
                provider: "p".into(),
                dimensions: None,
            },
            embedding: None,
            prompt_template_version: PromptTemplateVersion("rag-v2".into()),
            retrieval: AnswerRetrievalSummary {
                trace_id: TraceId("t".into()),
                mode: crate::SearchMode::Lexical,
                k: 1,
                score_gate: 0.0,
                top_score: 0.0,
                chunks_returned: 0,
                chunks_used: 0,
            },
            usage: TokenUsage {
                prompt_tokens: 0,
                completion_tokens: 0,
                latency_ms: 0,
            },
            created_at: datetime!(2026-05-09 12:00:00 UTC),
            conversation_id: None,
            turn_index: None,
            hops: None,
            verification: None,
        }
    }

    /// p9-fb-41 PR-9c-1: pin the wire-side spelling of the new
    /// `RefusalReason` variants. The strings here must match
    /// `answer.schema.json::refusal_reason.enum` AND
    /// `error.schema.json::code.enum` byte-for-byte (single source of
    /// truth per spec §2.4).
    #[test]
    fn refusal_reason_nli_variants_serialize_to_snake_case() {
        assert_eq!(
            serde_json::to_string(&RefusalReason::NliVerificationFailed).unwrap(),
            "\"nli_verification_failed\""
        );
        assert_eq!(
            serde_json::to_string(&RefusalReason::NliModelUnavailable).unwrap(),
            "\"nli_model_unavailable\""
        );
    }

    /// p9-fb-41 PR-9c-1: the reverse direction of the test above — the
    /// same wire strings must parse back into the matching variants, so
    /// a reader of stored / transmitted answers accepts them.
    #[test]
    fn refusal_reason_nli_variants_deserialize_from_snake_case() {
        assert_eq!(
            serde_json::from_str::<RefusalReason>("\"nli_verification_failed\"").unwrap(),
            RefusalReason::NliVerificationFailed
        );
        assert_eq!(
            serde_json::from_str::<RefusalReason>("\"nli_model_unavailable\"").unwrap(),
            RefusalReason::NliModelUnavailable
        );
    }

    /// p9-fb-41 PR-9c-1: `Answer.verification` is `Option<...>` with
    /// `skip_serializing_if = None`. A `verification: None` answer
    /// must NOT emit a `"verification"` key on the wire — the field
    /// is additive and pre-v0.18 readers see no new key.
    #[test]
    fn answer_omits_verification_field_when_none() {
        let ans = bare_answer();
        let v = serde_json::to_value(&ans).unwrap();
        assert!(
            v.get("verification").is_none(),
            "verification: None must be omitted from wire output, got: {v}"
        );
    }

    /// p9-fb-41 PR-9c-1: the read side of wire-additivity. A payload
    /// with no `"verification"` key (what a pre-v0.18 writer emits) must
    /// still deserialize, yielding `verification: None` via
    /// `#[serde(default)]`.
    #[test]
    fn answer_without_verification_key_deserializes_to_none() {
        let original = bare_answer();
        let wire = serde_json::to_value(&original).unwrap();
        assert!(wire.get("verification").is_none());

        let parsed: Answer = serde_json::from_value(wire).unwrap();
        assert_eq!(parsed.verification, None);
        assert_eq!(parsed, original);
    }

    /// p9-fb-41 PR-9c-1: when a summary is present it is emitted as a
    /// nested object under `"verification"` and survives a round trip.
    /// The scores are chosen to be exactly representable in binary so
    /// the equality check is not sensitive to f32/f64 widening.
    #[test]
    fn answer_with_verification_round_trips() {
        let original = Answer {
            verification: Some(VerificationSummary {
                nli_score: 0.75,
                nli_threshold: 0.5,
                nli_passed: true,
            }),
            ..bare_answer()
        };
        let wire = serde_json::to_value(&original).unwrap();
        assert!(wire["verification"].is_object());
        assert_eq!(wire["verification"]["nli_passed"], true);

        let parsed: Answer = serde_json::from_value(wire).unwrap();
        assert_eq!(parsed, original);
    }

    /// p9-fb-41 PR-9c-1: `VerificationSummary` serializes to an object
    /// with exactly the three keys `nli_score`, `nli_threshold` and
    /// `nli_passed`. Scores are compared with a tolerance because the
    /// f32 values are read back as f64.
    #[test]
    fn verification_summary_serializes_all_three_required_fields() {
        let vs = VerificationSummary {
            nli_score: 0.87,
            nli_threshold: 0.5,
            nli_passed: true,
        };
        let v = serde_json::to_value(vs).unwrap();
        assert_eq!(v.as_object().unwrap().len(), 3);
        assert!((v["nli_score"].as_f64().unwrap() - 0.87).abs() < 1e-5);
        assert!((v["nli_threshold"].as_f64().unwrap() - 0.5).abs() < 1e-5);
        assert_eq!(v["nli_passed"], true);
    }

    /// p9-fb-32: `AnswerCitation` emits `indexed_at` as an RFC 3339
    /// string and `stale` as a plain boolean.
    #[test]
    fn answer_citation_serializes_indexed_at_and_stale() {
        let ac = AnswerCitation {
            marker: Some("[1]".to_string()),
            citation: Citation::Line {
                path: WorkspacePath::new("a.md".to_string()).unwrap(),
                start: 1,
                end: 1,
                section: None,
            },
            indexed_at: datetime!(2026-05-09 12:00:00 UTC),
            stale: false,
        };
        let v = serde_json::to_value(&ac).unwrap();
        assert_eq!(v["indexed_at"], "2026-05-09T12:00:00Z");
        assert_eq!(v["stale"], false);
    }
}
|