From 44fbffff26c9adf710eb5d59c2240ba658ca51e1 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Mon, 25 May 2026 21:22:20 +0000
Subject: [PATCH 1/3] =?UTF-8?q?docs(rag):=20fb-41=20PR-9=20spec=20+=20plan?=
 =?UTF-8?q?=20=E2=80=94=20NLI=20verification=20+=20v0.18.0=20cut?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fb-41 multi-hop RAG 의 dogfood S7 hallucination root cause = LLM-self-judge ceiling.
대응 = NLI-based post-synthesis verification (mDeBERTa-v3 XNLI, 280 MB ONNX).

산출물:
- docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md (review_round=5,
  4 OMC reviewer APPROVE: 1 CRITICAL + 9 MAJOR + 3 MINOR → 1 NIT carry-forward).
- docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md (plan_review_round=3,
  4 OMC reviewer APPROVE: 15 issues → 0 actionable).

5 sub-PR (PR-9a~9d) + cut PR. 작업 21-31h / wall time 28-44h.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../2026-05-25-p9-fb-41-finalize-plan.md      | 513 +++++++++++
 .../2026-05-25-p9-fb-41-finalize-spec.md      | 831 ++++++++++++++++++
 2 files changed, 1344 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md
 create mode 100644 docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md

diff --git a/docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md b/docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md
new file mode 100644
index 0000000..79be574
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md
@@ -0,0 +1,513 @@
+---
+title: "p9-fb-41 finalize implementation plan v4 — NLI verification + v0.18.0 cut"
+date: 2026-05-25
+task_id: p9-fb-41-finalize
+phase: P9
+status: approved-by-team
+target_version: 0.18.0
+design: ../specs/2026-05-25-p9-fb-41-finalize-spec.md
+spec_review_round: 5
+spec_status: approved-by-team
+plan_review_round: 3
+plan_review_outcome: |
+  All 4 OMC team reviewers APPROVE (plan v4 round 3, FINAL convergence).
+  - architect: APPROVE (round 1 plan v2)
+  - planner: APPROVE (round 2 spec + plan v2 re-confirmed)
+  - document-specialist: APPROVE (round 2 plan v3 — NIT-1 minor)
+  - critic: APPROVE (round 3 plan v4 FINAL — 5 axes 95.4% production excellence baseline)
+---
+
+# p9-fb-41 finalize plan v4
+
+spec: `docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md` (review_round=5, APPROVE by all 4 OMC team reviewers).
+
+## 0. 작업 개요
+
+PR-1 ~ PR-8 머지 후 v0.18 pre-cut 도그푸딩 (`/build/cache/dogfood-v018/results/SUMMARY.md`) 에서 발견된 S7 hallucination 의 진짜 fix (NLI post-synthesis verification) + v0.18.0 cut.
+
+총 5 sub-PR (9a / 9b / 9c-1 / 9c-2 / 9d) + 1 cut PR. **총 추정 시간**: 작업 **21-31h** / wall time **28-44h** (§8 cumulative trace + plan v4 round-2 critic M2 분리 참조).
+
+PR sequence 는 *순차* (각 PR 머지 후 다음 시작) — sub-PR 별 surface 가 다음 sub-PR 의 기반:
+
+```
+PR-9a (skeleton)
+  ↓ 머지 후
+PR-9b (ONNX inference)
+  ↓ 머지 후
+PR-9c-1 (core types + wire scaffolding)
+  ↓ 머지 후
+PR-9c-2 (pipeline integration + mock test)
+  ↓ 머지 후
+PR-9d (dogfood retest + HOTFIXES)
+  ↓ 머지 후
+cut PR (chore: bump version 0.17.2 → 0.18.0)
+  ↓ 머지 + tag v0.18.0
+```
+
+본 plan 은 subagent-driven-development 의 task list — 각 sub-PR 의 *self-contained* description.
+
+## 1. 머지된 PR-1 ~ PR-8 의 carry-over
+
+각 PR 의 회차 리뷰 carry-over 항목은 본인 PR 안 또는 후속 PR 에서 해소됨. 본 plan 의 PR-9 sub-PRs 에는 추가 carry-over 없음 — clean baseline.
+
+## 2. PR-9a — kebab-nli crate skeleton
+
+**Goal**: trait surface + scaffolding + workspace dep chain 도입. implementation 없이도 build 가능.
+
+**Pre-flight (PR-9a 시작 전, manual)** — spec §2.1 + §3 PR-9a:
+
+1. **Model + tokenizer file 존재 검증**:
+   ```sh
+   curl -I https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/onnx/model.onnx
+   curl -I https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/tokenizer.json
+   ```
+   둘 다 `200 OK` 확인. 실패 시 PR-9 design re-evaluation.
+
+2. **`tokenizers` features 검증** (standalone repro):
+   ```sh
+   cargo new --bin /tmp/nli-tok-probe
+   cd /tmp/nli-tok-probe
+   cargo add tokenizers --no-default-features -F onig
+   wget https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/tokenizer.json
+   # main.rs: tokenizers::Tokenizer::from_file("tokenizer.json").expect("load");
+   cargo run --release
+   ```
+   성공 시 PR-9a features lock. 실패 시 `default-features = true` fallback. 결과 + 최종 features set 을 PR-9a PR description 의 `## Cargo features 결정 trace` 절에 첨부.
+
+**Files**:
+
+- `Cargo.toml` (workspace):
+  - `members` 에 `"crates/kebab-nli"` 추가.
+  - `workspace.dependencies` 에 추가 (fastembed transitive 와 정확히 일치):
+    - `ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }`
+    - `tokenizers = { version = "0.21", default-features = false, features = ["onig"] }` (pre-flight 결과에 따라 features 갱신 가능)
+    - `hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }`
+    - `ndarray = "0.16"`
+- `crates/kebab-nli/Cargo.toml` (skeleton 의존만):
+  - `dependencies`: `kebab-config`, `anyhow`, `serde`.
+  - `dev-dependencies`: `tempfile`.
+- `crates/kebab-nli/src/lib.rs`:
+  - `NliScores` struct + `faithfulness()` + `from_xnli_logits()`.
+  - `NliVerifier` trait.
+  - private `softmax3` helper.
+- `crates/kebab-nli/src/onnx.rs`:
+  - `OnnxNliVerifier` placeholder struct.
+  - `OnnxNliVerifier::new(&Config) -> Result<Self>` placeholder.
+  - `impl NliVerifier::score → bail!("PR-9a stub")`.
+
+**Tests** (6 unit):
+- `softmax3_normalises_to_unit`, `softmax3_is_invariant_to_constant_shift`.
+- `nli_scores_from_xnli_logits_orders_correctly`, `faithfulness_returns_entailment_channel`.
+- `new_succeeds_on_default_config`, `score_returns_err_in_skeleton`.
+
+**검증**:
+- `cargo test -p kebab-nli -j 1` — 6 통과.
+- `cargo clippy -p kebab-nli --all-targets -j 1 -- -D warnings` clean.
+
+**시간**: 2-3h.
+
+## 3. PR-9b — OnnxNliVerifier 의 ONNX inference + model download
+
+**Goal**: `OnnxNliVerifier::score` 의 진짜 implementation.
+
+**Dependency**: PR-9a 머지 완료.
+
+**Files**:
+- `crates/kebab-nli/Cargo.toml`:
+  - `ort`, `tokenizers`, `hf-hub`, `ndarray`, `tracing` 추가 (workspace.dependencies).
+- `crates/kebab-nli/src/onnx.rs`:
+  - `OnnxNliVerifier` fields: `model_id`, `cache_dir` (= `config.storage.model_dir.join("nli").join(sanitize(model_id))`), `session: OnceLock<ort::Session>`, `tokenizer: OnceLock<tokenizers::Tokenizer>`.
+  - `OnnxNliVerifier::new(&Config) -> Result<Self>` — model_id / cache_dir stamp + lazy load deferred.
+  - `ensure_loaded(&self) -> Result<(&Session, &Tokenizer)>` — hf-hub download + `Tokenizer::from_file` + `Session::builder()?.commit_from_file` + truncation params 설정.
+  - `score(premise, hypothesis)` — encode pair (with OnlyFirst truncation) → ort run → softmax → NliScores.
+  - `sanitize_model_id(s: &str) -> String` helper.
+- `crates/kebab-nli/tests/inference.rs` 신규:
+  - `#[ignore]` integration tests (5 cases):
+    1. EN entailment (`"Caffeine is a stimulant."` → `"Caffeine is a stimulant."`) — entailment > 0.8.
+    2. EN no-entailment (caffeine → C8H10N4O2) — entailment < 0.3.
+    3. KR entailment (`"사과는 빨갛다."` → `"사과는 색이 있다."`) — entailment 높음.
+    4. Long premise (10000 char) → truncation 적용 + 정상 score (panic 없음).
+    5. Empty hypothesis → graceful error.
+
+**Manual smoke protocol** (PR description 강제 첨부):
+```sh
+cargo test -p kebab-nli -j 1 --test inference -- --ignored 2>&1 | tail -20
+```
+- 5 test 모두 PASS 확인.
+- case 1 의 `NliScores` dump (예: `entailment=0.92, neutral=0.05, contradiction=0.03`) 를 PR body 의 `## 검증` 절에 inline.
+
+**검증**:
+- unit test 통과 + clippy clean.
+- `--ignored` integration test 의 manual run (PR 작업자 책임).
+
+**시간**: 8-12h (round-2 planner P2 갱신).
+
+**Risks**:
+- ort 2.0-rc.9 API stability — workspace pin `"=2.0.0-rc.9"` (fastembed transitive 일치).
+- mDeBERTa ONNX 존재 — PR-9a pre-flight 가 검증.
+- tokenizers SentencePiece 호환성 — PR-9a pre-flight 가 검증.
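+- hf-hub `ureq + rustls-tls` vs fastembed `native-tls` features union — PR-9b 의 첫 build 가 검증 (PR-9a 는 `workspace.dependencies` 선언만 하고 `kebab-nli` 가 `hf-hub` 를 실제 의존하지 않으므로 feature unification 이 일어나지 않음).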
+
+## 4. PR-9c-1 — Core types + wire scaffolding
+
+**Goal**: `RefusalReason` + `VerificationSummary` + `RagPipeline.verifier` field + Config + wire schema.
+
+**Dependency**: PR-9b 머지 완료.
+
+**Files**:
+- `crates/kebab-core/src/answer.rs`:
+  - `RefusalReason::NliVerificationFailed` + `RefusalReason::NliModelUnavailable` 신규.
+  - `Answer.verification: Option<VerificationSummary>` field.
+  - `VerificationSummary { nli_score: f32, nli_threshold: f32, nli_passed: bool }` 신규 struct.
+- `crates/kebab-config/src/lib.rs`:
+  - `NliCfg` 신규 struct + `[models.nli]`:
+    - `model: String` (default `"Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"`).
+    - `provider: String` (default `"onnx"`).
+  - `RagCfg.nli_threshold: f32` (default `0.0` — disabled).
+  - env override: `KEBAB_MODELS_NLI_MODEL`, `KEBAB_RAG_NLI_THRESHOLD`.
+- `crates/kebab-rag/src/pipeline.rs`:
+  - `RagPipeline` 의 새 field: `verifier: Option<Arc<dyn NliVerifier>>` (None = verify off).
+  - **시그니처 widening = Option B (builder)**: 기존 `RagPipeline::new(config, retriever, llm, sqlite)` 시그니처 유지 + 신규 `pub fn with_verifier(self, v: Arc<dyn NliVerifier>) -> Self` builder.
+  - `kebab-rag` 의 Cargo.toml 에 `kebab-nli` 의존 추가.
+  - **`#[allow(dead_code)]` 처리** (round-2 critic M1 closure): PR-9c-1 의 `verifier` field 와 `with_verifier` builder 는 PR-9c-2 의 `ask_multi_hop` step 8.5 hook 가 활성화될 때까지 *unused — `cargo clippy -- -D warnings` 의 `dead_code` lint fail risk*. PR-9c-1 의 `verifier` field 에 `#[allow(dead_code)]` 임시 attribute (Cargo.toml 의 `kebab-nli` 의존 자체는 active path). 또는 placeholder smoke test (`pipeline.with_verifier(MockVerifier::default()).verifier.is_some()` 한 줄). PR-9c-2 가 hook 추가 시 attribute 제거.
+- `docs/wire-schema/v1/answer.schema.json`:
+  - `verification` field 추가 (`anyOf [object, null]`) + `$defs.VerificationSummary` 인라인:
+    ```json
+    "VerificationSummary": {
+      "type": "object",
+      "required": ["nli_score", "nli_threshold", "nli_passed"],
+      "properties": {
+        "nli_score":     { "type": "number" },
+        "nli_threshold": { "type": "number" },
+        "nli_passed":    { "type": "boolean" }
+      }
+    }
+    ```
+  - `refusal_reason.enum` 에 `"nli_verification_failed"`, `"nli_model_unavailable"` 추가.
+- `docs/wire-schema/v1/error.schema.json`:
+  - `code` enum 에 `nli_verification_failed`, `nli_model_unavailable` 추가.
+  - `details.description` 에 두 항목 추가 (`multi_hop_decompose_failed` 패턴):
+    - `nli_verification_failed: { score, threshold }` (reserved — currently emitted as Answer.refusal_reason on stdout, NOT as error.v1; forward-looking for future RefusalReason → error_wire promotion).
+    - `nli_model_unavailable: { source }` (reserved — same pattern as nli_verification_failed).
+- `docs/ARCHITECTURE.md` (round-1 document-specialist ISSUE-1 — CLAUDE.md "A new crate is added — extend the graph + directory tree" rule):
+  - Mermaid Adapters subgraph 에 `nli["kebab-nli<br/>(NLI verifier)"]` 노드 추가.
+  - **Edges** (round-2 critic R2-NIT-3 — *forward-looking final state* 명시):
+    - PR-9c-1 시점 *직접 의존 추가* = `rag --> nli` (kebab-rag/Cargo.toml `kebab-nli` 추가) + `nli --> config` (kebab-nli/Cargo.toml `kebab-config` 추가).
+    - `app --> nli` edge 는 *forward-looking* (PR-9c-2 에서 kebab-app/Cargo.toml 의 `kebab-nli` 의존 추가됨) — PR-9c-1 의 ARCHITECTURE.md 가 *최종 graph 상태* 반영 결정 (single update, 9c-2 에서 재변경 회피). 결정 trade-off: *forward-looking* 가 reader 의 `final state` 가시 ↑, *current state* 가 PR-9c-1 시점 정확도 ↑. **권장 forward-looking** (graph 가 surface 명세, 한 번 갱신).
+    - `nli --> core` edge 는 PR-9a 머지 후 `crates/kebab-nli/Cargo.toml` 의 final `[dependencies]` 확인 결정 (round-2 document-specialist NIT-1) — `kebab-core` 직접 의존 시 edge 포함, `config` 경유 transitive 만이면 edge 생략. ARCHITECTURE.md graph 관례 = *직접 Cargo.toml 의존* 기준.
+  - 디렉토리 트리에 `crates/kebab-nli/` 항목 추가.
+
+**Tests**:
+- `crates/kebab-config/src/lib.rs::tests`:
+  - `default_nli_threshold_is_zero`.
+  - `default_nli_model_is_xenova_mdeberta`.
+  - `legacy_config_without_nli_uses_defaults`.
+  - `env_override_nli_threshold`.
+- `crates/kebab-cli/tests/wire_ask_multi_hop.rs`:
+  - `answer_schema_declares_verification_field_and_defs`.
+  - `answer_schema_refusal_reason_enum_includes_nli_verification_failed`.
+  - `answer_schema_refusal_reason_enum_includes_nli_model_unavailable`.
+  - `error_schema_code_enum_includes_nli_verification_failed`.
+  - `error_schema_code_enum_includes_nli_model_unavailable`.
+
+**검증**:
+- `cargo test --workspace -j 1` — 회귀 0 (기존 multi-hop tests pass, RagPipeline::new 시그니처 unchanged).
+- `cargo clippy --workspace --all-targets -j 1 -- -D warnings` clean.
+
+**시간**: 2-3h.
+
+## 5. PR-9c-2 — Pipeline integration + mock test
+
+**Goal**: `ask_multi_hop` 의 NLI verify wiring + mock test + SKILL.md 갱신.
+
+**Dependency**: PR-9c-1 머지 완료.
+
+**Files**:
+- `crates/kebab-rag/src/pipeline.rs`:
+  - `ask_multi_hop` 의 step 8.5 NLI hook (spec §2.3 코드):
+    - empty answer guard: `if !acc.trim().is_empty() { /* step 8.5 */ }`.
+    - `if self.config.rag.nli_threshold > 0.0 { /* verify */ }` outer guard.
+    - inner verify: `truncate_for_nli` → `verifier.score` → score 검사 → refuse 또는 진행.
+  - `refuse_nli_verification` helper (`refuse_*` 패턴) — `verification: Some(...)` 채움.
+  - `refuse_nli_model_unavailable` helper — `verification: None`.
+  - `pub fn truncate_for_nli(premise: &str, hypothesis: &str) -> (String, bool)` helper:
+    - max premise char count = `MAX_NLI_PREMISE_CHARS = 4 * 400` = 1600 chars (4 char/token 가정 시 ≈ 400 tokens).
+    - hypothesis 길이 + special tokens 32 char budget 적용 후 자연 보존.
+    - 둘째 return = was_truncated boolean.
+    - **token ratio 가정**: 4 char ≈ 1 token (영어 BPE). 한국어 SentencePiece 는 1-2 char/token — tokenizer OnlyFirst backup. v0.18.1 의 token-count 기반 budget 갱신 candidate.
+- `crates/kebab-app`:
+  - **실제 constructor 이름 = `App::open_with_config`** (round-2 critic R2-NIT-4 verification — `crates/kebab-app/src/app.rs:187`. spec §3 PR-9c-2 의 `App::new` 는 *논리적 이름* — 실제 code 의 함수명으로 mapping). 시그니처 *이미 `Result<Self, anyhow::Error>`* (현재 line 187 `pub fn open_with_config(config: kebab_config::Config) -> Result<Self>`) — **caller cascading 없음** (kebab-cli/tui/mcp 의 `App::open_with_config(...)` 호출 site 의 `?` 또는 `.context(...)` 그대로). round-2 NEW-M2 의 *시그니처 widening* = body 추가만 (`OnnxNliVerifier::new(config)?` integration).
+  - `config.rag.nli_threshold > 0.0` → `OnnxNliVerifier::new(config)?` 호출 + `Arc::new` wrap + `pipeline.with_verifier(v)`.
+  - `config.rag.nli_threshold == 0.0` → verifier = None, 기존 path.
+  - `OnnxNliVerifier::new` 실패 시 `bail!()` — user-facing crash 회피.
+- `crates/kebab-rag/tests/multi_hop.rs`:
+  - `common/mod.rs` 에 `MockNliVerifier { scores: NliScores }` helper.
+  - `multi_hop_nli_pass_keeps_grounded` — entailment 0.9 → grounded=true, verification.nli_passed=true.
+  - `multi_hop_nli_fail_refuses` — entailment 0.1 → refusal=NliVerificationFailed.
+  - `multi_hop_nli_disabled_skip_verify` — threshold = 0.0 → verify skip, verification=None.
+  - `multi_hop_nli_model_unavailable_refuses` — verifier Err → refusal=NliModelUnavailable.
+  - `multi_hop_truncate_for_nli_preserves_hypothesis` — long premise + 짧은 hypothesis → hypothesis 그대로.
+- `integrations/claude-code/kebab/SKILL.md`:
+  - `mcp__kebab__ask` 절에 NLI 안내 한 줄:
+    > `answer.v1.verification.nli_passed` 의미 (true = NLI 통과, false = `refusal_reason = nli_verification_failed`). threshold tuning 권장 (0.5 production, 0.9 strict). `nli_model_unavailable` refusal 시 user 의 `[rag] nli_threshold = 0.0` 임시 disable + network/disk 복구 후 재시도.
+
+**Tests**: 5 신규 multi-hop tests + 기존 tests 회귀 0.
+
+**검증**:
+- `cargo test --workspace -j 1` — 모든 test 통과 + 신규 5 multi-hop pass.
+- `cargo clippy --workspace --all-targets -j 1 -- -D warnings` clean.
+
+**시간**: 3-4h.
+
+## 6. PR-9d — Dogfood retest + HOTFIXES closure
+
+**Goal**: PR-9c-2 머지 후 dogfood corpus 에서 S7 + S1 + S3 + S10 retest.
+
+**Dependency**: PR-9c-2 머지 완료.
+
+**Pre-run prereq (manual + subagent 양쪽 적용)** — spec §3 PR-9d:
+- Ollama service running (`curl -s 127.0.0.1:11434/api/tags`).
+- dogfood corpus 디렉토리 존재 (`/build/cache/dogfood-v018/queries/*.txt`).
+- network reachable (hf-hub 280 MB NLI model first-run download 가능).
+- free RAM ≥ 6 GB.
+- release binary path: `/build/out/cargo-target/release/kebab` (CARGO_TARGET_DIR) 또는 `./target/release/kebab` (in-tree). 권장: `/build/out/cargo-target/release/kebab` (HOTFIXES 2026-05-25 fb-41 dogfood entry).
+
+prereq 실패 시 *조기 abort* + 사용자 보고. partial dogfood 결과 commit 회피.
+
+**Tests** (자동화 없음, manual run):
+- `[rag] nli_threshold = 0.5` config (production 권장).
+- S7 / S1 / S3 / S10 multi-hop ask → 각각 NLI score 측정 + grounded/refuse 확인.
+- single-pass S7 (verification 없음) baseline 도 같이 측정.
+
+**RAM peak protocol**:
+```sh
+# 시작 전 baseline
+ps -o rss=,vsz=,comm= -p $(pgrep -f 'ollama|kebab')
+
+# multi-hop ask 진행 중 1초 sampling (5분 cap)
+for _ in $(seq 300); do sleep 1; ps -o rss=,comm= -p $(pgrep -f 'ollama|kebab') ; done > /tmp/ram-S<N>.log &
+RAM_PID=$!
+
+# kebab ask 실행
+/build/out/cargo-target/release/kebab ask --multi-hop "<query>" --json
+
+# sampling 종료 + peak 추출
+kill $RAM_PID
+awk '{sum+=$1} END {print sum/NR " avg KB"}' /tmp/ram-S<N>.log
+awk '{ if ($1>max) max=$1 } END { print max " peak KB" }' /tmp/ram-S<N>.log
+```
+peak < 10 GB (16 GB 환경 OOM 없음) 확인.
+
+**Files**:
+- `tasks/HOTFIXES.md`:
+  - "PR-9 closure (post-v0.18 dogfood retest)" sub-section 추가 — pre/post 결과 비교 표.
+- `docs/dogfood/v0.18.0/` 신규 디렉토리 (round-2 P5 의 보존 path):
+  - `SUMMARY.md` — sanitized dogfood 보고서 (원본 `/build/cache/dogfood-v018/results/SUMMARY.md` 의 repo 포함 가능 부분).
+  - `s7-multihop-post-pr9.json` — S7 multi-hop NLI 결과 sample (refuse + nli_score).
+  - `s1-multihop-post-pr9.json` — S1 multi-hop NLI 결과 sample (grounded + nli_score).
+- `/build/cache/dogfood-v018/results/post-pr9/` (작업 디렉토리, repo 외):
+  - 시나리오별 JSON dump + findings.md + RAM log.
+
+**검증** — spec §7 PASS criteria 표 따름:
+- S7: grounded=false, refusal=`nli_verification_failed`, nli_score < 0.3.
+- S1: grounded=true, refusal=None, nli_score ≥ 0.6.
+- S3 (EN): primary grounded=true 또는 acceptable degraded LlmSelfJudge.
+- S10 (KR): primary refusal=`nli_verification_failed` 또는 acceptable degraded LlmSelfJudge.
+- range 밖 시 threshold / model 재검토 (spec §6 iteration trigger).
+
+**시간**: 4-6h (RAM 측정 + corpus 보존 + HOTFIXES + manual retest).
+
+**Scope**: PR default. 작업자 선택 가능 (별 commit 가능, round-1 P3).
+
+## 7. v0.18.0 cut PR (PR-9d 머지 후)
+
+**Goal**: version bump + cascading docs + frozen design contract 갱신 + release tag.
+
+**Dependency**: PR-9d 머지 완료.
+
+**Same-commit / Same-PR** (CLAUDE.md "Release / binary version bump" rule):
+- Cargo.toml version bump + tag = 같은 commit.
+- frozen design §3.8 갱신 = 본 cut PR 안.
+- gitea-release tag v0.18.0 = 본 PR 머지 commit 위 즉시.
+
+**Merge strategy** (round-1 critic P5-NEW-M2): kebab 의 default merge commit 패턴은 `Merge pull request '...' (#N)` 형태 — bump commit 이 PR branch 안에 있고 main 의 HEAD = merge commit (별 SHA). CLAUDE.md "bump commit = release commit" rule strict 해석:
+- **Option A 권장 — gitea-pr 의 squash merge** 사용 (`gitea-pr --merge-method squash` 또는 머지 UI 의 squash 옵션). 결과: main HEAD = bump 의 squash commit (single SHA). `gitea-release v0.18.0` tag 가 그 commit 위.
+- Option B (대안): bump 의 *PR branch commit* 에 직접 tag (main 의 merge commit 과 다른 SHA, 그러나 release tag 는 PR branch SHA reference — gitea 에서 가능). audit trail 약간 약함.
+- Option C: merge commit 자체에 *bump 내용 포함* (PR description = bump + cascading docs). gitea-pr 의 default merge commit message 가 bump 의 commit message 와 다른 자체 message — *bump 의 의도* 가 merge commit 에 inline 되지 않음. 권장 안 함.
+
+본 cut PR 작업자가 **Option A (squash merge)** 채택. main HEAD = bump commit, tag = same SHA. CLAUDE.md rule strict 정합.
+
+**R5-NEW-NIT-1 carry-over** (round-1 critic P5-NEW-M1): release notes draft (spec §5 line 681) 의 `9B+ 모델` 표현이 spec §5 step 8 line 651 의 `8B+ Q4 모델 (gemma4:e4b 8B / gemma2:9b 등)` 와 inconsistency 잔존 (cut PR 시점 final 작성 시 정정). cut PR 작업자가 spec §5 step 8 wording 일관 적용. spec round-5 NIT 자체는 spec 안에서 closure (R5-NEW-NIT-1 row of §9), 본 plan §7 가 *implementation reminder*.
+
+**Files** (모두 한 PR, commit msg `chore(release): bump version 0.17.2 → 0.18.0 + cut fb-41 multi-hop` — round-2 critic R2-NIT-2 scope label):
+1. `Cargo.toml` (workspace): `version = "0.17.2"` → `"0.18.0"`. `Cargo.lock` 자동 cascade.
+2. `HANDOFF.md`:
+   - 한 줄 요약 (P0~P9 + P10 + v0.18.0 fb-41 multi-hop ship).
+   - 머지 후 결정 절에 fb-41 entry 단락 (PR-1~PR-9 + dogfood + NLI 한 문단).
+3. `tasks/HOTFIXES.md`: 기존 fb-41 entry 들 `post-v0.18` anchor.
+4. `tasks/INDEX.md`: fb-41 status `open` → `completed`. v0.18.0 release subheader.
+5. `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`:
+   - §3.8 RAG 의 multi-hop sub-section 추가 (본 finalize spec §1-§3 요약 verbatim).
+   - §9 versioning cascade 표에 (선택) `nli_model_version` row.
+6. `docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md`:
+   - `status: approved-by-team` → `completed`.
+7. `docs/superpowers/plans/2026-05-25-p9-fb-41-finalize-plan.md`:
+   - `status: approved-by-team` → `completed`.
+8. `integrations/claude-code/kebab/SKILL.md`:
+   - v0.18.0 release notes link 한 줄.
+9. `README.md`:
+   - `kebab ask --multi-hop` + NLI 옵션 안내 한 단락 (model first-run download cost, RAM 권장).
+   - binary path confusion 한 줄 (`/build/out/cargo-target/release/kebab` 명시).
+10. `docs/SMOKE.md`:
+    - NLI 옵션 활성화 절차 (`[rag] nli_threshold = 0.5`).
+    - first-run model download 안내 (~280 MB).
+    - RAM 권장 (gemma3:4b 기준 ~5-6 GB; 8B+ Q4 모델 추정 ~10 GB / 16 GB 경계).
+
+**gitea-release** (cut PR 작업자 결정):
+```sh
+# Option A: --auto-notes 만 (gitea-ops skill 가 PR 시리즈 자동 list 생성)
+gitea-release v0.18.0 --auto-notes
+
+# Option B: --notes-file 만 (spec §5 release notes draft 를 release-notes-v0.18.0.md 로 저장 후 전달)
+gitea-release v0.18.0 --notes-file release-notes-v0.18.0.md
+
+# Option C: 둘 다 (gitea-ops 가 동시 명시 시 동작 — 사전 확인 필요)
+gitea-release v0.18.0 --auto-notes --notes "fb-41 multi-hop RAG + NLI ship..."
+```
+
+cut PR 작업자가 `gitea-release --help` 또는 `~/.claude/skills/gitea-ops/SKILL.md` 확인 후 Option 선택 (round-1 critic P5-NIT-1). spec §5 의 release notes draft 가 *content* — 어느 path 로 input 할지가 결정.
+
+권장: **Option A (--auto-notes)** + PR description 에 spec §5 release notes draft inline — gitea-release 가 PR description 의 release notes 절을 carry. 단순 + audit trail.
+
+**검증**:
+- `cargo build --release` 통과.
+- `gitea-pr-status` gate passed.
+- 머지 후 binary smoke test (cargo run --release).
+
+**시간**: 2-3h (round-1 architect N4 + critic P5-NIT-4 반영 — frozen design §3.8 verbatim + 10 files cascading + release notes final + review iteration 가능성 반영).
+
+## 8. 시간 추정 합산
+
+| Sub-PR | 작업 (h) | + review iteration (h) | wall time (h) |
+|---|---|---|---|
+| PR-9a | 2-3 | 1-2 | 3-5 |
+| PR-9b | 8-12 | 2-3 | 10-15 |
+| PR-9c-1 | 2-3 | 1-2 | 3-5 |
+| PR-9c-2 | 3-4 | 1-2 | 4-6 |
+| PR-9d | 4-6 | 1-2 | 5-8 |
+| cut PR | 2-3 | 1-2 | 3-5 |
+| **Total** | **21-31h** | **+7-13h** | **28-44h** |
+
+**round-2 plan critic M2 closure** — 작업 시간 vs wall time 명시적 분리:
+
+- **작업 (h)** = 순수 implementation / dogfood / docs cascade 시간 (review feedback 반영 작업 별도).
+- **review iteration (h)** = `gitea-pr-review` 회차당 1-2h × 평균 1-1.5 회차 추정 (HOTFIXES 평균 의거). 회차 0 (즉시 APPROVE) 도 가능 — 작업자 quality 의존.
+- **wall time (h)** = 작업 + review iteration. 사용자 / stakeholder 의 *ship expectation* baseline.
+
+cumulative 정정 trace: round-1 14-20h → round-2 14-22h → plan v2 20-30h → plan v3 21-31h (작업) / 28-44h (wall time, plan v4 round-2 critic M2 신설).
+
+## 9. /subagent-driven-development 의 task list
+
+`plan` 통과 + OMC team APPROVE 후 다음 task list 로 subagent dispatch:
+
+1. **Task PR-9a**: kebab-nli crate skeleton — **plan §2 + spec §3 PR-9a + spec §2.1~§2.2.4 참조**. branch `feat/fb-41-pr-9a-kebab-nli-crate`. pre-flight (`curl -I` + tokenizers probe) 결과 PR description 첨부. 검증 + PR + 회차 리뷰 루프 + 머지.
+2. **Task PR-9b**: OnnxNliVerifier inference — **plan §3 + spec §3 PR-9b + spec §2.2.2~§2.2.4 참조**. branch `feat/fb-41-pr-9b-onnx-nli-inference`. manual `--ignored` smoke 결과 PR description 첨부.
+3. **Task PR-9c-1**: core types + wire — **plan §4 + spec §3 PR-9c-1 + spec §2.4~§2.6 참조**. branch `feat/fb-41-pr-9c-1-core-types-wire`. `docs/ARCHITECTURE.md` 갱신 포함.
+4. **Task PR-9c-2**: pipeline integration + mock test + SKILL.md — **plan §5 + spec §3 PR-9c-2 + spec §2.3 참조**. branch `feat/fb-41-pr-9c-2-pipeline-integration`.
+5. **Task PR-9d**: dogfood retest + HOTFIXES + dogfood corpus 보존 — **plan §6 + spec §3 PR-9d + spec §7 PASS criteria 표 참조**. branch `feat/fb-41-pr-9d-dogfood-retest`. pre-run prereq 검증 후 시작. **Environment (round-2 critic M3)**: *user machine 에서만 dispatch 가능* — Ollama service running + dogfood corpus 디렉토리 존재 + network reachable + free RAM ≥ 6 GB + release binary path 의존. isolated docker / ephemeral CI container 환경은 모두 부재 → dispatch 시 즉시 abort. autonomous subagent provisioning (sudo Ollama install + corpus mirror) 은 v0.19+ candidate.
+6. **Task cut PR**: version bump + cascading docs — **plan §7 + spec §5 + spec R5-NEW-NIT-1 carry-over 참조**. branch `chore/v0.18.0-cut`. *gitea-pr squash merge* + `gitea-release v0.18.0` tag 머지 commit 위.
+
+각 task 는 *self-contained* — 별 subagent dispatch 가능. dependency 는 *이전 task 의 main 머지* — subagent 가 다음 task 시작 전 `git pull` 로 sync. **순차 only** — speculative pre-work 권장 안 함 (review 부담 + rebase 위험). 특히 **PR-9c-2 는 PR-9c-1 의 review iteration 완료 + 머지 후 시작** (round-2 critic N4) — 중간 schema change 시 9c-2 의 mock test 의 schema validation expectation 변경 위험. `TaskUpdate` 의 `addBlockedBy` chain 으로 race 회피 (round-1 planner informational). **active subagent ≤ 1 임의 timestamp** — RAM 압박 회피 + user memory `feedback_serial_build_only` policy 정합.
+
+각 subagent 는 다음을 책임:
+- branch 생성 + 구현 + tests + cargo test/clippy 검증 (16 GB RAM 직렬 only, user memory `feedback_serial_build_only` 적용).
+- gitea-pr 생성 + 리뷰 루프 (gitea-pr-review 회차) + APPROVE 후 머지.
+- 머지 후 main checkout + pull + branch cleanup (`git branch -d` + worktree 사용 시 `git worktree remove`).
+- `cargo clean` 권장 (CLAUDE.md routinely after merged PR rule, 92GB→0GB 복구).
+- `TaskUpdate(status='completed')` 호출 + team-lead 에게 `SendMessage` 으로 다음 task 시작 신호 (또는 사용자 manual dispatch).
+
+## 10. Self-review notes
+
+- **PR-9 의 ONNX integration** 가 *새 dep chain* (ort + tokenizers + hf-hub) 도입 — 첫 사용 안정화 필요. PR-9a 의 pre-flight 가 *모든 위험 검증*. PR-9b 의 `#[ignore]` test manual smoke 가 *production binary 실제 동작* 검증.
+- **multi-hop NLI 의 latency 추가** — current multi-hop synthesize 158s + NLI ~50ms ≈ 158s. negligible.
+- **Model first-run download (~280 MB)** — 사용자 도그푸딩 환경 (CPU only) 의 disk + download bandwidth 1회 비용. README + SMOKE 안내. fail-closed download failure 정책.
+- **`RagPipeline::new` 시그니처 widening — Option B (builder) 결정** — 18+ existing call sites 무영향.
+- **frozen design contract §3.8 갱신 timing — v0.18.0 cut PR 안** — PR-9c 가 contract 변경 안 함.
+- **kebab-nli 의 trait + impl 동일 crate** — v0.18 scope = adapter 1개. v0.19+ 에 multi-adapter 등장 시 `kebab-nli-onnx` 분리.
+- **dogfood corpus 보존** — `docs/dogfood/v0.18.0/` 신규 dir + sanitized SUMMARY + sample JSON.
+- **RAM cold-start 측정** — PR-9d 의 PASS criteria 에 포함, release notes 의 권장 RAM 한 줄.
+- **ort version pin** — `workspace.dependencies.ort = "=2.0.0-rc.9"`.
+
+### Plan-specific self-review (round-1 critic P5-NIT-3 반영)
+
+execution / coordination 측면의 추가 self-review notes:
+
+- **Subagent 간 race 회피**: `TaskUpdate.addBlockedBy` chain 필수 적용 — PR-9b 의 task 가 PR-9a task 의 머지 완료에 blockedBy. PR-9c-1 → 9c-2 → 9d → cut PR 동일 chain.
+- **PR-9c-1 wire schema baseline for 9c-2**: PR-9c-1 의 `answer.schema.json` / `error.schema.json` 변경이 9c-2 의 mock test 의 schema validation baseline. PR-9c-1 의 review iteration 결과 schema 변경 시 9c-2 시작 전 *main pull* + spec/plan re-check 필수.
+- **`#[allow(dead_code)]` for verifier field in PR-9c-1** (round-1 architect N1): PR-9c-1 의 `RagPipeline.verifier` field 가 *declared 되었지만 read by nothing* 인 interim 시기 (9c-2 머지 전) — `cargo clippy -- -D warnings` fail 위험. PR-9c-1 의 field 에 임시 `#[allow(dead_code)]` 적용 (`Debug` derive 를 통한 field access 는 rustc 의 dead-code 분석에서 의도적으로 무시되므로 lint 회피 수단이 되지 못함). PR-9c-2 가 attribute 제거 + builder 의 `with_verifier` 의 사용 path 활성화.
+- **OnnxNliVerifier::new 의 lazy stamp semantics** (round-1 architect N2): spec §2.2.2 의 OnceLock pattern — `new()` 자체는 cache_dir create 같은 *early error* 만 잡음. download / inference 실패는 *runtime path* 의 `refuse_nli_model_unavailable` 가 처리. 작업자가 *eager download 시도* (lazy 위반) 회피.
+- **`truncate_for_nli` placement** (round-1 architect N3): module-level `pub fn` in `kebab_rag::pipeline`. 회귀 핀 test = `crates/kebab-rag/tests/multi_hop.rs` 의 `multi_hop_truncate_for_nli_preserves_hypothesis` (§5).
+- **First-run download progress indicator 검증** (round-1 architect N5): PR-9d 의 first-run NLI model download 시 stderr 에 `kebab-nli: downloading model.onnx (280 MB)...` progress emit 확인. non-`--json` mode 만 progress emit. PR-9d 의 검증 절에 명시 안 됐지만 작업자가 stderr 출력 확인 + HOTFIXES PR-9 closure 절에 *progress 확인 결과* 한 줄 명시 권장.
+- **Parallel execution opportunity (round-1 critic Open Question)**: PR-9b (8-12h, kebab-nli crate-internal) 동안 PR-9c-1 의 *kebab-nli 의존 없는 부분* (RefusalReason variant + wire schema) preparation 가능 — 시간 단축 4-6h. **권장 안 함** (review iteration 비용 ↑ + branch rebase risk). plan v3 는 *sequential only* 명시. 단, 작업자가 *speculative pre-work* 결정 시 `kebab-rag` 의 `kebab-nli` 의존 추가는 9b 머지 후 lock.
+- **PR-9d binary path 일관성** (round-1 critic Open Question + dogfood SUMMARY §부수 발견 closure): subagent task 가 `cargo build --release` 후 `/build/out/cargo-target/release/kebab` 사용 (CARGO_TARGET_DIR env 설정 환경). cut PR 의 README 갱신 (§7 step 9) 가 *user-facing* path confusion closure.
+- **Rollback path** (round-1 critic What's missing): PR-9d dogfood retest PASS criteria *catastrophic fail* (NLI library bug 등) 시 PR-9 revert path — `git revert` PR-9c-2 → PR-9c-1 → PR-9b → PR-9a 의 *reverse sequential*. `[rag] nli_threshold = 0.0` config knob 으로 graceful disable 가 더 가벼운 first-response. spec §6 의 threshold iteration trigger 와 분리 (혼동 회피).
+
+## 11. Spec-driven 변경 trace
+
+본 plan v4 는 spec v5 (review_round=5, approved-by-team) 의 모든 결정 반영 + plan-review round-1/round-2 의 issues closure. spec 의 §9 closure matrix (round 1-4) + plan v1/v2/v3 의 점진적 갱신 baseline. plan v2 → v4 갱신 사항 (round-2 critic R2-NIT-1 wording 정정):
+
+- PR-9c 분할 = 9c-1 + 9c-2 별 PR (round-2 P1).
+- 시간 추정 14-20h → 20-30h cumulative 정정 (round-2 P2 + round-3 R3-NEW-N1).
+- PR-9d 의 RAM 측정 protocol + pre-run prereq + dogfood corpus 보존 (round-2 P5/P6 + round-3 R3-NEW-N3).
+- cut PR step 명시 + same-commit rule (round-2 M7 + round-3 R3-NEW-N2).
+- 시그니처 widening = Option B (round-2 NEW-M2).
+- truncate_for_nli signature `(String, bool)` (round-2 NEW-N1).
+- `RefusalReason::NliVerificationFailed` + `NliModelUnavailable` wire 통일 (round-2 ISSUE-1 + R3-NEW-N3).
+- model ID Xenova/... config default 확정 (round-1 A1 / D5).
+- `nli_threshold = 0.0` single gate (round-1 A3 / D3).
+- pre-flight tokenizers features 검증 (round-2 NEW-M1).
+- §7 cross-ref single source of truth (round-4 R4-NEW-M1 + R4-NEW-N1).
+
+### Plan-review round-1 closure (post-spec-v5 plan-level review)
+
+| reviewer | round-1 plan issue | plan v3 resolution |
+|---|---|---|
+| document-specialist | ISSUE-1 ARCHITECTURE.md missing | §4 PR-9c-1 Files 에 `docs/ARCHITECTURE.md` 추가 — Mermaid `nli` 노드 + 4 edges + 디렉토리 트리. |
+| critic | P5-NEW-M1 R5-NEW-NIT-1 carry-over | §7 cut PR 에 "R5-NEW-NIT-1 carry-over" 절 — release notes draft 의 `9B+ 모델` → `8B+ Q4 모델` cut PR 작업자 reminder. |
+| critic | P5-NEW-M2 merge strategy | §7 — Option A (gitea-pr squash merge) 권장 명시. bump commit = main HEAD = release tag SHA. CLAUDE.md same-commit rule strict 정합. |
+| critic | P5-NIT-1 gitea-release flag combo | §7 — `--auto-notes` / `--notes-file` / 둘 다 의 Option A/B/C 명시 + 사전 `gitea-release --help` 확인 + Option A 권장. |
+| critic | P5-NIT-2 §9 spec § reference | §9 — 각 task description 에 `plan §X + spec §Y` cross-ref inline. |
+| critic | P5-NIT-3 plan-specific self-review | §10 — Plan-specific self-review notes 절 추가 (race avoidance + dead_code attr + lazy semantics + parallel opportunity + rollback path 등 9 항목). |
+| critic | P5-NIT-4 cut PR 시간 | §7 + §8 — 시간 1-2h → 2-3h. 합산 20-30h → 21-31h. |
+| architect | N1 dead_code attr | §10 plan-specific self-review notes 의 `#[allow(dead_code)]` 한 줄 명시. |
+| architect | N2 OnnxNliVerifier::new lazy semantics | §10 — early error vs runtime error 명시. |
+| architect | N3 truncate_for_nli placement | §10 — module-level `pub fn` + test 위치 cross-ref. |
+| architect | N4 cut PR 시간 (frozen design §3.8 cost) | §7 시간 1-2h → 2-3h + §8 합산 정정 (critic P5-NIT-4 와 동일 fix). |
+| architect | N5 download progress 검증 | §10 self-review — PR-9d 작업자가 stderr progress 확인 + HOTFIXES 한 줄 명시 권장. |
+| planner | round-2 nit (4개) | spec v5 와 plan v2 가 closure (round-2 closure matrix). round-1 plan-level 신규 nit 0개 (re-confirm APPROVE). |
+
+### Plan-review round-2 closure (post-plan-v3 deep ADVERSARIAL review)
+
+| reviewer | round-2 plan issue | severity | plan v4 resolution |
+|---|---|---|---|
+| critic | M1 PR-9c-1 dead code clippy fail risk | MINOR | §4 PR-9c-1 의 `RagPipeline.verifier` field 절에 `#[allow(dead_code)]` 명시 + PR-9c-2 hook 추가 시 제거 trace. |
+| critic | M2 시간 추정 review iteration 미포함 | MINOR | §8 시간 표 *작업 vs review iteration vs wall time* 분리 — 합산 21-31h (작업) / 28-44h (wall time). |
+| critic | M3 PR-9d subagent dispatch environment | MINOR | §9 PR-9d 항목 Environment 명시 — user machine only (Ollama / corpus / network / RAM / binary path 의존), isolated container 부적합. |
+| critic | N4 9c-1/9c-2 sequential strictness | NIT | §9 — "PR-9c-2 는 PR-9c-1 의 review iteration 완료 + 머지 후 시작" 명시. |
+| document-specialist | NIT-1 nli→core edge 확인 권장 | NIT | §4 ARCHITECTURE.md 절 — `nli --> core` edge 는 PR-9a 머지 후 final `kebab-nli/Cargo.toml` deps 확인 결정. |
+| critic | N1 spec §2.1.1 alternative models cross-ref | NIT (옵션) | 미반영 — PR-9d dogfood retest iteration trigger 발동 시 작업자가 spec 직접 cross-ref 권장. |
+| critic | N2 PR-9b fallback Optimum self-export | NIT (옵션) | 미반영 — spec §3 PR-9b 의 fallback 명시 충분. |
+| critic | N3 PR description template | NIT (옵션) | 미반영 — 각 plan 절이 *de facto* template 역할. |
+
+### Plan-review round-2 (light pass on plan v3) closure
+
+plan v3 SendMessage 후 critic round-2 light pass 가 1 actionable MINOR (R2-NEW-M1 §0 vs §8 시간 mismatch) + 1 verification MINOR (R2-NEW-M4 spec status — *invalid finding*: spec frontmatter 실제 `status: approved-by-team`) + 4 NIT (R2-NIT-1~4) 발견.
+
+| reviewer | round-2 light issue | severity | plan v4 resolution |
+|---|---|---|---|
+| critic | R2-NEW-M1 §0 vs §8 시간 cumulative mismatch | MINOR | §0 line 22 "총 추정 시간 19-28h" → "작업 21-31h / wall time 28-44h (§8 cumulative trace + plan v4 round-2 critic M2 분리 참조)". |
+| critic | R2-NEW-M4 spec status transition | MINOR (invalid) | spec frontmatter 확인 — `status: approved-by-team` (line 6). plan §7 step 6 `approved-by-team → completed` transition 정확. *no edit*. |
+| critic | R2-NIT-1 "본 plan v2" wording | NIT | §11 의 plan-version 헤더 — "본 plan v2 는" → "본 plan v4 는 ... + plan-review round-1/round-2 의 issues closure". |
+| critic | R2-NIT-2 cut PR commit msg scope | NIT | §7 Files header — `chore:` → `chore(release):` scope label. |
+| critic | R2-NIT-3 ARCHITECTURE.md `app --> nli` edge timing | NIT | §4 ARCHITECTURE.md 절 — *forward-looking final state* 명시 + PR-9c-1 시점 직접 의존 (`rag --> nli` + `nli --> config`) 분리 + 권장 forward-looking. |
+| critic | R2-NIT-4 App::new caller cascading | NIT | §5 PR-9c-2 의 kebab-app 항목 — **실제 constructor `App::open_with_config` (kebab-app/src/app.rs:187, 이미 Result return)**. PR-9c-2 = *body 추가만*, caller cascading 0. |
diff --git a/docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md b/docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md
new file mode 100644
index 0000000..a8dda60
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-25-p9-fb-41-finalize-spec.md
@@ -0,0 +1,831 @@
+---
+title: "p9-fb-41 finalize — multi-hop RAG post-dogfood safety hardening + v0.18.0 cut"
+date: 2026-05-25
+task_id: p9-fb-41-finalize
+phase: P9
+status: approved-by-team
+target_version: 0.18.0
+contract_source: ./2026-04-27-kebab-final-form-design.md
+contract_sections: [§3.8 RAG, §7 RAG pipeline]
+predecessor: ./2026-05-25-p9-fb-41-multi-hop-rag-design.md
+review_round: 5
+review_outcome: |
+  All 4 OMC team reviewers APPROVE after 5-round convergence.
+  - architect: APPROVE (round 2)
+  - planner: APPROVE (round 2)
+  - document-specialist: APPROVE (round 3)
+  - critic: APPROVE (round 5)
+  Δ-severity 5-round 단조감소: 1C+9M+3m → 0+0+1NIT.
+  잔존 NIT (R5-NEW-NIT-1) closure 됨 (release notes wording).
+---
+
+# p9-fb-41 finalize — multi-hop RAG post-dogfood safety hardening
+
+## 동기
+
+predecessor spec (`2026-05-25-p9-fb-41-multi-hop-rag-design.md`) 가 정의한 multi-hop pipeline 의 PR-1 ~ PR-8 모두 머지 완료. v0.18 pre-cut 도그푸딩 (`/build/cache/dogfood-v018/results/SUMMARY.md`) 에서 발견된 *safety regression* 을 닫고 v0.18.0 cut 으로 가는 finalize spec.
+
+predecessor 의 frozen contract 는 변경 없음 — 본 spec 는 *delta* 만:
+
+1. dogfood 발견 (S7 hallucination) 의 진단 + fix path 정리.
+2. PR-7 (probe gate) + PR-8 (pool 축소 + prompt rule) 의 부분 fix 결과.
+3. PR-9 (NLI-based post-synthesis verification) 의 최종 fix 설계.
+4. v0.18.0 cut steps.
+
+## 1. 도그푸딩 진단 (S7)
+
+**Query**: `What is the chemical formula of caffeine?` (KB 에 없는 fact).
+
+| path | top_score | grounded | latency | answer |
+|---|---|---|---|---|
+| single-pass (hybrid) | 0.5 | false (LlmSelfJudge) | 30s | "근거가 부족하다" ✓ |
+| multi-hop pre-fix | 0.5 | true ✗ | 614s | hallucination: "C₉H₁₅N₃O [#6]" (Adam optimizer 의 g_t 수식을 인용) |
+| multi-hop PR-7 | 0.5 | true ✗ | 143s | hallucination (probe gate top_score 0.5 > 0.30 통과) |
+| multi-hop PR-8 | 0.5 | true ✗ | **158s** (pre-fix 614s 대비 ~4× 개선) | hallucination (LLM 새 prompt rule 무시) |
+
+### 1.1 진단 정합
+
+1. **single-pass 의 LlmSelfJudge** = LLM 의 self-judgement 가 *uncorrelated chunks* 에 대해 "근거 부족" 인지. *probabilistic safety* — gemma3:4b 환경에서 우연히 정답 path. 다른 케이스 / 다른 LLM 에서 동일 reliability 보장 없음.
+2. **multi-hop pre-fix 의 hallucination** = synthesize prompt 가 *5 sub-questions + 30 chunks* 의 large context 에서 LLM 의 self-judgement 잃음. `score_gate` 도 `hits[0].fusion_score` 만 검사 — multi-hop pool 의 union 이 한 sub-query 의 top score 가 gate 위면 통과.
+3. **PR-7 probe gate** = single-pass 와 동일한 *원본 query* retrieve top_score 검사. 그러나 hybrid mode 의 RRF default score 가 0.5 (vector embedding 의 false positive — caffeine 와 Adam optimizer 수식 chunk 사이 semantic 유사도) → probe 도 통과.
+4. **PR-8 prompt rule + pool 15** = synthesize prompt 강화 + size 축소 → latency 4× 개선. 그러나 gemma3:4b 의 prompt-following ceiling — strong rule 도 무시.
+
+**근본 원인**: LLM-self-judge 기반 groundedness check 의 ceiling (gemma3:4b 한정 관측 — larger LLM 의 ceiling 도 unknown). *deterministic external verifier* 필요.
+
+### 1.2 alternative root cause 검토 (왜 NLI path 인가)
+
+다음 lighter alternatives 도 검토했으나 NLI path 채택:
+
+| alternative | 효과 | 한계 / 거부 이유 |
+|---|---|---|
+| `[rag] vector_min_score = 0.4` knob (RRF *원본 vector cosine* threshold 추가) | caffeine ↔ Adam optimizer 의 vector 유사도 차단 가능 | RRF formula `score = sum(1/(60+rank))` 가 top-K 통과 시 *원본 cosine 낮아도* RRF 0.5 → vector_min_score 추가 = retrieval-side fix. Synthesis-side hallucination 의 *근본 원인 (LLM 의 prompt-following ceiling)* 미해결. 다른 query 패턴 (paraphrase chunk 가 retrieve) 의 hallucination 같은 path. |
+| LLM 모델 업그레이드 (gemma2:9b / qwen2.5:7b) | larger LLM 의 prompt-following 능력 강화 → "근거 부족" rule 잘 따를 가능성 | CPU only 16 GB RAM 환경에서 9B+ Q4 모델은 RAM/latency 부담 ↑ (HOTFIXES 2026-05-25 v0.17.0 post-dogfood entry). 사용자 환경 의존성 ↑. *모델 무관 safety floor* 가 본 spec 의 목표. |
+| LLM-as-judge (별 LLM call 으로 yes/no) | 모델 prompt-following 안 의존 — 별 call 의 binary judgement | 추가 LLM call → multi-hop latency 더 늘어남 (현재 158s + 10-30s). 그리고 *judge LLM 도 prompt-following ceiling* 가짐 — 같은 문제 재발 가능. |
+| **NLI post-synthesis verification (선택)** | deterministic + lightweight + 학계 표준 | model dep + first-run download 부담. *그러나 단일 280 MB ONNX 가 모든 multi-hop ask 의 safety floor 제공*. |
+
+NLI 가 *deterministic verifier* 의 약속 (LLM 의 stochastic self-judge 와 직교) + production proven (Auto-GDA, MedTrust-RAG) + multilingual 가능 (multilingual NLI model) 의 3 axis 모두 만족.
+
+(향후 v0.19+ 의 ceiling 측정 / dogfood iteration 에서 LLM-as-judge 또는 cross-encoder reranker 도 보조 path 로 검토 가능 — `nli_threshold = 0.0` disable 옵션 항상 보존.)
+
+### 1.3 LLM upgrade vs NLI 의 future 관계
+
+§1.2 의 LLM 업그레이드 path 가 v0.19+ 의 *NLI 와 병행* 또는 *NLI 대체* 가능성:
+
+- **병행**: larger LLM 도 hallucinate 가능 — NLI 가 safety floor 유지. *defense in depth*.
+- **대체**: 만약 future LLM (예: gemma4:e4b 의 instruction-tuned variant) 에서 prompt-following ceiling 이 사라지면 NLI cost 의 정당화가 약화 — `[rag] nli_threshold = 0.0` disable 로 opt-out.
+
+본 v0.18 spec 의 NLI 는 *opt-in default OFF* (§2.6) 이라 사용자가 환경에 맞춰 enable. v0.19+ 의 measurement 후 default ON / OFF 결정.
+
+## 2. PR-9 — NLI-based post-synthesis verification
+
+학계 / industry 표준 (Self-RAG / CRAG / Auto-GDA / MedTrust-RAG) 의 결론: *post-synthesis groundedness verification* 이 정답 path. **multilingual NLI ONNX model** (~280 MB) 이 `(premise = packed_chunks, hypothesis = answer)` entailment 검사 → score < threshold 면 refuse.
+
+### 2.1 Model
+
+- **HuggingFace repo (production default)**: `Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` — Xenova org 의 ONNX export.
+- **원본 PyTorch weight**: `MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` (Apache-2.0 lic.). Xenova 의 ONNX export 는 *Optimum* 으로 생성된 변환본. config 의 default 는 ONNX 호스팅하는 `Xenova/...` 사용.
+- 280 MB ONNX (FP32). Q8 양자화 variant 도 Xenova 에 별 file (`onnx/model_quantized.onnx`) 있음 — v0.19+ 에서 옵션 추가 검토.
+- 3-way classifier: `[entailment, neutral, contradiction]` (XNLI `id2label` 표준).
+- 100+ multilingual (Korean + English 필수).
+- CPU inference: ~10-50 ms per (premise, hypothesis) pair (mDeBERTa-base 기준).
+
+**pre-flight check (PR-9a 시작 전 manual 확인)**:
+```sh
+curl -I https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/onnx/model.onnx
+curl -I https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/tokenizer.json
+```
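+두 HEAD 모두 최종 status 가 `200 OK` 면 진행 (HF Hub 의 LFS file 은 CDN 으로 `302` redirect 될 수 있으므로 `curl -IL` 로 redirect 추적 후의 마지막 응답 기준). 404 면 PR-9 의 design re-evaluation (다른 ONNX repo 또는 self-export via Optimum).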
+
+#### 2.1.1 대안 모델 trade-off (informational)
+
+| 모델 | size | lang | quality 차이 | 적합도 |
+|---|---|---|---|---|
+| `xlm-roberta-large-xnli` | 1.5 GB | 100+ multilingual | ~3-5% 더 높음 | 16 GB RAM 환경에서 LLM + lance + NLI 동시 cold start 부담 (overflow risk). |
+| **`Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7` (선택)** | **280 MB** | **100+ multilingual** | **baseline** | **균형 — 본 spec 의 default** |
+| `MiniLM-L12-mnli-xnli` | 110 MB | multilingual (좁음) | ~5-10% 낮음 | Korean 의 quality 약함 — kebab corpus 의 KR+EN mix 와 부적합. |
+
+선택 사유: kebab 의 사용자 환경 (CPU only, 16 GB RAM, KR+EN mix) 에서 *유일한 균형점*. 모델 교체 시 본 표 + dogfood retest 측정값 함께 갱신.
+
+### 2.2 Architecture
+
+```
+crates/kebab-nli/  (신규 crate, trait + impl 한 곳)
+├── Cargo.toml
+└── src/
+    ├── lib.rs     — NliVerifier trait + NliScores struct + softmax helper
+    └── onnx.rs    — OnnxNliVerifier (ort + tokenizers + hf-hub)
+```
+
+**Trait + impl 동일 crate 정당화** (vs `kebab-embed` + `kebab-embed-local` 패턴):
+
+- v0.18 scope = ONNX adapter 하나만. trait split crate 의 *현재* 가치 0.
+- v0.19+ 에서 candle / CUDA / remote adapter 등장 시 `kebab-nli-onnx` 분리 가능 — *그때 breaking change* 는 internal API only (kebab-app 만 영향, *wire 무관*). PR-9 단순화 우선.
+- §8 self-review 에 향후 split 시 trigger 명시.
+
+#### 2.2.1 Trait surface
+
+```rust
+#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct NliScores {
+    pub entailment: f32,    // production accept signal
+    pub neutral: f32,
+    pub contradiction: f32, // observability
+}
+
+impl NliScores {
+    pub fn faithfulness(&self) -> f32 { self.entailment }
+    pub fn from_xnli_logits(logits: [f32; 3]) -> Self { /* softmax + wrap */ }
+}
+
+pub trait NliVerifier: Send + Sync {
+    fn score(&self, premise: &str, hypothesis: &str) -> Result<NliScores>;
+}
+```
+
+`NliScores::faithfulness()` 가 `entailment` channel 반환 — production accept rule (`entailment >= threshold` = grounded).
+
+#### 2.2.2 OnnxNliVerifier
+
+- `ort::Session` (transitive download-binaries 으로 fastembed 와 같은 ONNX runtime).
+- `tokenizers::Tokenizer` (SentencePiece via mDeBERTa tokenizer.json).
+- `hf-hub::api::sync::Api` 가 first-run model + tokenizer download.
+- **Lazy init**: 첫 `score` 호출 시 model + tokenizer load. 후속 호출 reuse (OnceLock 또는 OnceCell).
+- **Cache dir**: `{config.storage.model_dir}/nli/<sanitized-model-id>/{model.onnx, tokenizer.json}`. fastembed 의 model cache 와 sibling. sanitization 은 `/` → `_` 로 (`Xenova/mDeBERTa-...` → `Xenova_mDeBERTa-...`).
+- **Failure handling**: download 실패 (network / disk / corrupt) 시 `RefusalReason::NliModelUnavailable` (단일 ask) 또는 facade construction 시 verifier=None 으로 graceful — §2.6 참조.
+
+#### 2.2.3 Input encoding + truncation
+
+mDeBERTa-v3 의 `max_seq_len = 512` token. multi-hop 의 packed_chunks (15 chunks × ~300-500 token = 4500-7500 token) 가 무조건 초과 → **명시적 truncation 정책 필수**:
+
+```rust
+let mut encoding_params = tokenizers::EncodeInput::Dual(premise, hypothesis);
+tokenizer
+    .with_truncation(Some(tokenizers::TruncationParams {
+        max_length: 512,
+        strategy: tokenizers::TruncationStrategy::OnlyFirst, // premise (chunks) 만 truncate
+        stride: 0,
+        direction: tokenizers::TruncationDirection::Right,    // 끝부터 잘림
+    }))?
+    .encode(encoding_params, /*add_special_tokens=*/true)?
+```
+
+- **`OnlyFirst`**: hypothesis (answer) 는 보전, premise (chunks) 끝부터 truncate. answer 가 잘리면 entailment 가 *임의로 fail* — 절대 회피.
+- **packed_text pre-budget in pipeline (옵션)**: `kebab-rag` 가 NLI 호출 전 packed_chunks 를 self-truncate. PR-9c-2 에서 helper `truncate_for_nli(premise: &str, hypothesis: &str) -> (String, bool)` 작성 — `max_seq_len = 512` token 에서 유도한 *char 단위* premise budget 을 helper 내부 상수 `MAX_NLI_PREMISE_CHARS` 로 hardcode (v0.18 scope 단일 NLI model 가정). signature single source of truth = §3 PR-9c-2.
+
+회귀 핀 (PR-9c unit test): `long_premise_truncation_preserves_hypothesis_score` — premise 가 10000-token 일 때 score 가 정상 (panic / NaN 없음). truncation indicator (`encoding.get_overflowing()`) 비어 있지 않음 검증.
+
+#### 2.2.4 Inference
+
+```
+input_ids        : [1, seq_len] i64
+attention_mask   : [1, seq_len] i64
+→ Session::run
+→ logits          : [1, 3]      f32
+→ softmax(logits) → NliScores
+```
+
+mDeBERTa-v3 ONNX export 는 `token_type_ids` input 없음 (segment embedding 미사용 가정 — premise / hypothesis pair 는 `[SEP]` token 으로만 구분). ort input name 확정:
+- input: `input_ids`, `attention_mask`
+- output: `logits`
+
+(PR-9a 의 pre-flight check 에서 ONNX 의 `onnx.SessionInfo::inputs()` / `outputs()` 출력으로 검증 후 lock — 다른 name 이면 spec 갱신.)
+
+### 2.3 Pipeline integration
+
+`RagPipeline::ask_multi_hop` 의 step 8.5 (synthesize 후, citation extract 전):
+
+**Empty answer (stream abort / LM crash) 의 처리**: synthesize 가 empty `acc` 반환 시 step 8.5 *skip* — 이미 별 path 의 refusal 처리 (예: `RefusalReason::LlmStreamAborted` for fb-33 cancel) 가 이전 단계에서 결정. 본 step 8.5 의 NLI verify 는 *non-empty answer* 에 대해서만 호출 — empty hypothesis 가 NLI tokenizer 의 edge case 진입 회피. PR-9c-2 의 `ask_multi_hop` integration 시 `if !acc.trim().is_empty() { /* step 8.5 */ }` 가드 추가.
+
+```rust
+// 8.5 — NLI groundedness verification (multi-hop only in v0.18 scope)
+// §2.7: single-pass `ask` 는 LlmSelfJudge 그대로. NLI 미적용.
+let verification = if self.config.rag.nli_threshold > 0.0 {
+    let v = self.verifier.as_ref().expect(
+        "verifier must be Some when nli_threshold > 0.0 \
+         (facade enforces this invariant in App::new)"
+    );
+    let (truncated_premise, _) = truncate_for_nli(&packed_text, &acc);
+    match v.score(&truncated_premise, &acc) {
+        Ok(scores) => {
+            let passed = scores.entailment >= self.config.rag.nli_threshold;
+            Some(VerificationSummary {
+                nli_score: scores.entailment,
+                nli_threshold: self.config.rag.nli_threshold,
+                nli_passed: passed,
+            })
+        }
+        Err(e) => {
+            // model unavailable / inference error → refusal path
+            tracing::warn!(target: "kebab-rag", error=%e, "NLI verifier failed");
+            return self.refuse_nli_model_unavailable(query, &opts, hops, started);
+        }
+    }
+} else {
+    None
+};
+if let Some(v) = &verification {
+    if !v.nli_passed {
+        return self.refuse_nli_verification(query, &opts, hops, v.clone(), started);
+    }
+}
+```
+
+- `nli_threshold = 0.0` (config default) → verify skip (backwards-compat for environments without model). 명시적 *single source of truth* — `enabled` field 별도 안 둠 (§2.6 참조).
+- `nli_threshold > 0.0` → verify ON. 권장 production 0.5 (multilingual NLI 의 한국어 보수). dogfood iteration 으로 tuning.
+- Inference error (model download fail, ONNX runtime panic 등) → `RefusalReason::NliModelUnavailable` (fail-closed).
+
+### 2.4 RefusalReason
+
+`kebab_core::RefusalReason` 에 신규 2 variant + wire mapping:
+
+| Rust variant | answer.v1 `refusal_reason` (snake) | error.v1 `code` (snake) | identical? |
+|---|---|---|---|
+| `NliVerificationFailed` | `"nli_verification_failed"` | `"nli_verification_failed"` | ✓ (predecessor `MultiHopDecomposeFailed` 패턴 정합 — noun + verb + state 순서) |
+| `NliModelUnavailable` | `"nli_model_unavailable"` | `"nli_model_unavailable"` | ✓ |
+
+두 wire string 모두 RefusalReason 과 error.v1.code 가 *동일* — consumer agent translation table 불요. predecessor `MultiHopDecomposeFailed` / `"multi_hop_decompose_failed"` 패턴 일관.
+
+**구현 시 결정**:
+- `RefusalReason::NliVerificationFailed` (Rust variant) → `#[serde(rename_all="snake_case")]` 가 자동으로 `"nli_verification_failed"` emit.
+- `answer.schema.json` 의 `refusal_reason.anyOf[0].enum` 에 두 값 추가.
+- `error.v1.code` enum 에 두 reservation 추가.
+- `error.v1.details.description` 의 per-code section 추가:
+  - `nli_verification_failed: { score, threshold }` (forward-looking, reserved).
+  - `nli_model_unavailable: { source }` (download / inference 실패 chain).
+
+### 2.5 Wire schema
+
+`answer.v1` 에 `verification` optional field:
+
+```json
+{
+  "schema_version": "answer.v1",
+  ...
+  "verification": {
+    "nli_score": 0.12,
+    "nli_threshold": 0.5,
+    "nli_passed": false
+  }
+}
+```
+
+- field naming: **`nli_score`** (단일 entire-answer NLI). future v0.19+ 의 atomic claim split 도입 시 `nli_min_score` / `nli_mean_score` 추가 가능 — 그때 별 wire bump.
+- `#[serde(default, skip_serializing_if = "Option::is_none")]` — additive minor. pre-v0.18 reader 무영향.
+- `$defs.VerificationSummary` 인라인 정의 (기존 `$defs.HopRecord` 패턴):
+  ```json
+  "$defs": {
+    "VerificationSummary": {
+      "type": "object",
+      "required": ["nli_score", "nli_threshold", "nli_passed"],
+      "properties": {
+        "nli_score":     { "type": "number" },
+        "nli_threshold": { "type": "number" },
+        "nli_passed":    { "type": "boolean" }
+      }
+    }
+  }
+  ```
+  `required` array 가 3 field 모두 present-when-non-null 명시 — strict consumer 정합 (HopRecord 패턴 답습).
+
+`refusal_reason.enum` 갱신 (`answer.schema.json` 의 `anyOf[0].enum` 에 추가):
+- `"nli_verification_failed"`
+- `"nli_model_unavailable"`
+
+### 2.6 Config knobs
+
+```toml
+[models.nli]
+# Production default = Xenova 의 ONNX export. 원본 PyTorch weight 는
+# MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7.
+model = "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
+provider = "onnx"            # only one supported in v0.18
+
+[rag]
+# 0.0 = NLI disabled (v0.18 default). > 0.0 = enable.
+# 권장 production 0.5 (multilingual NLI 의 한국어 confidence 보수).
+# strict 환경 0.9 (Auto-GDA / MedTrust-RAG paper 의 production threshold).
+nli_threshold = 0.0
+```
+
+**default 결정 — `nli_threshold = 0.0` (disabled)**:
+
+- backward-compat: 옛 config / 새 사용자 모두 *NLI off* 로 시작 → 280 MB first-run download 강제 없음.
+- opt-in flag: 사용자가 `[rag] nli_threshold = 0.5` 설정 시 NLI active.
+- single source of truth: code path `if self.config.rag.nli_threshold > 0.0 { verify } else { skip }`. `enabled` flag 별도 안 둠 — 두 gate 의 모순 회피.
+- **edge case — `nli_threshold = 0.0` 의 entailment=0.0 비교**: §2.3 코드의 outer guard `if self.config.rag.nli_threshold > 0.0 { ... }` 가 disabled path 를 *short-circuit* — `>=` 비교 (`entailment >= threshold`) 는 *active 분기 (threshold > 0.0) 에서만* 도달. 즉 `entailment=0.0` + `threshold=0.0` 시나리오는 guard 가 verify 자체 skip → `>= 0.0 = true` 통과 path 절대 발생 안 함. doc reader 헷갈림 회피.
+
+**default 결정 — `enabled` field 제거**:
+
+round-1 review 의 D3 / A3 발견: `[models.nli].enabled` + `[rag].nli_threshold` 두 gate 모순 위험. **`enabled` field 미도입** — single gate `nli_threshold` 만:
+- `nli_threshold = 0.0` → verify skip + model never loaded.
+- `nli_threshold > 0.0` → verify on + model lazy-loaded on first multi-hop ask.
+
+env override: `KEBAB_MODELS_NLI_MODEL`, `KEBAB_RAG_NLI_THRESHOLD`. legacy config 의 `#[serde(default)]` backward-compat — 옛 config.toml 그대로 parse + `nli_threshold = 0.0` (skip).
+
+**model download 실패 fallback**:
+
+- `nli_threshold > 0.0` + first-run model download 실패 (network / disk full / corrupt) → 모든 multi-hop ask 가 `RefusalReason::NliModelUnavailable` (fail-closed). stderr warn 명시. 사용자가 (a) `nli_threshold = 0.0` 으로 임시 disable 또는 (b) network / disk 복구 후 재시도.
+- 사유: silent skip (verify 우회) 은 *S7 hallucination 재발* — 보안 측면에서 fail-closed 가 안전.
+
+**download progress indicator**:
+
+- first-run `score` 호출 시 hf-hub download — stderr 에 simple progress (예: `kebab-nli: downloading model.onnx (280 MB)...`).
+- non-`--json` mode 만 progress emit. `--json` mode 는 quiet (wire output 의 노이즈 회피).
+
+### 2.7 Single-pass NLI 도 적용?
+
+학계 표준은 single-pass + multi-hop 양쪽. 그러나 single-pass 의 LlmSelfJudge 가 *gemma3:4b 환경에서* 작동 (S7 single-pass 가 grounded=false). 본 spec 의 v0.18 scope:
+
+- **multi-hop 만 NLI 적용** — large prompt + pool union 의 hallucination risk 가 single-pass 보다 압도적.
+- single-pass NLI 는 *v0.18.1 priority candidate* — §1.1 의 "LlmSelfJudge probabilistic safety" 인정 위에 *defense in depth*. config knob `[rag] nli_single_pass_enabled = false` (default) 별 PR 에서 추가.
+
+(round-1 wording "redundant safety" → "v0.18 scope priority" 로 조정 — §1.1 의 ceiling 주장과 일관.)
+
+## 3. PR-9 단계별 sub-PRs
+
+### PR-9a — kebab-nli crate skeleton
+
+**Goal**: trait surface + scaffolding + workspace dep chain 도입. implementation 없이도 build 가능.
+
+**Files**:
+- `Cargo.toml` (workspace):
+  - `members` 에 `"crates/kebab-nli"` 추가.
+  - `workspace.dependencies` 에 추가 (fastembed 의 transitive 와 *정확히 일치*):
+    - `ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }` (download-binaries 는 fastembed 의 transitive 활성화 의존 — features union).
+    - `tokenizers = { version = "0.21", default-features = false, features = ["onig"] }`.
+    - `hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }` (fastembed 의 `hf-hub-native-tls` 와 cargo features union 처리 — `rustls-tls` + `native-tls` 동시 활성화는 build OK 가정, PR-9b Risks 의 검증 항목 참조).
+    - `ndarray = "0.16"`.
+- `crates/kebab-nli/Cargo.toml` 신규 (skeleton 만, PR-9b 가 추가):
+  - `dependencies`: `kebab-config`, `anyhow`, `serde`.
+  - `dev-dependencies`: `tempfile`.
+- `crates/kebab-nli/src/lib.rs`:
+  - `NliScores` struct + `faithfulness()` + `from_xnli_logits()`.
+  - `NliVerifier` trait.
+  - private `softmax3` helper.
+- `crates/kebab-nli/src/onnx.rs`:
+  - `OnnxNliVerifier` placeholder struct.
+  - `OnnxNliVerifier::new(&Config) -> Result<Self>` placeholder.
+  - `impl NliVerifier::score → bail!("PR-9a stub")`.
+
+**Pre-flight check (PR-9a 시작 전, manual)**:
+
+1. **Model + tokenizer file 존재 검증** — §2.1 의 `curl -I` 두 commands → `200 OK` 확인. 실패 시 PR-9 design re-evaluation.
+2. **`tokenizers` features 검증** — mDeBERTa-v3 tokenizer.json 이 `Tokenizer::from_file` 로 *어떤 feature set* 필요한지 standalone repro 로 확인:
+   ```sh
+   cargo new --bin /tmp/nli-tok-probe
+   cd /tmp/nli-tok-probe
+   cargo add tokenizers --no-default-features -F onig
+   wget https://huggingface.co/Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7/resolve/main/tokenizer.json
+   # main.rs: tokenizers::Tokenizer::from_file("tokenizer.json").expect("load");
+   cargo run --release
+   ```
+   성공 시 PR-9a 의 `tokenizers = { ..., features = ["onig"] }` lock. 실패 시 *진단*:
+   - `unstable_wasm` 또는 다른 feature 가 SentencePiece 모듈 활성화에 필요한지 확인 (tokenizers 0.21 docs 참조).
+   - `default-features = true` 가 가장 안전 path — features 결정에 confidence 부족 시.
+
+**Cargo features 의 결정 trace**: 본 pre-flight 결과는 PR-9a 의 PR description 의 `## Cargo features 결정 trace` 절에 첨부 (`cargo run` 출력 + 최종 features set). spec lock value.
+
+**Tests** (6 unit):
+- `softmax3_normalises_to_unit` — sum = 1, monotonic.
+- `softmax3_is_invariant_to_constant_shift` — log-sum-exp 안전성.
+- `nli_scores_from_xnli_logits_orders_correctly` — high entailment → entailment 최대.
+- `faithfulness_returns_entailment_channel`.
+- `new_succeeds_on_default_config`.
+- `score_returns_err_in_skeleton` — stub 의 명시적 err 메시지.
+
+**검증**:
+- `cargo test -p kebab-nli -j 1` — 6 통과.
+- `cargo clippy -p kebab-nli --all-targets -j 1 -- -D warnings` clean.
+
+**Wire 영향**: 없음 (crate 만 도입).
+
+**Risks**:
+- workspace 의 ort / tokenizers / hf-hub 추가 → 전체 build 재 link (큰 변화 없음, fastembed 가 이미 transitive).
+- features union 위험 — `download-binaries` (fastembed) + `ndarray` (kebab-nli) 동시 활성화는 build OK 검증 필수.
+
+**시간**: 2-3h.
+
+### PR-9b — OnnxNliVerifier 의 ONNX inference + model download
+
+**Goal**: `OnnxNliVerifier::score` 의 진짜 implementation. model + tokenizer download / cache / inference 완성.
+
+**Dependency**: PR-9a 머지 완료.
+
+**Files**:
+- `crates/kebab-nli/Cargo.toml`:
+  - `ort`, `tokenizers`, `hf-hub`, `ndarray`, `tracing` 추가 (workspace.dependencies 에서).
+- `crates/kebab-nli/src/onnx.rs`:
+  - `OnnxNliVerifier` 의 fields:
+    - `model_id: String`.
+    - `cache_dir: PathBuf` (`config.storage.model_dir.join("nli").join(sanitize(model_id))`).
+    - `session: OnceLock<ort::Session>`.
+    - `tokenizer: OnceLock<tokenizers::Tokenizer>`.
+  - `OnnxNliVerifier::new(&Config) -> Result<Self>`:
+    - `model_id`, `cache_dir` stamp. actual session/tokenizer load *deferred*.
+  - `ensure_loaded(&self) -> Result<(&Session, &Tokenizer)>`:
+    - hf-hub download (cache hit 시 skip + warn 에서 hit/miss 명시).
+    - tokenizer.json 로드 → `Tokenizer::from_file`.
+    - model.onnx 로드 → `Session::builder().commit_from_file`.
+    - truncation params 설정 (§2.2.3).
+    - 두 OnceLock 에 store.
+  - `score(premise, hypothesis)`:
+    - `ensure_loaded()` 호출.
+    - `tokenizer.encode((premise, hypothesis), add_special_tokens=true)`.
+    - input_ids + attention_mask ndarray `[1, seq_len]` i64.
+    - `session.run(ort::inputs![...])`.
+    - `outputs["logits"].try_extract_tensor::<f32>()` → shape `[1, 3]`.
+    - `NliScores::from_xnli_logits([l0, l1, l2])`.
+  - `sanitize_model_id(s: &str) -> String` helper — `/` → `_`.
+- `crates/kebab-nli/tests/inference.rs` 신규:
+  - `#[ignore]` integration test — real model download + 5 forward pass cases:
+    1. `premise = "Caffeine is a stimulant.", hypothesis = "Caffeine is a stimulant."` → entailment 매우 높음 (>0.8).
+    2. `premise = "Caffeine is a stimulant.", hypothesis = "The chemical formula of caffeine is C8H10N4O2."` → entailment 낮음 (<0.3) — neutral/contradiction.
+    3. Korean: `premise = "사과는 빨갛다.", hypothesis = "사과는 색이 있다."` → entailment 높음.
+    4. Long premise (10000 char) → truncation 적용 후 정상 score (panic 없음).
+    5. Empty hypothesis → graceful error (panic 없음, err 반환).
+
+**Manual smoke protocol (PR-9b PR description 강제)**:
+
+PR description 의 `## 검증` 절에 다음 *manual run* 결과 첨부:
+```sh
+cargo test -p kebab-nli -j 1 --test inference -- --ignored 2>&1 | tail -20
+```
+- 5 test 모두 PASS 확인.
+- 첫 case (entailment 높음) 의 NliScores dump (예: `entailment=0.92, neutral=0.05, contradiction=0.03`).
+
+CI 부담 회피 위해 unit test (no `--ignored`) 만 CI 실행. ignored test 는 PR 작업자 manual.
+
+**검증**:
+- unit test 통과 + clippy clean.
+- `--ignored` integration test 의 manual run (PR 작업자 책임, PR body 첨부).
+
+**Wire 영향**: 없음 (crate-internal).
+
+**Risks**:
+- `ort` 2.0-rc.9 의 API stability — rc 라 minor 사이 incompat 가능. *mitigation*: workspace `ort = "=2.0.0-rc.9"` pin (fastembed 와 정확히 일치).
+- mDeBERTa-v3 의 ONNX export 가 Xenova HF Hub 에 존재 — §2.1 의 pre-flight check 가 PR-9a 시작 전 검증. 없으면 PR-9 design re-evaluation (다른 ONNX repo 또는 Optimum self-export).
+- `tokenizers` 0.21 의 SentencePiece 지원 — fastembed 가 BERT tokenizer 사용 (multilingual-e5-small), kebab-nli 가 mDeBERTa SentencePiece 사용 (다른 patterns). 첫 통합 위험.
+- `hf-hub` 0.4 의 `ureq + rustls-tls` features 가 workspace 의 다른 deps 와 incompat 없는지 — fastembed 의 `hf-hub-native-tls` 와 cargo features union 시 build OK 가정 (rustls-tls + native-tls 동시 활성화는 hf-hub crate features 가 mutually compatible 검증 필요).
+
+**시간**: **8-12h** (round-1 planner 의 6-8h underestimated 지적 반영). 첫 시도 실패 시 fallback (Optimum self-export) 까지 포함.
+
+### PR-9c — Pipeline integration (split: 9c-1 core + 9c-2 pipeline)
+
+**Goal**: kebab-rag pipeline 의 `ask_multi_hop` 에 NLI verify 통합. core types + wire + config 추가.
+
+**Dependency**: PR-9b 머지 완료.
+
+**round-1 review (P1 / M2) 분할 권장 반영** — 9c 를 **별 PR 2개로 분할** (9c-1 → 9c-2 sequential 머지). review 부담 분산 + git bisect 시 surface (wire/types) vs behavior (pipeline integration) 분리. 한 PR 의 commit 2개 방식보다 *별 PR* 채택 — round-1 P1 의 목적 (review 부담 ↓) 정합.
+
+#### PR-9c-1 — Core types + wire scaffolding (breaking surface)
+
+**Files**:
+- `crates/kebab-core/src/answer.rs`:
+  - `RefusalReason::NliVerificationFailed` + `RefusalReason::NliModelUnavailable` 신규.
+  - `Answer.verification: Option<VerificationSummary>` field.
+  - `VerificationSummary { nli_score: f32, nli_threshold: f32, nli_passed: bool }` 신규 struct.
+- `crates/kebab-config/src/lib.rs`:
+  - `NliCfg` 신규 struct + `[models.nli]`:
+    - `model: String` (default `"Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"`).
+    - `provider: String` (default `"onnx"`).
+  - `RagCfg.nli_threshold: f32` (default `0.0`).
+  - env override + legacy parse 단위 test.
+- `crates/kebab-rag/src/pipeline.rs`:
+  - `RagPipeline` 의 새 field: `verifier: Option<Arc<dyn NliVerifier>>` (None = verify off).
+  - **시그니처 widening 결정 = Option B (builder pattern)**:
+    - 기존 `RagPipeline::new(config, retriever, llm, sqlite)` 시그니처 *유지* (backward-compat for 18+ existing call sites).
+    - 신규 `pub fn with_verifier(self, v: Arc<dyn NliVerifier>) -> Self` builder.
+    - `kebab-app` facade 만 `with_verifier` 호출. 다른 caller (cli/tui/mcp tests) 무영향.
+    - Cargo.toml: `kebab-rag` 가 `kebab-nli` 의존 추가.
+- `docs/wire-schema/v1/answer.schema.json`:
+  - `verification` field 추가 (anyOf [object, null]) + `$defs.VerificationSummary` 인라인.
+  - `refusal_reason.enum` 에 `"nli_verification_failed"`, `"nli_model_unavailable"` 추가.
+- `docs/wire-schema/v1/error.schema.json`:
+  - `code` enum 에 `nli_verification_failed`, `nli_model_unavailable` 추가.
+  - `details.description` 에 두 항목 추가 (`multi_hop_decompose_failed: {}` 패턴 그대로).
+
+**Tests**:
+- `crates/kebab-config/src/lib.rs::tests`:
+  - `default_nli_threshold_is_zero`.
+  - `default_nli_model_is_xenova_mdeberta`.
+  - `legacy_config_without_nli_uses_defaults`.
+  - `env_override_nli_threshold`.
+- `crates/kebab-cli/tests/wire_ask_multi_hop.rs`:
+  - `answer_schema_declares_verification_field_and_defs`.
+  - `answer_schema_refusal_reason_enum_includes_nli_verification_failed` (+ `nli_model_unavailable`).
+  - `error_schema_code_enum_includes_nli_verification_failed` (+ `nli_model_unavailable`).
+
+**검증**:
+- `cargo test --workspace -j 1` — 회귀 0 (기존 multi-hop tests pass, RagPipeline::new 시그니처 unchanged).
+- `cargo clippy --workspace --all-targets -j 1 -- -D warnings` clean.
+
+**시간**: 2-3h.
+
+#### PR-9c-2 — Pipeline integration + mock test
+
+**Dependency**: PR-9c-1 머지 완료 (core types: `RefusalReason::Nli*` variants + `Answer.verification` field + `RagPipeline.verifier` field + `kebab-nli` 의 trait + config knobs 가 9c-1 에서 도입). 9c-2 가 그 위에 *behavior* 통합.
+
+**Files**:
+- `crates/kebab-rag/src/pipeline.rs`:
+  - `ask_multi_hop` 의 step 8.5 NLI hook (§2.3 코드).
+  - `refuse_nli_verification` helper (`refuse_*` 패턴) — `verification: Some(...)` 채움.
+  - `refuse_nli_model_unavailable` helper — `verification: None`.
+  - `pub fn truncate_for_nli(premise: &str, hypothesis: &str) -> (String, bool)` helper (§2.2.3 packed_text pre-budget). signature: 첫 return = truncated premise (max char count = `MAX_NLI_PREMISE_CHARS = 4 * 400` ≈ 1600 chars, hypothesis 길이 빼고 special tokens 32 char budget 적용 후 자연 보존). 둘째 return = was_truncated boolean (caller 가 tracing log 또는 wire 의 `verification` extension 에서 사용 가능 — v0.18 wire 추가 안 함, future v0.19+ candidate).
+  - **`MAX_NLI_PREMISE_CHARS` 의 token ratio 가정**: 4 char ≈ 1 token (영어 BPE 기준, mDeBERTa-v3 의 default). 한국어 SentencePiece 는 1-2 char/token (한 음절 = 1 token 통상) — 1600 chars 한국어 = 800-1600 tokens, max_seq_len 512 초과 가능. 이때 tokenizer 의 `OnlyFirst` truncation 가 backup 으로 작동 (premise 끝부터 잘림, hypothesis 보전). dogfood retest 의 S10 (KR) NLI score 측정 후 가능하면 *token-count 기반 budget* 으로 v0.18.1 갱신 — char-based budget 의 EN-biased 보정.
+- `crates/kebab-app`:
+  - `App::new` 또는 `pipeline_from_config` 가 NliVerifier 생성:
+    - `config.rag.nli_threshold > 0.0` → `OnnxNliVerifier::new(config)` 호출 + `Arc::new` wrap.
+    - `config.rag.nli_threshold == 0.0` → verifier = None.
+  - **facade invariant 결정 — `Result<App, anyhow::Error>` (construction-time error)**: `App::new` (plan round-2 R2-NIT-4 기준 실제 constructor 는 `App::open_with_config`) 가 `Result<Self, anyhow::Error>` 반환. `config.rag.nli_threshold > 0.0` + `OnnxNliVerifier::new` 실패 시 `bail!()` — user-facing crash 회피. `RagPipeline.verifier == None` + `config.rag.nli_threshold > 0.0` 의 *unreachable* 조합은 `expect("App::new enforces invariant")` safety net 만 — 정상 path 도달 불가능. round-2 critic NEW-M2 closure.
+- `crates/kebab-rag/tests/multi_hop.rs`:
+  - `common/mod.rs` 에 `MockNliVerifier { scores: NliScores }` helper.
+  - `multi_hop_nli_pass_keeps_grounded` — entailment 0.9 → grounded=true, verification.nli_passed=true.
+  - `multi_hop_nli_fail_refuses` — entailment 0.1 → refusal=NliVerificationFailed.
+  - `multi_hop_nli_disabled_skip_verify` — threshold = 0.0 → verify skip, verification=None.
+  - `multi_hop_nli_model_unavailable_refuses` — verifier Err → refusal=NliModelUnavailable.
+  - `multi_hop_truncate_for_nli_preserves_hypothesis` — long premise + 짧은 hypothesis → hypothesis 그대로.
+- `integrations/claude-code/kebab/SKILL.md`:
+  - `mcp__kebab__ask` 절에 NLI 안내 한 줄 — `answer.v1.verification.nli_passed` 의미 + threshold tuning 가이드 + `nli_verification_failed` / `nli_model_unavailable` refusal 처리.
+
+**Tests**: 5 신규 multi-hop tests (위 list) + 기존 tests 회귀 0.
+
+**검증**:
+- `cargo test --workspace -j 1` — 모든 test 통과 + 신규 5 multi-hop pass.
+- `cargo clippy --workspace --all-targets -j 1 -- -D warnings` clean.
+
+**Wire 영향**: PR-9c-1 의 wire schema 변경에 *behavior wiring* — `verification` field 가 multi-hop ask 의 happy path / refuse path 양쪽에서 채움.
+
+**시간**: 3-4h.
+
+**Total PR-9c (1+2)**: 5-7h (round-1 4-6h underestimated 반영 → 5-7h).
+
+### PR-9d — Dogfood retest + HOTFIXES closure
+
+**Goal**: PR-9c 머지 후 같은 dogfood corpus 에서 S7 + S1 + S3 + S10 retest. PR-9 의 진짜 작동 확인.
+
+**Dependency**: PR-9c 머지 완료.
+
+**Scope**: 본 *PR* 가 아닌 *별 commit* 로 가능성 ↑:
+- repo 변경 = `tasks/HOTFIXES.md` 의 "PR-9 closure" sub-section 추가 + (선택) `docs/dogfood/v0.18.0/` 의 dogfood result snapshot.
+- `/build/cache/dogfood-v018/results/post-pr9/` 는 repo 외 (gitignore 처럼).
+- **결정**: PR (gitea-pr) 또는 main 직접 commit 둘 다 가능. 작업자 선택 — review 부담 ↓ 우선이면 commit, audit trail 우선이면 PR. *본 spec 의 default = PR* (다른 PR 패턴과 일관).
+
+**Files**:
+- `tasks/HOTFIXES.md`:
+  - "PR-9 closure (post-v0.18 dogfood retest)" sub-section 추가 — pre/post 결과 비교 표.
+- `docs/dogfood/v0.18.0/` (신규 디렉토리):
+  - `SUMMARY.md` — sanitized dogfood 보고서 (원본 `/build/cache/dogfood-v018/results/SUMMARY.md` 의 repo 포함 가능 부분).
+  - `s7-multihop-post-pr9.json` — S7 multi-hop NLI 결과 sample (refuse + nli_score).
+  - `s1-multihop-post-pr9.json` — S1 multi-hop NLI 결과 sample (grounded + nli_score).
+- `/build/cache/dogfood-v018/results/post-pr9/` (작업 디렉토리, repo 외):
+  - 시나리오별 JSON dump + findings.md.
+
+**Tests**: 자동화 없음. 사용자 환경 (release binary + Ollama gemma3:4b + NLI model first-run) 에서 manual run:
+
+- `[rag] nli_threshold = 0.5` config (production 권장값).
+- S7 / S1 / S3 / S10 query → 각각 NLI score 측정 + grounded/refuse 확인.
+- **RAM peak 측정 protocol** (round-2 critic gap 반영) — 시작 전 `ps -o rss=,vsz=,comm= -p $(pgrep -f 'ollama|kebab')` baseline. multi-hop ask 진행 중 1초 간격 sampling (5분 cap) — `while sleep 1; do ps ... | awk '{sum+=$1} END {print sum}'; done > /tmp/ram-S<N>.log` (sample 마다 Ollama + kebab + NLI model 의 RSS 합산, KiB 단위 한 줄). peak RSS = `awk '$1>max {max=$1} END {print max}' /tmp/ram-S<N>.log` (합산값의 최대). 16 GB 환경 OOM 없는지 + peak < 10 GB 확인. release notes 의 권장 RAM (peak + 4 GB headroom) 한 줄 명시.
+
+**Pre-run prereq (manual + subagent 양쪽 적용)**: PR-9d 시작 전 환경 검증 — manual run 작업자 또는 subagent dispatch 모두 동일 prereq:
+
+- Ollama service running (`curl -s 127.0.0.1:11434/api/tags`).
+- dogfood corpus 디렉토리 존재 (`/build/cache/dogfood-v018/queries/*.txt`).
+- network reachable (hf-hub 의 280 MB NLI model first-run download 가능).
+- free RAM ≥ 6 GB (peak headroom).
+- release binary path: `/build/out/cargo-target/release/kebab` (CARGO_TARGET_DIR 활용 environment) 또는 `./target/release/kebab` (default in-tree).
+
+prereq 실패 시 subagent 가 *조기 abort* + 사용자 보고. *partial* dogfood 결과 commit 회피.
+
+**Expected (PASS criteria)**: §7 verification plan 의 acceptance criteria 표 단일 source of truth. 본 절에서는 *워크플로우 설명* 만 — measurement value 와 threshold 결정은 §7 표에서. duplication 회피 (round-4 R4-NEW-M1 + R4-NEW-N1 closure).
+
+dogfood iteration 결과에 따른 default 조정 trigger:
+- S1 의 entailment 가 0.6 미만이면 *legitimate answer 가 reject* 의 false positive — threshold 조정 (`nli_threshold = 0.3` 등) 또는 NLI model 교체 (xlm-roberta-large) 검토.
+- S3/S10 의 acceptable degraded outcome 이 50% 이상이면 multilingual NLI 의 한국어 confidence 약함 — model 교체 또는 token-count budget 갱신 (R3-NEW-N1 의 v0.18.1 candidate).
+
+**Wire 영향**: 없음 (docs only).
+
+**시간**: 4-6h (round-1 P3 PR vs commit 결정 + RAM 측정 + dogfood corpus 보존 추가).
+
+## 4. 이미 머지된 PR-1 ~ PR-8 의 결과
+
+| PR | 변경 | 상태 |
+|---|---|---|
+| #166 PR-1 | multi-hop eval golden set | ✅ |
+| #167 PR-2 | `ask_multi_hop` skeleton (fixed depth=2) | ✅ |
+| #168 PR-3a | HopRecord wire + RagCfg knobs | ✅ |
+| #169 PR-3b-i | dynamic decide loop + helpers | ✅ |
+| #170 PR-3b-ii | ScriptedLm + 7 multi-hop tests + refusal hop trace | ✅ |
+| #171 PR-4 | CLI `--multi-hop` flag + wire schema | ✅ |
+| #172 PR-5 | MCP `multi_hop: bool` arg + SKILL.md | ✅ |
+| #173 PR-6 | TUI F2 toggle + badge + hops summary | ✅ |
+| #174 PR-7 | pre-decompose probe gate (S7 1차 fix) | ✅ |
+| #175 PR-8 | synthesize prompt rule + pool 30→15 (S7 2차 partial mitigation) | ✅ |
+
+frozen design contract (`2026-05-25-p9-fb-41-multi-hop-rag-design.md`) 의 PR-3 분할 (3a/3b-i/3b-ii) + PR-7 / PR-8 추가는 *post-merge deviation*. HOTFIXES 에 기록 (이미 dated entries 존재).
+
+## 5. v0.18.0 cut PR (PR-9d 머지 후, 별 PR `chore: cut v0.18.0`)
+
+**바람직한 patterns** (round-1 M7 / D2 / M6 모두 반영):
+- `v0.18.0` bump + tag = **같은 commit** (CLAUDE.md "Release / binary version bump" rule).
+- frozen design §3.8 갱신은 *본 cut PR 안* 에서 (PR-9c 가 design contract 변경 안 함, 머지 후 한꺼번에).
+- `gitea-release` tag 는 본 PR 머지 commit 위에 즉시.
+
+**한 commit 내용 (또는 짧은 PR scope)**:
+1. **Workspace `Cargo.toml` version** 0.17.2 → 0.18.0 (minor bump).
+   - surface 확장: CLI `--multi-hop`, MCP `multi_hop`, TUI F2, answer.v1 `hops` + `verification`.
+   - prompt_template_version: `rag-multi-hop-v1` (PR-2 이후, 변경 없음).
+   - safety fix: PR-7 + PR-8 + PR-9.
+   - `Cargo.lock` 자동 cascade.
+2. **HANDOFF.md**:
+   - 한 줄 요약 (P0~P9 + P10 + v0.18.0 fb-41 multi-hop ship).
+   - 머지 후 결정 절에 fb-41 entry 단락 (PR-1~PR-9 + dogfood + NLI 한 문단).
+3. **HOTFIXES.md**:
+   - PR-9 closure sub-section anchor 정리 (`post-v0.18`).
+   - 기존 fb-41 entry 들 `post-v0.18` anchor.
+4. **INDEX.md**:
+   - fb-41 status `open` → `completed`.
+   - v0.18.0 release subheader (fb-41 multi-hop + NLI verification).
+5. **frozen design** (`docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`):
+   - §3.8 RAG 의 multi-hop sub-section 추가 — 본 finalize spec 의 §1-§3 요약을 verbatim 형식으로 inline.
+   - §9 versioning cascade 표에 (선택) `nli_model_version` row — *config knob 변경만 cascade 영향, embedding 처럼 chunks 재-index 불요* 명시.
+6. **integrations/claude-code/kebab/SKILL.md**:
+   - PR-9c-2 에서 *비활성* 상태 NLI 안내 추가됨. cut PR 에서 v0.18.0 release notes link 한 줄.
+7. **README**:
+   - `kebab ask --multi-hop` + NLI 옵션 안내 한 단락 (model first-run download cost, RAM 권장).
+   - binary path confusion (round-1 N1 / dogfood SUMMARY §부수 발견) 한 줄 — `CARGO_TARGET_DIR` 활용 시 `/build/out/cargo-target/release/kebab` 명시.
+8. **`docs/SMOKE.md`**:
+   - NLI 옵션 활성화 절차 ([rag] nli_threshold = 0.5).
+   - first-run model download 안내 (~280 MB to `{data_dir}/models/nli/`).
+   - RAM 권장 (NLI active + Ollama **gemma3:4b** (권장 모델) 동시 — peak RSS ~5-6 GB; 16 GB 머신에서 OK). **8B+ Q4 모델** (gemma4:e4b 8B / gemma2:9b 등) 사용 시 *추정* peak ~10 GB — 16 GB 환경 경계, OOM risk 별 안내 한 줄.
+
+**같은 commit 의 PR title + tag**:
+- Commit msg: `chore: bump version 0.17.2 → 0.18.0 + cut fb-41 multi-hop`.
+- gitea-release: `v0.18.0` tag *본 commit* 위.
+- Release notes (자동 `--auto-notes`):
+  ```
+  # v0.18.0 — fb-41 multi-hop RAG ship + NLI verification
+  
+  ## 새 surface
+  - CLI: `kebab ask --multi-hop <query>` — multi-hop reasoning.
+  - MCP: `ask` tool `multi_hop: true` argument.
+  - TUI: Ask 패널 F2 toggle + multi-hop badge + hops summary.
+  
+  ## 새 wire
+  - `answer.v1.hops` — multi-hop per-iter trace (decompose/decide/synthesize).
+  - `answer.v1.verification` — NLI groundedness score (`nli_threshold > 0.0` 일 때).
+  - `error.v1.code` enum 확장: `multi_hop_decompose_failed`, `nli_verification_failed`, `nli_model_unavailable`.
+  
+  ## 새 config
+  - `[rag] multi_hop_max_depth` (default 3), `multi_hop_max_sub_queries_per_iter` (5), `multi_hop_max_pool_chunks` (15).
+  - `[rag] nli_threshold` (default 0.0 — disabled; 권장 production 0.5).
+  - `[models.nli] model` (default `Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7`).
+  
+  ## 새 RefusalReason
+  - `multi_hop_decompose_failed`, `nli_verification_failed`, `nli_model_unavailable`.
+  
+  ## 권장 환경
+  - LLM: gemma3:4b (CPU only, 16 GB RAM 권장).
+  - NLI 활성화 시: ~280 MB first-run download to `{data_dir}/models/nli/`.
+  - RAM peak (NLI active + Ollama 동시, **gemma3:4b 기준**): ~5-6 GB (16 GB 환경 OK). 8B+ Q4 모델 (gemma4:e4b 8B / gemma2:9b 등) 은 *추정* peak ~10 GB — 16 GB 경계.
+  
+  ## Known limitations
+  - single-pass NLI 미적용 (v0.18.1 priority).
+  - atomic claim split 미적용 (entire answer = 1 NLI call).
+  - GPU acceleration 미지원 (CPU ONNX runtime).
+  
+  ## 도그푸딩
+  - dogfood corpus snapshot: `docs/dogfood/v0.18.0/`.
+  - HOTFIXES dated entries 의 PR-9 closure 절 참조.
+  ```
+
+## 6. 한계 / 미해결 (v0.18.1+ 또는 P+)
+
+- **NLI single-pass 적용** — v0.18 scope 외. `[rag] nli_single_pass_enabled = false` (default) 별 PR.
+- **NLI threshold tuning** — production 표준 0.9, kebab 권장 enable 값 0.5 (config default 는 0.0 disabled, §2.6 참조; multilingual NLI 의 한국어 confidence 보수). PR-9d dogfood 후 적정값 결정 — *measured value* 가 default 갱신 또는 doc 권장값 갱신.
+- **Atomic claim split** — 현재 entire answer 1 claim. LLM-based claim extraction (별 LLM call) 은 v0.19+. wire field `nli_score` 가 *single* 인 이유.
+- **NLI false negative** — strong paraphrase → reject. dogfood 측정 후 threshold 조정 또는 model 교체 (xlm-roberta-large 1.5 GB).
+- **GPU acceleration** — ort 의 CUDA execution provider 가능. v0.19+ 사용자 환경 의존.
+- **release binary path confusion** — `target/release/kebab` (in-tree) vs `/build/out/cargo-target/release/kebab` (CARGO_TARGET_DIR). v0.18.0 cut PR 의 README 한 줄 (§5 의 step 7 포함) — *deferred 아닌 closure*.
+- **Future LLM 의 ceiling 측정** — gemma4:e4b / qwen2.5:7b / larger 의 prompt-following 측정. NLI vs LLM-upgrade 의 ROI 재평가. v0.19+ dogfood agenda.
+- **NLI model 양자화 (Q8 INT8)** — 280 MB FP32 → ~70 MB INT8 (`Xenova/.../onnx/model_quantized.onnx`). accuracy 미세 저하. v0.19+ config knob `[models.nli] quantization = "fp32" | "q8"`.
+
+## 7. 검증 plan (PR-9d acceptance criteria)
+
+각 sub-PR 가 자체 회귀 핀. PR-9d 의 dogfood retest 가 *integration-level* 검증.
+
+**측정 환경 (전체 표 공통)**: `[rag] nli_threshold = 0.5` (production 권장값). *NLI score 자체* 가 expected range 안인지가 PASS — `nli_passed` boolean 은 threshold 함수라 redundant. dogfood 작업자가 다른 threshold 로 측정 시 (예: 0.3) 결과 해석 다를 수 있어 spec 가 *threshold lock*.
+
+| 시나리오 | path | primary expected | acceptable degraded | nli_score range | latency expected |
+|---|---|---|---|---|---|
+| S7 (caffeine, KB outside, EN) | multi-hop NLI | grounded=false, refusal=`nli_verification_failed` | (없음 — 반드시 NLI refuse) | **< 0.3** | 158s + NLI ~50ms (PR-7 probe gate 가 RRF top_score 0.5 > 0.30 통과시키므로 multi-hop pipeline 전체 진행 후 step 8.5 NLI refuse) |
+| S1 (compiler compound, KR) | multi-hop NLI | grounded=true, refusal=None | (없음 — 반드시 grounded) | **≥ 0.6** | 158-200s + NLI |
+| S3 (retrieval stack, **EN**) | multi-hop NLI | grounded=true, refusal=None | grounded=false + LlmSelfJudge (paraphrase 강한 EN→KR sub-queries 의 entailment 약함) — *citation marker 누락 잔존 issue, NLI 자체는 통과* | **≥ 0.5** | 같은 range |
+| S10 (dinosaur, KB outside, KR) | multi-hop NLI | grounded=false, refusal=`nli_verification_failed` | grounded=false + LlmSelfJudge (NLI 의 한국어 confidence 낮으면 LLM self-judge 가 reject path) | **< 0.4** | 590s |
+| S7 single-pass | single-pass (NLI 미적용) | grounded=false, LlmSelfJudge | (없음) | n/a (verification field 없음) | 30s |
+
+**Primary vs degraded acceptable** (round-2 critic P-M5 closure):
+- S7: NLI refuse 가 본 PR-9 의 *core 검증* — degraded outcome 허용 안 함. NLI 가 안 refuse 면 *PR-9 가 작동 안 함*.
+- S1: legitimate compound query — NLI 가 reject 시 *false positive*. degraded outcome 허용 안 함.
+- S3 / S10: NLI 의 한국어 confidence / paraphrase 강도가 multilingual NLI 의 known weakness. primary 우선 기대지만 degraded LlmSelfJudge 도 안전한 fail-closed path 라 acceptable. *그러나 degraded 가 50% 이상* 시 NLI 효과 약함 — threshold 조정 또는 model 교체 (xlm-roberta-large) 검토.
+
+PR-9d 의 PASS: S7 + S1 primary expectation 모두 충족 + S3/S10 의 primary 또는 acceptable degraded. range 밖 시 threshold 또는 model 재검토.
+
+**RAM peak 측정** (protocol 은 §3 PR-9d 참조):
+- Ollama RSS + kebab-cli RSS + NLI model RSS = peak 약 5-6 GB.
+- 16 GB 환경에서 OOM 없는지 확인. release notes 의 권장 RAM 명시.
+
+## 8. self-review notes
+
+- **PR-9 의 ONNX integration** 가 *새 dep chain* (ort + tokenizers + hf-hub) 도입 — 첫 사용 안정화 필요. PR-9b 의 `#[ignore]` test 의 manual smoke protocol (PR description 강제 첨부) 이 *production binary 의 실제 동작 검증* path.
+- **multi-hop NLI 의 latency 추가** — current multi-hop synthesize 158s + NLI ~50ms ≈ 158s. negligible.
+- **Model first-run download (~280 MB)** — 사용자 도그푸딩 환경 (CPU only) 의 disk + download bandwidth 1회 비용. README + SMOKE 안내. fail-closed download failure 정책.
+- **`RagPipeline::new` 시그니처 widening — Option B (builder) 결정** (round-1 A2 반영). 기존 시그니처 유지 + `with_verifier` builder. 18+ existing call sites 무영향.
+- **frozen design contract §3.8 갱신 timing — v0.18.0 cut PR 안** (round-1 M6 / D2 반영). PR-9c 가 contract 변경 안 함 — implementation 만. cut PR 에서 contract + implementation 결과 동시 갱신.
+- **kebab-nli 의 trait + impl 동일 crate** (round-1 A4 deferred 결정 명시) — v0.18 scope = adapter 1개. v0.19+ 에 multi-adapter 등장 시 `kebab-nli-onnx` 분리 (그 시점에 internal API breaking, wire 무관).
+- **single-pass NLI deferred wording** "v0.18 scope priority — multi-hop hallucination risk 가 single-pass 보다 큼" 으로 round-1 wording 조정 (M9 반영).
+- **alternative root cause 검토** (M1 반영) — §1.2 의 4 alternatives 비교 표. NLI 채택 ROI justification 강화.
+- **PR-9c 분할** (M2 / P1 반영) — 9c-1 (core types) + 9c-2 (pipeline integration).
+- **PR-9d PR vs commit** (P3 반영) — PR default, 작업자 선택 가능.
+- **dogfood corpus 보존** (P5 반영) — `docs/dogfood/v0.18.0/` 신규 dir + sanitized SUMMARY + sample JSON.
+- **RAM cold-start 측정** (P6 반영) — PR-9d 의 PASS criteria 에 포함.
+- **ort version pin** (P7 반영) — `workspace.dependencies.ort = "=2.0.0-rc.9"` (fastembed transitive 와 정확히 일치).
+- **integrations/claude-code/kebab/SKILL.md NLI 안내** (D6 반영) — PR-9c-2 에서 추가.
+
+## 9. round-1 review 의 issue closure 매트릭스
+
+| reviewer | issue | resolution |
+|---|---|---|
+| architect | A1 model ID 불일치 | §2.1 — Xenova/... config default + MoritzLaurer/... 원본 출처 명시 |
+| architect | A2 widening path 미결정 | §8 / §3 PR-9c-1 — Option B (builder) 결정 |
+| architect | A3 config default 모순 | §2.6 — `enabled` field 제거 + `nli_threshold` single gate |
+| architect | A4 crate split | §2.2 + §8 — v0.18 단일 crate, future split trigger 명시 |
+| architect | A5 ort version + feature | §3 PR-9a + §8 — `ort = "=2.0.0-rc.9"` pin, fastembed transitive 와 정확히 일치 |
+| architect | A6 cache_dir → model_dir | §2.2.2 — `config.storage.model_dir.join("nli")` |
+| architect | A7 §2.3 single-pass 주석 | §2.3 — 주석에서 single-pass 제거 + §2.7 cross-ref |
+| critic | C1 truncation strategy | §2.2.3 — `OnlyFirst` + `truncate_for_nli` helper + 회귀 핀 |
+| critic | M1 alternative root cause | §1.2 — 4 alternatives 비교 표 |
+| critic | M2 PR-9c scope 과대 | §3 — 9c-1 + 9c-2 분할 |
+| critic | M3 9b smoke protocol | §3 PR-9b — manual smoke + PR description 강제 첨부 |
+| critic | M4 threshold default 모순 | §2.6 + §7 — default 0.0 (disabled), production 권장 0.5, dogfood 측정값 별 명시 |
+| critic | M5 S1 acceptance criteria | §7 — measured value range 표 |
+| critic | M6 frozen design timing | §5 + §8 — cut PR 안에 통합 |
+| critic | M7 bump same-commit | §5 — 같은 commit 명시 + tag |
+| critic | M8 download fallback | §2.6 — fail-closed + NliModelUnavailable + warn |
+| critic | M9 single-pass deferred wording | §2.7 + §8 — wording 조정 |
+| critic | N1 binary path | §5 step 7 — README 한 줄 |
+| critic | N2 threshold 0.0 edge | §2.6 — doc comment 명시 |
+| critic | N3 wire naming | §2.4 — `nli_verification_failed` + `nli_model_unavailable` (snake 통일) |
+| planner | P1 9c scope | M2 와 같음 — 분할 |
+| planner | P2 9b 시간 | §3 PR-9b — 8-12h 갱신 |
+| planner | P3 9d PR vs commit | §3 PR-9d — PR default, 작업자 선택 |
+| planner | P4 model pre-flight | §2.1 + §3 PR-9a — pre-flight curl check |
+| planner | P5 dogfood 보존 | §3 PR-9d + §5 step 5 — `docs/dogfood/v0.18.0/` 신규 |
+| planner | P6 RAM cold-start | §7 — PR-9d acceptance criteria + release notes |
+| planner | P7 ort pin | §3 PR-9a — `"=2.0.0-rc.9"` |
+| planner | P8 frozen design timing | M6 와 같음 — cut PR 안 |
+| document | D1 schema refusal_reason.enum | §3 PR-9c-1 — `nli_verification_failed` + `nli_model_unavailable` |
+| document | D2 frozen design timing | M6 와 같음 |
+| document | D3 enabled/threshold | A3 와 같음 — `enabled` 제거 |
+| document | D4 error.v1 description | §2.4 — per-code description 갱신 |
+| document | D5 Xenova vs MoritzLaurer | A1 와 같음 — §2.1 명시 |
+| document | D6 SKILL.md | §3 PR-9c-2 — multi-hop ask 절에 NLI 안내 |
+
+### Round-2 issues (post-spec-v2 review)
+
+| reviewer | round-2 issue | round-3 resolution |
+|---|---|---|
+| document | ISSUE-1 RefusalReason rename | §2.4 — `FailedNliVerification` → `NliVerificationFailed`. wire `"nli_verification_failed"` 가 RefusalReason + error.v1.code 양쪽 동일. mapping 표 §2.4 inline. |
+| document | NIT-2 VerificationSummary required | §2.5 — `$defs.VerificationSummary` 의 `required: ["nli_score", "nli_threshold", "nli_passed"]` 명시. HopRecord 패턴. |
+| critic | P-M4 threshold context | §7 — 측정 환경 명시 (`nli_threshold = 0.5` lock). |
+| critic | P-M5 S3/S10 multi-outcome | §7 — primary expected + acceptable degraded 컬럼 + 50% 이상 degraded 시 model 재검토 명시. |
+| critic | P-N2 entailment=0.0 edge | §2.6 — outer guard `> 0.0` 가 disabled path short-circuit + `>=` 비교는 active 분기 도달. doc comment 명시. |
+| critic | P-N3 wire naming | §2.4 — RefusalReason wire 도 `nli_verification_failed` 통일. mapping 표 명시. document ISSUE-1 와 같음. |
+| critic | NEW-M1 tokenizers features | §3 PR-9a — pre-flight 의 standalone repro (`cargo new --bin nli-tok-probe ...`). Cargo features 결정 trace 를 PR description 의 별 절에 첨부. |
+| critic | NEW-M2 facade panic vs error | §3 PR-9c-2 — `App::new` 가 `Result<App, anyhow::Error>` 반환. `OnnxNliVerifier::new` 실패 시 `bail!`. unreachable safety net 만 `expect()`. |
+| critic | NEW-N1 truncate_for_nli signature | §3 PR-9c-2 — `pub fn truncate_for_nli(premise: &str, hypothesis: &str) -> (String, bool)` 명시. second = was_truncated. |
+| critic | NEW-N2 empty hypothesis | §2.3 — `if !acc.trim().is_empty()` guard. empty answer 는 step 8.5 skip — 다른 path (LlmStreamAborted 등) 가 처리. |
+| critic | What's missing RAM protocol | §3 PR-9d — `ps -o rss` 1초 sampling. peak < 10 GB 검증. |
+| critic | What's missing S3 EN | §7 — S3 표 row 의 language `(EN)` 명시. |
+| planner | round-2 nit #1 9c-1/9c-2 별 PR | §3 PR-9c — "별 PR 2개로 분할 (9c-1 → 9c-2 sequential 머지)" 명시. |
+| planner | round-2 nit #2 9c-2 dependency | §3 PR-9c-2 — "Dependency: PR-9c-1 머지 완료" 명시. |
+| planner | round-2 nit #3 시간 합산 | spec self-review 의 시간 합산 19-28h (plan v2 갱신 시 정정 예정). |
+| planner | round-2 nit #4 9d subagent prereq | §3 PR-9d — Ollama running + corpus 존재 + network reachable + free RAM 검증 prereq 명시. |
+
+### Round-3 issues (post-spec-v3 review)
+
+| reviewer | round-3 issue | round-4 resolution |
+|---|---|---|
+| critic | R3-NEW-M1 truncate_for_nli signature mismatch | §2.2.3 — *3-arg* recommendation 제거, `(premise, hypothesis) -> (String, bool)` 단일 source. signature lock = §3 PR-9c-2. |
+| critic | R3-NEW-M2 S7 latency wrong baseline | §7 표 S7 row — `158s + NLI ~50ms` (multi-hop pipeline 전체 진행 후 step 8.5 refuse). probe gate pass 가 원인 설명 inline. |
+| critic | R3-NEW-N1 MAX_NLI_PREMISE_CHARS 한국어 token ratio | §3 PR-9c-2 — 4 char ≈ 1 token (EN BPE), 한국어 SentencePiece 1-2 char/token. tokenizer OnlyFirst backup 명시 + dogfood S10 (KR) 측정 후 v0.18.1 token-count 기반 budget 갱신. |
+| critic | R3-NEW-N2 LLM 모델 환경 모순 | §5 step 8 + release notes — RAM peak 의 모델 명시 (gemma3:4b 기준 ~5-6 GB, 9B+ 모델 *추정* ~10 GB / 16 GB 경계). |
+| critic | R3-NEW-N3 prereq scope | §3 PR-9d — "Pre-run prereq (manual + subagent 양쪽)" wording 갱신. |
+
+### Round-4 issues (post-spec-v4 review)
+
+| reviewer | round-4 issue | round-5 resolution |
+|---|---|---|
+| critic | R4-NEW-M1 §3 PR-9d 표 vs §7 표 latency contradiction (S7) | §3 PR-9d — Expected 표 전체 제거, "§7 verification plan 표 단일 source of truth" cross-ref 로 대체. duplication 회피. |
+| critic | R4-NEW-N1 §3 PR-9d 표 format inconsistency | R4-NEW-M1 와 동시 closure (cross-ref 가 양쪽 해결). |
+| critic | R4-NEW-NIT-1 9B+ 모델 naming | §5 step 8 — "9B+ 모델" → "8B+ Q4 모델 (gemma4:e4b 8B / gemma2:9b 등)" |
+| critic | R4-NEW-NIT-2 §6 default 0.5 wording | §6 — "kebab default 0.5" → "kebab 권장 enable 값 0.5 (config default 는 0.0 disabled, §2.6 참조)" |
-- 
2.49.1


From 1eb0bbecb3832e0cb95e273c39c47cb2ba0d8f4c Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Mon, 25 May 2026 21:22:38 +0000
Subject: [PATCH 2/3] =?UTF-8?q?feat(nli):=20fb-41=20PR-9a=20=E2=80=94=20ke?=
 =?UTF-8?q?bab-nli=20crate=20skeleton=20+=20workspace=20deps?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 신규 crate kebab-nli (trait + impl 동일 crate, v0.18 scope = ONNX adapter 1개).
- NliVerifier trait + NliScores struct (XNLI 3-channel: entailment/neutral/contradiction).
- private softmax3 (log-sum-exp 안전).
- OnnxNliVerifier placeholder (PR-9b 가 ONNX inference + model download 추가).
- workspace.dependencies 추가: ort 2.0-rc.9, tokenizers 0.21 (default-features=false, onig), hf-hub 0.4, ndarray 0.16.

Pre-flight (PR-9 design contract 의 gate):
- HF Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 model.onnx + tokenizer.json → HTTP/2 302 (HF S3 routing, file 존재).
- tokenizers --no-default-features -F onig 의 standalone repro: SentencePiece mDeBERTa tokenizer.json 로드 OK (KR 9 tokens / EN 11 tokens 정상 encode).
- Cargo features 결정 trace: tokenizers = { default-features = false, features = ["onig"] } lock.

Tests: 6 unit (softmax3 정규화 + 불변성 + XNLI logits 변환 + faithfulness + new + score stub) — 통과.
Verification: cargo test -p kebab-nli -j 1 (6/6) + cargo clippy -p kebab-nli --all-targets -j 1 -- -D warnings clean.
Workspace: cargo test --workspace -j 1 — pre-existing kebab-mcp::tools_call_ask_multi_hop 1 fail (main baseline 동일 fail, PR-9a 무관 — ingest fixture/Ollama 의존 flaky).

Wire 영향: 없음 (crate 도입만).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                   |  10 +++
 Cargo.toml                   |  14 +++++
 crates/kebab-nli/Cargo.toml  |  20 ++++++
 crates/kebab-nli/src/lib.rs  | 114 +++++++++++++++++++++++++++++++++++
 crates/kebab-nli/src/onnx.rs |  58 ++++++++++++++++++
 5 files changed, 216 insertions(+)
 create mode 100644 crates/kebab-nli/Cargo.toml
 create mode 100644 crates/kebab-nli/src/lib.rs
 create mode 100644 crates/kebab-nli/src/onnx.rs

diff --git a/Cargo.lock b/Cargo.lock
index 6d6b211..91c7e4a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4327,6 +4327,16 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "kebab-nli"
+version = "0.17.2"
+dependencies = [
+ "anyhow",
+ "kebab-config",
+ "serde",
+ "tempfile",
+]
+
 [[package]]
 name = "kebab-normalize"
 version = "0.17.2"
diff --git a/Cargo.toml b/Cargo.toml
index d2d3d01..0d332f5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ members = [
     "crates/kebab-tui",
     "crates/kebab-mcp",
     "crates/kebab-parse-code",
+    "crates/kebab-nli",
 ]
 
 [workspace.package]
@@ -102,6 +103,19 @@ tree-sitter-kotlin-ng  = "1.1.0"   # bare tree-sitter-kotlin requires ts <0.23;
 # C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
 tree-sitter-c         = "0.24.2"
 tree-sitter-cpp       = "0.23.4"
+# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
+# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
+# stays single-versioned across the workspace. ort `default-features=false`
+# drops the bundled binary downloader (fastembed already provides one);
+# tokenizers `default-features=false, onig` drops the default C++-backed
+# `esaxx_fast` feature and keeps only the `onig` regex backend, so the build
+# doesn't need libstdc++ (PR-9a pre-flight: tokenizer.json loads + KR/EN encode).
+# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
+# pure-Rust TLS stack.
+ort          = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
+tokenizers   = { version = "0.21", default-features = false, features = ["onig"] }
+hf-hub       = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
+ndarray      = "0.16"
 
 # Disk-footprint trim for dev / test builds. Codegen, opt-level, and
 # behavior are unchanged — only DWARF debug info is reduced (line
diff --git a/crates/kebab-nli/Cargo.toml b/crates/kebab-nli/Cargo.toml
new file mode 100644
index 0000000..7336062
--- /dev/null
+++ b/crates/kebab-nli/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "kebab-nli"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "fb-41: NLI-based post-synthesis verification (XNLI mDeBERTa-v3). PR-9a = trait + scaffolding; ONNX inference lands in PR-9b."
+
+[dependencies]
+# PR-9a scope: kebab-config for the OnnxNliVerifier::new(&Config) signature
+# the rag crate will call once PR-9c-2 wires verification into ask_multi_hop.
+# ort / tokenizers / hf-hub / ndarray are intentionally NOT depended on here
+# — they sit in workspace.dependencies until PR-9b adds the real adapter.
+kebab-config = { path = "../kebab-config" }
+anyhow       = { workspace = true }
+serde        = { workspace = true }
+
+[dev-dependencies]
+tempfile     = { workspace = true }
diff --git a/crates/kebab-nli/src/lib.rs b/crates/kebab-nli/src/lib.rs
new file mode 100644
index 0000000..6bc4b3b
--- /dev/null
+++ b/crates/kebab-nli/src/lib.rs
@@ -0,0 +1,114 @@
+//! `kebab-nli` — NLI-based post-synthesis verification for multi-hop RAG.
+//!
+//! fb-41 introduces a mDeBERTa-v3 XNLI verifier that runs on
+//! `(packed_chunks, generated_answer)` after synthesize. If
+//! `NliScores::faithfulness()` < threshold the rag crate refuses the answer
+//! with `NliVerificationFailed`. PR-9a (this file) is the trait surface +
+//! scaffolding only — `OnnxNliVerifier::score` returns a stub error until
+//! PR-9b adds the real ONNX inference path.
+
+use serde::{Deserialize, Serialize};
+
+pub mod onnx;
+
+pub use onnx::OnnxNliVerifier;
+
+/// Three-channel XNLI output. Channel order matches the standard XNLI
+/// `id2label` mapping `[entailment, neutral, contradiction]` shipped with
+/// the Xenova mDeBERTa-v3 model.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct NliScores {
+    pub entailment: f32,
+    pub neutral: f32,
+    pub contradiction: f32,
+}
+
+impl NliScores {
+    /// Faithfulness score = entailment channel. The rag crate compares this
+    /// against `rag.nli_faithfulness_min` to decide whether to refuse.
+    pub fn faithfulness(&self) -> f32 {
+        self.entailment
+    }
+
+    /// Wrap raw XNLI logits (`[entailment, neutral, contradiction]`) into
+    /// a normalised `NliScores`. Applies a numerically-stable softmax3.
+    pub fn from_xnli_logits(logits: [f32; 3]) -> Self {
+        let probs = softmax3(logits);
+        Self {
+            entailment: probs[0],
+            neutral: probs[1],
+            contradiction: probs[2],
+        }
+    }
+}
+
+/// Abstract NLI verifier. `score` is called with `(premise = packed chunks,
+/// hypothesis = generated answer)` — the standard NLI direction (premise
+/// entails hypothesis ⇒ answer is grounded in retrieved evidence).
+pub trait NliVerifier: Send + Sync {
+    fn score(&self, premise: &str, hypothesis: &str) -> anyhow::Result<NliScores>;
+}
+
+/// Numerically stable 3-way softmax (subtract max for log-sum-exp safety).
+/// Private — call sites should go through `NliScores::from_xnli_logits`.
+fn softmax3(logits: [f32; 3]) -> [f32; 3] {
+    let max = logits[0].max(logits[1]).max(logits[2]);
+    let e0 = (logits[0] - max).exp();
+    let e1 = (logits[1] - max).exp();
+    let e2 = (logits[2] - max).exp();
+    let sum = e0 + e1 + e2;
+    [e0 / sum, e1 / sum, e2 / sum]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
+        (a - b).abs() <= eps
+    }
+
+    #[test]
+    fn softmax3_normalises_to_unit() {
+        let p = softmax3([1.0, 2.0, 3.0]);
+        assert!(p.iter().all(|x| *x > 0.0));
+        assert!(approx_eq(p[0] + p[1] + p[2], 1.0, 1e-6));
+        // Monotonic: larger logit ⇒ larger probability.
+        assert!(p[0] < p[1] && p[1] < p[2]);
+    }
+
+    #[test]
+    fn softmax3_is_invariant_to_constant_shift() {
+        let a = softmax3([1.0, 2.0, 3.0]);
+        let b = softmax3([101.0, 102.0, 103.0]);
+        for i in 0..3 {
+            assert!(
+                approx_eq(a[i], b[i], 1e-6),
+                "channel {i} drifted: a={a:?} b={b:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn nli_scores_from_xnli_logits_orders_correctly() {
+        // entailment dominates ⇒ entailment is the max probability channel.
+        let s = NliScores::from_xnli_logits([5.0, 1.0, 0.5]);
+        assert!(s.entailment > s.neutral);
+        assert!(s.entailment > s.contradiction);
+        assert!(approx_eq(
+            s.entailment + s.neutral + s.contradiction,
+            1.0,
+            1e-6
+        ));
+    }
+
+    #[test]
+    fn faithfulness_returns_entailment_channel() {
+        let s = NliScores {
+            entailment: 0.7,
+            neutral: 0.2,
+            contradiction: 0.1,
+        };
+        assert!(approx_eq(s.faithfulness(), 0.7, f32::EPSILON));
+    }
+}
diff --git a/crates/kebab-nli/src/onnx.rs b/crates/kebab-nli/src/onnx.rs
new file mode 100644
index 0000000..da1f025
--- /dev/null
+++ b/crates/kebab-nli/src/onnx.rs
@@ -0,0 +1,58 @@
+//! ONNX-backed `NliVerifier` adapter (mDeBERTa-v3 XNLI).
+//!
+//! PR-9a: scaffolding only. `new` succeeds against the default `Config`
+//! and `score` returns an explicit `"PR-9a stub"` error so any caller that
+//! wires this up before PR-9b lands gets a loud failure instead of silent
+//! all-zero scores. PR-9b will add ort `Session` + `Tokenizer` lazy init
+//! and real inference.
+
+use crate::{NliScores, NliVerifier};
+
+/// ONNX-runtime mDeBERTa-v3 XNLI verifier.
+///
+/// PR-9a scaffolding holds no state — fields land in PR-9b
+/// (`model_id`, `model_dir`, `session: OnceLock<ort::Session>`,
+/// `tokenizer: OnceLock<tokenizers::Tokenizer>`).
+pub struct OnnxNliVerifier {
+    _private: (),
+}
+
+impl OnnxNliVerifier {
+    /// Construct a verifier from the user's `Config`. PR-9a always returns
+    /// `Ok` because the real model + tokenizer download is deferred to
+    /// PR-9b's first `score` call.
+    pub fn new(_config: &kebab_config::Config) -> anyhow::Result<Self> {
+        Ok(Self { _private: () })
+    }
+}
+
+impl NliVerifier for OnnxNliVerifier {
+    fn score(&self, _premise: &str, _hypothesis: &str) -> anyhow::Result<NliScores> {
+        anyhow::bail!("PR-9a stub — ONNX inference lands in PR-9b")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kebab_config::Config;
+
+    #[test]
+    fn new_succeeds_on_default_config() {
+        let cfg = Config::defaults();
+        let v = OnnxNliVerifier::new(&cfg).expect("new should succeed on default config");
+        // Silence unused-binding lint without weakening the assertion.
+        let _ = &v;
+    }
+
+    #[test]
+    fn score_returns_err_in_skeleton() {
+        let cfg = Config::defaults();
+        let v = OnnxNliVerifier::new(&cfg).unwrap();
+        let err = v.score("a", "b").expect_err("PR-9a stub must error");
+        assert!(
+            err.to_string().contains("PR-9a stub"),
+            "unexpected error message: {err}"
+        );
+    }
+}
-- 
2.49.1


From 1d88dccf8ab3b86f3f3bffdb88be6dfad8dee402 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Mon, 25 May 2026 21:25:44 +0000
Subject: [PATCH 3/3] =?UTF-8?q?chore(nli):=20PR=20#176=20=ED=9A=8C?=
 =?UTF-8?q?=EC=B0=A8=201=20=EB=A6=AC=EB=B7=B0=20=EB=B0=98=EC=98=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- lib.rs::NliScores::faithfulness doc 의 `rag.nli_faithfulness_min` → `rag.nli_threshold` (spec §2.5/§2.6 의 실 config knob 이름 정합).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kebab-nli/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/kebab-nli/src/lib.rs b/crates/kebab-nli/src/lib.rs
index 6bc4b3b..7a2b43a 100644
--- a/crates/kebab-nli/src/lib.rs
+++ b/crates/kebab-nli/src/lib.rs
@@ -25,7 +25,7 @@ pub struct NliScores {
 
 impl NliScores {
     /// Faithfulness score = entailment channel. The rag crate compares this
-    /// against `rag.nli_faithfulness_min` to decide whether to refuse.
+    /// against `rag.nli_threshold` to decide whether to refuse.
     pub fn faithfulness(&self) -> f32 {
         self.entailment
     }
-- 
2.49.1