kebab/crates/kebab-rag/tests/multi_hop.rs

//! p9-fb-41 PR-3b-ii: integration tests for the dynamic multi-hop
//! decide loop in [`RagPipeline::ask_multi_hop`].
//!
//! Each test uses [`ScriptedLm`] to drive a different LLM response
//! per call (decompose → 0..N decide → synthesize) and, where the
//! scenario requires, [`ScriptedRetriever`] to drive different hit
//! lists per retrieval round. The test fixture stays mock-only —
//! no Ollama / fastembed / LanceDB.
//!
//! Coverage:
//!
//! 1. `decide_stop_triggers_synthesize` — decide returns `[]`,
//!    pipeline transitions straight to synthesize.
//! 2. `decide_continue_adds_more_chunks` — decide returns
//!    `["q2"]`, iter 2 retrieves and grows the pool.
//! 3. `max_depth_force_stops` — `multi_hop_max_depth = 1` forces
//!    `forced_stop = true` on the depth-1 decide hop and skips the
//!    decide LLM call.
//! 4. `pool_chunks_dedup_by_chunk_id` — two sub-queries return the
//!    same chunk; pool dedups by `chunk_id`.
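//! 5. `decide_parse_failure_falls_through_to_synthesize` — decide
//!    LLM emits non-JSON garbage; pipeline graceful-degrades to
//!    synthesize (NOT a refusal).
//! 6. `refuse_no_chunks_preserves_hops_trace` — probe passes but the
//!    sub-query retrieve is empty; the `NoChunks` refusal keeps the
//!    partial hop trace.
//! 7. `refuse_score_gate_preserves_hops_trace` — probe passes but the
//!    sub-query retrieve only yields a below-gate hit; the `ScoreGate`
//!    refusal keeps the partial hop trace.
//!
//! Further down, separate sections cover the pre-decompose score-gate
//! probe and the step 8.5 NLI verification hook; each section header
//! describes its own cases.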

mod common;

use std::sync::Arc;

use common::{MockNliVerifier, RagEnv, ScriptedLm, ScriptedRetriever, id32, mk_hit};
use kebab_core::{HopKind, LanguageModel, RefusalReason, Retriever, SearchMode};
use kebab_nli::NliVerifier;
use kebab_rag::{AskOpts, RagPipeline, truncate_for_nli};

/// Builds the `AskOpts` shared by every multi-hop test in this file.
///
/// Sampling is pinned (temperature `0.0`, seed `0`) so runs are
/// deterministic, retrieval uses `SearchMode::Lexical` so the test
/// crate doesn't need to wire up an embedder, and `multi_hop: true`
/// routes the call through `ask_multi_hop`. No stream sink and no
/// conversation state are attached.
fn multi_hop_opts() -> AskOpts {
    AskOpts {
        // Routing + retrieval.
        multi_hop: true,
        mode: SearchMode::Lexical,
        k: 5,
        explain: false,
        // Deterministic sampling.
        temperature: Some(0.0),
        seed: Some(0),
        // No streaming, no conversation context.
        stream_sink: None,
        history: vec![],
        conversation_id: None,
        turn_index: None,
    }
}

// ── 1. decide returns [] → synthesize immediately ─────────────────────────

#[test]
fn multi_hop_decide_stop_triggers_synthesize() {
    // Corpus: one chunk that both retrieval rounds surface.
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    // Retriever script (PR-7 layout): round 0 serves the pre-decompose
    // score-gate probe, round 1 serves the retrieve for sub-query "q1".
    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"])];
    let sub_query_round = probe_round.clone();
    let scripted_retriever =
        Arc::new(ScriptedRetriever::new(vec![probe_round, sub_query_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // LLM script, consumed in call order.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1"]"#,        // decompose
        r"[]",              // decide → stop
        "answer body [#1]", // synthesize
    ]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    // Outcome + call accounting.
    assert!(answer.grounded, "decide-stop synthesize must be grounded");
    assert_eq!(answer.refusal_reason, None);
    assert_eq!(lm_spy.calls(), 3, "decompose + decide + synthesize = 3 LLM calls");
    assert_eq!(retriever_spy.calls(), 2, "probe retrieve + 1 sub-query retrieve = 2");

    // Hop trace shape.
    let hops = answer.hops.expect("multi-hop happy path stamps Some(hops)");
    assert_eq!(hops.len(), 3, "[Decompose, Decide(stop), Synthesize]");
    let (decompose, decide, synthesize) = (&hops[0], &hops[1], &hops[2]);
    assert_eq!(decompose.kind, HopKind::Decompose);
    assert_eq!(decompose.sub_queries, vec!["q1"]);
    assert_eq!(decide.kind, HopKind::Decide);
    assert!(
        decide.sub_queries.is_empty(),
        "decide stop signal → empty sub_queries on the HopRecord"
    );
    assert!(
        !decide.forced_stop,
        "LLM stop signal is NOT a forced_stop (forced_stop = cap-driven only)"
    );
    assert_eq!(synthesize.kind, HopKind::Synthesize);
}

// ── 2. decide ["q2"] → iter 2 retrieves and grows the pool ────────────────

#[test]
fn multi_hop_decide_continue_adds_more_chunks() {
    // Corpus: two distinct chunks in two distinct documents.
    let env = RagEnv::new();
    let (first_chunk, first_doc) = (id32("c1"), id32("d1"));
    let (second_chunk, second_doc) = (id32("c2"), id32("d2"));
    env.seed_chunk(&first_chunk, &first_doc, "notes/a.md", "Chunk one.", &["A"]);
    env.seed_chunk(&second_chunk, &second_doc, "notes/b.md", "Chunk two.", &["B"]);

    // Retriever script (PR-7 layout): round 0 = probe (above gate),
    // round 1 = iter 1 surfaces chunk 1, round 2 = iter 2 surfaces chunk 2.
    let first_hit = || mk_hit(1, &first_chunk, &first_doc, "notes/a.md", 0.85, &["A"]);
    let probe_round = vec![first_hit()];
    let iter_one_round = vec![first_hit()];
    let iter_two_round = vec![mk_hit(1, &second_chunk, &second_doc, "notes/b.md", 0.80, &["B"])];
    let scripted_retriever = Arc::new(ScriptedRetriever::new(vec![
        probe_round,
        iter_one_round,
        iter_two_round,
    ]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // LLM script, consumed in call order.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1"]"#,             // decompose
        r#"["q2"]"#,             // decide #1 → continue
        r"[]",                   // decide #2 → stop
        "synthesized [#1] [#2]", // synthesize
    ]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    // Outcome + call accounting.
    assert!(answer.grounded);
    assert_eq!(answer.refusal_reason, None);
    assert_eq!(lm_spy.calls(), 4, "decompose + 2 decide + synthesize = 4 LLM calls");
    assert_eq!(
        retriever_spy.calls(),
        3,
        "probe + iter 1 retrieves q1 + iter 2 retrieves q2"
    );
    assert_eq!(
        answer.retrieval.chunks_returned, 2,
        "pool accumulates one new chunk per iter"
    );

    // Hop trace shape.
    let hops = answer.hops.expect("happy path stamps hops");
    assert_eq!(
        hops.len(),
        4,
        "[Decompose, Decide(continue), Decide(stop), Synthesize]"
    );
    let (decompose, decide_continue) = (&hops[0], &hops[1]);
    let (decide_stop, synthesize) = (&hops[2], &hops[3]);
    assert_eq!(decompose.kind, HopKind::Decompose);
    assert_eq!(decide_continue.kind, HopKind::Decide);
    assert_eq!(decide_continue.sub_queries, vec!["q2"], "iter 1 decide emits q2");
    assert_eq!(
        decide_continue.context_chunks_added, 1,
        "iter 1 retrieve added chunk 1"
    );
    assert_eq!(decide_stop.kind, HopKind::Decide);
    assert!(decide_stop.sub_queries.is_empty(), "iter 2 decide signals stop");
    assert_eq!(
        decide_stop.context_chunks_added, 1,
        "iter 2 retrieve added chunk 2"
    );
    assert_eq!(synthesize.kind, HopKind::Synthesize);
}

// ── 3. max_depth=1 → forced_stop, decide LLM call skipped ─────────────────

#[test]
fn multi_hop_max_depth_force_stops() {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    // With a depth cap of 1, iter 1 is already the final iteration:
    // the per-iter `depth_force_stop = iter >= max_depth` check fires
    // and the decide LLM call is skipped entirely.
    let mut capped_cfg = env.config.clone();
    capped_cfg.rag.multi_hop_max_depth = 1;

    // Retriever script (PR-7 layout): round 0 = probe, round 1 =
    // decompose-driven retrieve.
    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"])];
    let sub_query_round = probe_round.clone();
    let scripted_retriever =
        Arc::new(ScriptedRetriever::new(vec![probe_round, sub_query_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // The script holds exactly two replies (decompose, synthesize).
    // Should the pipeline wrongly invoke decide, ScriptedLm runs out
    // of replies and panics, failing the test loudly with the call
    // index.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, "answer [#1]"]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline = RagPipeline::new(capped_cfg, retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(answer.grounded);
    assert_eq!(
        lm_spy.calls(),
        2,
        "depth-cap skips decide → only decompose + synthesize"
    );
    assert_eq!(retriever_spy.calls(), 2, "probe + 1 decompose retrieve");

    let hops = answer.hops.expect("happy path stamps hops");
    assert_eq!(hops.len(), 3, "[Decompose, Decide(forced_stop), Synthesize]");
    let forced_decide = &hops[1];
    assert_eq!(forced_decide.kind, HopKind::Decide);
    assert!(
        forced_decide.forced_stop,
        "depth cap must surface forced_stop=true on the Decide hop"
    );
    assert!(
        forced_decide.sub_queries.is_empty(),
        "skipped decide carries no sub_queries"
    );
    assert_eq!(
        forced_decide.llm_call_ms, 0,
        "skipped decide records 0ms — no LLM call happened"
    );
}

// ── 4. dedup: two sub-queries hit same chunk_id, pool keeps 1 ─────────────

#[test]
fn multi_hop_pool_chunks_dedup_by_chunk_id() {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Shared chunk text.", &["X"]);

    // Every retrieval round (round 0 = PR-7 probe, rounds 1-2 = the
    // two sub-queries) yields the very same chunk_id, so the pool
    // must end up holding exactly one entry after dedup.
    let shared_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["X"])];
    let scripted_retriever = Arc::new(ScriptedRetriever::new(vec![shared_round; 3]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // LLM script, consumed in call order.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1", "q2"]"#,    // decompose → two sub-queries
        r"[]",                // decide → stop
        "merged answer [#1]", // synthesize
    ]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(answer.grounded);
    assert_eq!(retriever_spy.calls(), 3, "probe + two sub-query retrieves");
    assert_eq!(
        answer.retrieval.chunks_returned, 1,
        "dedup by chunk_id keeps pool at 1"
    );
    assert_eq!(answer.citations.len(), 1, "only one chunk cited as [#1]");
    assert_eq!(answer.citations[0].marker.as_deref(), Some("[1]"));
    assert_eq!(lm_spy.calls(), 3, "decompose + decide + synthesize = 3");

    let hops = answer.hops.expect("happy path stamps hops");
    assert_eq!(hops.len(), 3, "[Decompose, Decide, Synthesize]");
    let (decompose, decide) = (&hops[0], &hops[1]);
    assert_eq!(decompose.sub_queries, vec!["q1", "q2"]);
    assert_eq!(
        decide.context_chunks_added, 1,
        "dedup reduces 2 retrievals → 1 new pool entry"
    );
}

// ── 5. decide parse failure → graceful synthesize (NOT a refusal) ─────────

#[test]
fn multi_hop_decide_parse_failure_falls_through_to_synthesize() {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    // Retriever script (PR-7 layout): round 0 = probe, round 1 =
    // decompose-driven retrieve.
    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"])];
    let sub_query_round = probe_round.clone();
    let retriever_dyn: Arc<dyn Retriever> =
        Arc::new(ScriptedRetriever::new(vec![probe_round, sub_query_round]));

    // The decide reply is deliberately not JSON. Per spec §9 that is
    // NOT a refusal: the pipeline degrades gracefully and synthesizes
    // as though decide had answered `[]`. A parse failure is only a
    // refusal (MultiHopDecomposeFailed) on the *initial* decompose.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1"]"#,                   // decompose
        "definitely not a JSON array", // decide (garbage)
        "answer [#1]",                 // synthesize
    ]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(answer.grounded, "decide parse failure must NOT block synthesis");
    assert_eq!(
        answer.refusal_reason, None,
        "decide parse failure is graceful degrade, not refusal — \
         MultiHopDecomposeFailed is reserved for the initial decompose hop"
    );
    assert_eq!(lm_spy.calls(), 3, "decompose + (garbage) decide + synthesize");

    let hops = answer.hops.expect("happy path stamps hops");
    assert_eq!(
        hops.len(),
        3,
        "[Decompose, Decide(parse-fail→stop), Synthesize]"
    );
    let degraded_decide = &hops[1];
    assert_eq!(degraded_decide.kind, HopKind::Decide);
    assert!(
        degraded_decide.sub_queries.is_empty(),
        "parse failure → empty sub_queries (same shape as LLM stop)"
    );
    assert!(
        !degraded_decide.forced_stop,
        "parse-degraded decide is not a cap-driven forced_stop — \
         flag stays false even though we synthesize early"
    );
}

// ── 6. refuse path: NoChunks preserves partial hop trace ──────────────────
//
// PR-3b-ii widens `refuse_no_chunks` to accept `hops:
// Option<Vec<HopRecord>>` and wires `ask_multi_hop` to forward the
// partial trace. PR-7 added a pre-decompose probe — so this test
// now exercises the *decompose-driven* empty-pool path: probe
// passes (KB has at least one relevant chunk), decompose emits
// sub-queries, but the sub-query retrieve hits nothing → pool stays
// empty → refuse_no_chunks with the partial hop trace preserved.
// (For the *probe-driven* refusal, see
// `multi_hop_empty_probe_pool_refuses_before_any_llm_call` —
// that path returns hops=None because decompose never ran.)

#[test]
fn multi_hop_refuse_no_chunks_preserves_hops_trace() {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    // Retriever script (PR-7 layout): round 0 = probe, which clears
    // the gate; round 1 = decompose-driven retrieve, which comes back
    // empty because the sub-query matched nothing.
    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"])];
    let empty_round = vec![];
    let scripted_retriever =
        Arc::new(ScriptedRetriever::new(vec![probe_round, empty_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // A single scripted reply (decompose) is all that is needed:
    // decide is bypassed since `pool.is_empty()` takes the
    // (Vec::new(), 0) shortcut. Any further LM call would exhaust the
    // script, making ScriptedLm panic and the test fail loudly.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(answer.refusal_reason, Some(RefusalReason::NoChunks));
    assert_eq!(
        retriever_spy.calls(),
        2,
        "probe (passes) + 1 decompose-driven retrieve (empty)"
    );
    assert_eq!(
        lm_spy.calls(),
        1,
        "decompose only — decide skipped (empty pool), no synthesize"
    );

    let hops = answer
        .hops
        .expect("PR-3b-ii: refuse_no_chunks must preserve the partial hop trace");
    assert_eq!(
        hops.len(),
        2,
        "[Decompose, Decide(empty_pool_skip)] — synthesize never ran"
    );
    let (decompose, skipped_decide) = (&hops[0], &hops[1]);
    assert_eq!(decompose.kind, HopKind::Decompose);
    assert_eq!(decompose.sub_queries, vec!["q1"]);
    assert_eq!(skipped_decide.kind, HopKind::Decide);
    assert!(skipped_decide.sub_queries.is_empty());
    assert_eq!(
        skipped_decide.context_chunks_added, 0,
        "retrieve returned 0 hits → 0 added to pool"
    );
}

// ── 7. refuse path: ScoreGate preserves partial hop trace ─────────────────

#[test]
fn multi_hop_refuse_score_gate_preserves_hops_trace() {
    // Since PR-7 introduced the pre-decompose probe gate, decompose
    // only runs when the *probe* clears the gate (a high-score
    // chunk). The *decompose-driven* retrieve may still come back
    // with a below-gate hit, which trips the post-pool gate refusal —
    // and that refusal is the surface that keeps the hop trace.
    //
    // The *probe-driven* gate refusal (the single-pass-equivalent
    // safety floor) is covered by
    // `multi_hop_below_probe_gate_refuses_before_any_llm_call`; it
    // yields hops=None because decompose never ran.
    let env = RagEnv::new();
    let (low_chunk, low_doc) = seed_low_score_chunk(&env);
    let (high_chunk, high_doc) = (id32("c_high"), id32("d_high"));
    env.seed_chunk(
        &high_chunk,
        &high_doc,
        "notes/high.md",
        "high score body",
        &["High"],
    );

    let probe_round = vec![mk_hit(
        1,
        &high_chunk,
        &high_doc,
        "notes/high.md",
        0.85,
        &["High"],
    )];
    let below_gate_round = vec![mk_hit(1, &low_chunk, &low_doc, "notes/low.md", 0.10, &["Low"])];
    let retriever_dyn: Arc<dyn Retriever> =
        Arc::new(ScriptedRetriever::new(vec![probe_round, below_gate_round]));

    // Two scripted replies: decompose, then decide (it fires because
    // the pool is non-empty). Synthesize is never reached — the
    // refusal happens before pack_context.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, r"[]"]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(answer.refusal_reason, Some(RefusalReason::ScoreGate));
    assert_eq!(
        lm_spy.calls(),
        2,
        "decompose + decide ran; synthesize skipped by gate"
    );

    let hops = answer
        .hops
        .expect("PR-3b-ii: refuse_score_gate must preserve the partial hop trace");
    assert_eq!(hops.len(), 2, "[Decompose, Decide(stop)] — synthesize never ran");
    let (decompose, decide) = (&hops[0], &hops[1]);
    assert_eq!(decompose.kind, HopKind::Decompose);
    assert_eq!(decide.kind, HopKind::Decide);
    assert_eq!(
        decide.context_chunks_added, 1,
        "the low-score chunk did enter the pool — gate fires after pool build"
    );
}

/// Seeds the `notes/low.md` chunk into `env` and hands back its
/// `(chunk_id, doc_id)` pair.
///
/// Used by the score-gate test to keep its body focused on hop-trace
/// assertions. Both ids are returned together so the caller never has
/// to re-derive `id32("d_low")` itself — the pair stays the single
/// source of truth.
fn seed_low_score_chunk(env: &RagEnv) -> (String, String) {
    let ids = (id32("c_low"), id32("d_low"));
    env.seed_chunk(&ids.0, &ids.1, "notes/low.md", "low score text", &["Low"]);
    ids
}

// ── p9-fb-41 v0.18 dogfood fix: pre-decompose score-gate probe ────────────
//
// Out-of-corpus query that single-pass would have refused via
// score-gate must also refuse on the multi-hop path — *before* any
// decompose / decide / synthesize LLM call. Otherwise the decompose
// can emit sub-queries that pull in chunks loosely matching each
// sub-query, fill the pool past the gate, and let the synthesize
// hallucinate over chunks that were never relevant to the *original*
// query. Dogfood S7 (`/build/cache/dogfood-v018/results/SUMMARY.md`)
// is the symptom; these tests pin the fix.

#[test]
fn multi_hop_below_probe_gate_refuses_before_any_llm_call() {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c_low"), id32("d_low"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/low.md", "low score body", &["Low"]);

    // The probe sees one hit, scored well under the default 0.30 gate.
    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/low.md", 0.05, &["Low"])];
    let scripted_retriever = Arc::new(ScriptedRetriever::new(vec![probe_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // The LM script is empty, so ANY LLM call exhausts it and panics.
    // The pipeline has to bail out before decompose for this to pass.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("out-of-corpus query", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(answer.refusal_reason, Some(RefusalReason::ScoreGate));
    assert_eq!(
        lm_spy.calls(),
        0,
        "below-gate must short-circuit BEFORE any LLM call (no decompose, decide, or synthesize)"
    );
    assert_eq!(
        retriever_spy.calls(),
        1,
        "only the probe retrieve happened — no decompose-driven retrieves"
    );
    // Dogfood S7: before the fix, the multi-hop path answered this
    // kind of query with grounded=true and hallucinated content. The
    // assertion below pins the safe envelope.
    assert!(
        answer.hops.is_none(),
        "pre-decompose refusal carries no hop trace (decompose never ran)"
    );
}

#[test]
fn multi_hop_empty_probe_pool_refuses_before_any_llm_call() {
    let env = RagEnv::new();

    // The only scripted retrieval round is empty: the probe finds
    // nothing at all.
    let empty_probe_round = vec![];
    let scripted_retriever = Arc::new(ScriptedRetriever::new(vec![empty_probe_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // No scripted LM replies — the test asserts zero LLM calls below.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("q", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(answer.refusal_reason, Some(RefusalReason::NoChunks));
    assert_eq!(
        lm_spy.calls(),
        0,
        "empty probe must short-circuit BEFORE any LLM call"
    );
    assert_eq!(
        retriever_spy.calls(),
        1,
        "only the probe retrieve happened — no decompose retrieves"
    );
    assert!(answer.hops.is_none());
}

#[test]
fn multi_hop_above_probe_gate_proceeds_to_decompose() {
    // Positive counterpart to the probe-refusal tests: when the query
    // clears the probe gate, the complete multi-hop flow (decompose →
    // decide → synth) must still run. This protects valid multi-hop
    // calls from being short-circuited by the probe fix.
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    // Round 0 (probe) is a high-score hit above the gate; round 1
    // (decompose-driven retrieve) surfaces that same chunk again.
    let high_score_hit = || mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"]);
    let probe_round = vec![high_score_hit()];
    let sub_query_round = vec![high_score_hit()];
    let scripted_retriever =
        Arc::new(ScriptedRetriever::new(vec![probe_round, sub_query_round]));
    let retriever_spy = Arc::clone(&scripted_retriever);
    let retriever_dyn: Arc<dyn Retriever> = scripted_retriever;

    // LLM script, consumed in call order.
    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1"]"#,   // decompose
        r"[]",         // decide → stop
        "answer [#1]", // synthesize
    ]));
    let lm_spy = Arc::clone(&scripted_lm);
    let lm_dyn: Arc<dyn LanguageModel> = scripted_lm;

    let pipeline =
        RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone());
    let opts = multi_hop_opts();
    let answer = pipeline.ask("valid query", opts).unwrap();

    assert!(answer.grounded);
    assert_eq!(answer.refusal_reason, None);
    assert_eq!(lm_spy.calls(), 3, "decompose + decide + synthesize all ran");
    assert_eq!(
        retriever_spy.calls(),
        2,
        "probe retrieve + decompose-driven retrieve"
    );
    let hops = answer.hops.expect("happy path stamps hops");
    assert_eq!(hops.len(), 3);
}

// ── p9-fb-41 PR-9c-2: step 8.5 NLI verification tests ──────────────────────
//
// Five tests pin the NLI hook on the multi-hop path:
// 1. `multi_hop_nli_pass_keeps_grounded` — entailment 0.9 ≥ threshold 0.5 →
//    happy path, `verification.nli_passed = true`.
// 2. `multi_hop_nli_fail_refuses` — entailment 0.1 < threshold 0.5 →
//    refusal with `RefusalReason::NliVerificationFailed` + verification stamp.
// 3. `multi_hop_nli_disabled_skip_verify` — threshold 0.0 → verify skipped,
//    `Answer.verification` stays `None` (no verifier attached).
// 4. `multi_hop_nli_model_unavailable_refuses` — verifier returns `Err` →
//    refusal with `RefusalReason::NliModelUnavailable` + `verification = None`.
// 5. `multi_hop_truncate_for_nli_char_budget` — pure unit test on
//    `truncate_for_nli`'s char-budget contract.

/// Assembles the fixtures for a "valid multi-hop happy-path" run.
///
/// One chunk is seeded, and both scripted retrieval rounds (probe and
/// decompose-driven retrieve) surface it. The LM script holds three
/// replies: a decompose that emits one sub-query, a decide that
/// signals stop, and a synthesize that produces a cited answer.
///
/// Returns the seeded `RagEnv`, the scripted retriever (kept as a
/// concrete `Arc` so callers can assert on its call count), and the
/// scripted LM with its 3-call script loaded.
fn happy_multi_hop_env() -> (RagEnv, Arc<ScriptedRetriever>, Arc<ScriptedLm>) {
    let env = RagEnv::new();
    let (chunk_id, doc_id) = (id32("c1"), id32("d1"));
    env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "Body text.", &["Intro"]);

    let probe_round = vec![mk_hit(1, &chunk_id, &doc_id, "notes/a.md", 0.85, &["Intro"])];
    let sub_query_round = probe_round.clone();
    let scripted_retriever =
        Arc::new(ScriptedRetriever::new(vec![probe_round, sub_query_round]));

    let scripted_lm = Arc::new(ScriptedLm::new(vec![
        r#"["q1"]"#,        // decompose
        r"[]",              // decide → stop
        "answer body [#1]", // synthesize
    ]));

    (env, scripted_retriever, scripted_lm)
}

#[test]
fn multi_hop_nli_pass_keeps_grounded() {
    let (env, retriever, lm) = happy_multi_hop_env();

    // Turn the NLI gate on.
    let mut nli_cfg = env.config.clone();
    nli_cfg.rag.nli_threshold = 0.5;

    let retriever: Arc<dyn Retriever> = retriever;
    let lm: Arc<dyn LanguageModel> = lm;
    let mock_verifier = MockNliVerifier::pass();
    let verifier_spy = mock_verifier.clone();
    let verifier: Arc<dyn NliVerifier> = mock_verifier;

    let pipeline =
        RagPipeline::new(nli_cfg, retriever, lm, env.sqlite.clone()).with_verifier(verifier);
    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    assert!(answer.grounded, "NLI-pass synthesize must stay grounded");
    assert_eq!(answer.refusal_reason, None);
    assert_eq!(
        verifier_spy.calls(),
        1,
        "verifier called exactly once on the synthesized answer"
    );

    let summary = answer
        .verification
        .expect("nli_threshold > 0 stamps Some(verification)");
    assert!(summary.nli_passed, "entailment 0.9 ≥ threshold 0.5");
    assert!(
        (summary.nli_score - 0.9).abs() < 1e-5,
        "got: {}",
        summary.nli_score
    );
    assert!((summary.nli_threshold - 0.5).abs() < 1e-5);
}

#[test]
fn multi_hop_nli_fail_refuses() {
    let (env, retriever, lm) = happy_multi_hop_env();

    // Turn the NLI gate on.
    let mut nli_cfg = env.config.clone();
    nli_cfg.rag.nli_threshold = 0.5;

    let retriever: Arc<dyn Retriever> = retriever;
    let lm: Arc<dyn LanguageModel> = lm;
    let mock_verifier = MockNliVerifier::fail();
    let verifier_spy = mock_verifier.clone();
    let verifier: Arc<dyn NliVerifier> = mock_verifier;

    let pipeline =
        RagPipeline::new(nli_cfg, retriever, lm, env.sqlite.clone()).with_verifier(verifier);
    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(
        answer.refusal_reason,
        Some(RefusalReason::NliVerificationFailed)
    );
    assert_eq!(verifier_spy.calls(), 1);

    // The refusal must still carry the verification summary.
    let summary = answer
        .verification
        .expect("refusal still stamps verification summary");
    assert!(!summary.nli_passed, "entailment 0.1 < threshold 0.5");
    assert!(
        (summary.nli_score - 0.1).abs() < 1e-5,
        "got: {}",
        summary.nli_score
    );
}

#[test]
fn multi_hop_nli_disabled_skip_verify() {
    let (env, retriever, lm) = happy_multi_hop_env();

    // The config is used as-is: its default `nli_threshold = 0.0`
    // means the gate is off. The pipeline gets no verifier, so the
    // hook short-circuits entirely and `Answer.verification` is
    // expected to remain `None`.
    let default_cfg = env.config.clone();
    assert!(
        (default_cfg.rag.nli_threshold - 0.0).abs() < f32::EPSILON,
        "default nli_threshold must be 0.0 (gate disabled)"
    );

    let retriever: Arc<dyn Retriever> = retriever;
    let lm: Arc<dyn LanguageModel> = lm;
    // Deliberately no `with_verifier` — pipeline.verifier stays None.
    let pipeline = RagPipeline::new(default_cfg, retriever, lm, env.sqlite.clone());

    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    assert!(answer.grounded);
    assert_eq!(answer.refusal_reason, None);
    assert!(
        answer.verification.is_none(),
        "threshold = 0.0 must skip step 8.5 and leave verification = None"
    );
}

#[test]
fn multi_hop_nli_model_unavailable_refuses() {
    let (env, retriever, lm) = happy_multi_hop_env();

    // Turn the NLI gate on.
    let mut nli_cfg = env.config.clone();
    nli_cfg.rag.nli_threshold = 0.5;

    let retriever: Arc<dyn Retriever> = retriever;
    let lm: Arc<dyn LanguageModel> = lm;
    // The `err()` mock stands in for a verifier that fails when called.
    let mock_verifier = MockNliVerifier::err();
    let verifier_spy = mock_verifier.clone();
    let verifier: Arc<dyn NliVerifier> = mock_verifier;

    let pipeline =
        RagPipeline::new(nli_cfg, retriever, lm, env.sqlite.clone()).with_verifier(verifier);
    let opts = multi_hop_opts();
    let answer = pipeline.ask("compound", opts).unwrap();

    assert!(!answer.grounded);
    assert_eq!(
        answer.refusal_reason,
        Some(RefusalReason::NliModelUnavailable)
    );
    assert_eq!(
        verifier_spy.calls(),
        1,
        "verifier was invoked once before failing"
    );
    assert!(
        answer.verification.is_none(),
        "NliModelUnavailable: can't summarize a verification that didn't happen"
    );
}

#[test]
fn multi_hop_truncate_for_nli_char_budget() {
    // Expected budget: MAX_NLI_PREMISE_CHARS = 4 * 400 = 1600 chars.
    const BUDGET_CHARS: usize = 1600;
    const OVER_BUDGET_CHARS: usize = 2000;

    // ASCII input over the budget is cut down to exactly the budget.
    let ascii_over = "a".repeat(OVER_BUDGET_CHARS);
    let (ascii_cut, ascii_flag) = truncate_for_nli(&ascii_over);
    assert!(ascii_flag);
    assert_eq!(
        ascii_cut.chars().count(),
        BUDGET_CHARS,
        "premise truncated to MAX_NLI_PREMISE_CHARS"
    );

    // Input under the budget passes through untouched and unflagged.
    let short_premise = "short premise text";
    let (short_out, short_flag) = truncate_for_nli(short_premise);
    assert!(!short_flag);
    assert_eq!(short_out, short_premise);

    // Multi-byte safety: 1600 Korean chars are 3 bytes each in UTF-8
    // (4800 bytes total), yet they sit exactly at the char budget and
    // must not be truncated.
    let korean_at_budget = "가".repeat(BUDGET_CHARS);
    let (korean_out, korean_flag) = truncate_for_nli(&korean_at_budget);
    assert!(!korean_flag, "1600 KR chars == budget, no truncation");
    assert_eq!(korean_out.chars().count(), BUDGET_CHARS);

    // Multi-byte over budget: the cut must be measured in chars, not
    // bytes.
    let korean_over = "가".repeat(OVER_BUDGET_CHARS);
    let (korean_cut, korean_cut_flag) = truncate_for_nli(&korean_over);
    assert!(korean_cut_flag);
    assert_eq!(
        korean_cut.chars().count(),
        BUDGET_CHARS,
        "char-based truncation must not over-cut on multi-byte input"
    );
}