diff --git a/crates/kebab-mcp/src/lib.rs b/crates/kebab-mcp/src/lib.rs index 2475d39..4d11326 100644 --- a/crates/kebab-mcp/src/lib.rs +++ b/crates/kebab-mcp/src/lib.rs @@ -49,7 +49,7 @@ pub fn build_tools_vec() -> Vec<Tool> { ), Tool::new( "ask", - "RAG question answering over the knowledge base. Returns answer.v1 JSON. Pass session_id for multi-turn context.", + "RAG question answering over the knowledge base. Returns answer.v1 JSON. Pass session_id for multi-turn context. Set multi_hop=true for compound / cross-doc questions (decompose → retrieve → synthesize; 2-5× LLM cost; per-hop trace on Answer.hops).", schema_for_type::<AskInput>(), ), Tool::new( diff --git a/crates/kebab-mcp/src/tools/ask.rs b/crates/kebab-mcp/src/tools/ask.rs index 66c18ba..143bbdf 100644 --- a/crates/kebab-mcp/src/tools/ask.rs +++ b/crates/kebab-mcp/src/tools/ask.rs @@ -20,6 +20,15 @@ pub struct AskInput { pub session_id: Option<String>, /// Optional retrieval mode override ("lexical" / "vector" / "hybrid"). Default "hybrid". pub mode: Option<String>, + /// p9-fb-41: opt the ask into the multi-hop pipeline. Default `false`. + /// When `true`, the query is decomposed into sub-questions, each + /// retrieved independently, then synthesized over the merged + /// chunk pool. Cost trade-off: 2–5× LLM calls vs. single-pass. + /// Use for compound questions / cross-doc reasoning / prereq + /// chains; keep `false` for simple fact lookups. The full + /// per-hop trace (`decompose` / `decide` / `synthesize`) is + /// exposed on `Answer.hops`. 
+ pub multi_hop: Option<bool>, } pub fn handle(state: &KebabAppState, input: AskInput) -> CallToolResult { @@ -38,7 +47,7 @@ pub fn handle(state: &KebabAppState, input: AskInput) -> CallToolResult { history: Vec::new(), conversation_id: None, turn_index: None, - multi_hop: false, + multi_hop: input.multi_hop.unwrap_or(false), }; let cfg_clone = (*state.config).clone(); let result = match input.session_id { diff --git a/crates/kebab-mcp/tests/tools_call_ask.rs b/crates/kebab-mcp/tests/tools_call_ask.rs index 09657d1..0a335cb 100644 --- a/crates/kebab-mcp/tests/tools_call_ask.rs +++ b/crates/kebab-mcp/tests/tools_call_ask.rs @@ -55,6 +55,7 @@ async fn ask_tool_returns_answer_v1_with_refusal_on_empty_kb() { // Test env uses provider="none" — Hybrid would hard-error on embedding. // Pass Lexical explicitly so the test stays functional. mode: Some("lexical".to_string()), + multi_hop: None, }, ) }) diff --git a/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs b/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs new file mode 100644 index 0000000..b7cc482 --- /dev/null +++ b/crates/kebab-mcp/tests/tools_call_ask_multi_hop.rs @@ -0,0 +1,144 @@ +//! p9-fb-41 PR-5: MCP `ask` tool with `multi_hop: true` argument. +//! +//! Two Ollama-free pins: +//! +//! 1. `ask_tool_routes_multi_hop_true_to_decompose_first` — multi-hop +//! dispatch differs from single-pass on dispatch shape. Single-pass +//! retrieves *first* (empty KB → `NoChunks` refusal, no LLM call, +//! `grounded=false`). Multi-hop calls *decompose first* (no +//! retrieval yet), so an empty KB + no Ollama yields `error.v1` +//! with `code=model_unreachable` — different wire shape than the +//! refusal envelope. The two surfaces' divergence is the signal +//! that the `multi_hop` arg actually routed the dispatch. +//! 2. `ask_input_schema_advertises_multi_hop_field` — `AskInput`'s +//! `JsonSchema` exposes the new field so MCP host capability +//! discovery (tools/list) renders it for agents. +//! +//! 
A live-Ollama end-to-end multi-hop pin lands in a follow-up +//! `#[ignore]` test (same pattern as `wire_ask_stale.rs`). + +use kebab_config::Config; +use kebab_core::SourceScope; +use kebab_mcp::{KebabAppState, KebabHandler}; +use rmcp::model::RawContent; + +fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { + let mut cfg = Config::defaults(); + cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); + cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); + cfg.workspace.exclude.clear(); + cfg.models.embedding.provider = "none".to_string(); + cfg.models.embedding.dimensions = 0; + // Force the LLM endpoint to a known-unreachable port so this test + // is robust against whether a real Ollama happens to be running + // on 127.0.0.1:11434 (the developer's box; CI; etc.). Combined + // with a tight `request_timeout_secs`, the multi-hop dispatch + // surfaces `model_unreachable` quickly and deterministically. + cfg.models.llm.endpoint = "http://127.0.0.1:1".to_string(); + cfg.models.llm.request_timeout_secs = 2; + cfg +} + +/// The dispatch contract: with an empty KB, single-pass `ask` short- +/// circuits at retrieval (no LLM call) and returns a refusal Answer +/// (`grounded=false`, `isError=false`). Multi-hop calls *decompose +/// first*, so the same empty KB + unreachable LLM yields `error.v1` +/// with `code=model_unreachable` (`isError=true`). The divergence +/// confirms the `multi_hop` arg actually rerouted the dispatch. 
+#[tokio::test] +async fn ask_tool_routes_multi_hop_true_to_decompose_first() { + let dir = tempfile::tempdir().unwrap(); + let data_dir = dir.path().join("data"); + let workspace_root = dir.path().join("notes"); + std::fs::create_dir_all(&data_dir).unwrap(); + std::fs::create_dir_all(&workspace_root).unwrap(); + let cfg = minimal_config(&data_dir, &workspace_root); + + let scope = SourceScope { + root: workspace_root.clone(), + include: vec![], + exclude: vec![], + }; + let _ = kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap(); + + let state = KebabAppState::new(cfg, None); + let handler = KebabHandler::new(state); + + // Multi-hop branch — decompose runs first, hits the unreachable + // endpoint, MCP wraps as error.v1. + let state_mh = handler.state().clone(); + let mh = tokio::task::spawn_blocking(move || { + kebab_mcp::tools::ask::handle( + &state_mh, + kebab_mcp::tools::ask::AskInput { + query: "compound about X and Y".to_string(), + session_id: None, + mode: Some("lexical".to_string()), + multi_hop: Some(true), + }, + ) + }) + .await + .unwrap(); + assert!( + mh.is_error.unwrap_or(false), + "multi_hop=true must reach the LLM (decompose first) — got {mh:?}" + ); + let mh_text = match &mh.content.first().unwrap().raw { + RawContent::Text(t) => t.text.clone(), + other => panic!("expected text, got {other:?}"), + }; + let mh_v: serde_json::Value = serde_json::from_str(&mh_text).unwrap(); + assert_eq!(mh_v["schema_version"], "error.v1"); + assert_eq!( + mh_v["code"], "model_unreachable", + "multi-hop dispatch must hit the LLM and surface model_unreachable; got {mh_v}" + ); + + // Single-pass branch — empty KB short-circuits at retrieve, no LLM + // call happens, refusal Answer comes back as isError=false. 
+ let state_sp = handler.state().clone(); + let sp = tokio::task::spawn_blocking(move || { + kebab_mcp::tools::ask::handle( + &state_sp, + kebab_mcp::tools::ask::AskInput { + query: "anything".to_string(), + session_id: None, + mode: Some("lexical".to_string()), + multi_hop: Some(false), + }, + ) + }) + .await + .unwrap(); + assert!( + !sp.is_error.unwrap_or(false), + "single-pass empty-KB refusal must NOT be isError — got {sp:?}" + ); + let sp_text = match &sp.content.first().unwrap().raw { + RawContent::Text(t) => t.text.clone(), + other => panic!("expected text, got {other:?}"), + }; + let sp_v: serde_json::Value = serde_json::from_str(&sp_text).unwrap(); + assert_eq!(sp_v["schema_version"], "answer.v1"); + assert_eq!(sp_v["grounded"], false); +} + +/// AskInput's JSON-schema (rendered for tools/list) advertises the +/// new `multi_hop` field. Pins agent / MCP host capability discovery +/// against accidental schema-rename or omission. +#[test] +fn ask_input_schema_advertises_multi_hop_field() { + let schema = schemars::schema_for!(kebab_mcp::tools::ask::AskInput); + let v = serde_json::to_value(&schema).unwrap(); + let props = v + .get("properties") + .and_then(|p| p.as_object()) + .expect("AskInput schema must declare properties"); + assert!( + props.contains_key("multi_hop"), + "AskInput.multi_hop must surface in the JsonSchema — got keys: {:?}", + props.keys().collect::<Vec<_>>() + ); +} diff --git a/integrations/claude-code/kebab/SKILL.md b/integrations/claude-code/kebab/SKILL.md index 2d47638..3a3359f 100644 --- a/integrations/claude-code/kebab/SKILL.md +++ b/integrations/claude-code/kebab/SKILL.md @@ -80,13 +80,14 @@ Use when the user wants a synthesized answer, not a list of links. Input: ```json -{ "query": "", "session_id": "", "mode": "hybrid" } +{ "query": "", "session_id": "", "mode": "hybrid", "multi_hop": false } ``` -- Returns `answer.v1`: `answer` (markdown), `citations[]`, `grounded` (bool), `refusal_reason`, `model`, `conversation_id`, `turn_index`. 
+- Returns `answer.v1`: `answer` (markdown), `citations[]`, `grounded` (bool), `refusal_reason`, `model`, `conversation_id`, `turn_index`, `hops` (multi-hop only). - **If `grounded == false`** → KB doesn't have enough context. Don't paraphrase the refusal as if it were an answer. Tell the user the KB came up dry and fall back to your own knowledge or ask for the source. - For follow-up turns on the same topic, pass `session_id` (e.g. `"team-onboarding-2026-05"`) and reuse it across the conversation. Sessions persist until `kebab reset --data-only`. - p9-fb-40: 기본 `prompt_template_version = "rag-v2"`. 답변이 더 strict — fact 인용 시 verbatim span, 학습 지식 동원 금지, 근거 모호 시 "확실하지 않다" 출현 가능. user 가 `[rag] prompt_template_version = "rag-v1"` 명시 시 legacy 동작. +- **p9-fb-41 `multi_hop: true`** — opt the ask into the multi-hop pipeline. The query is decomposed into sub-questions, each retrieved independently (LLM-driven decide loop, up to `rag.multi_hop_max_depth` iters), then synthesized over the merged chunk pool. Cost trade-off: 2–5× LLM calls vs. single-pass. **Use** for compound questions ("X 와 Y 의 차이는?", prereq chains, cross-doc reasoning where one chunk alone is insufficient). **Don't** for simple fact-finding (single-pass is faster + cheaper). When set, `answer.v1.hops[]` carries the per-hop trace (`{iter, kind, sub_queries[], context_chunks_added, forced_stop, llm_call_ms}`) — surface a brief "Searched in N hops" note when the trace is non-trivial. Decompose-failure (model emitted non-JSON) → `refusal_reason = "multi_hop_decompose_failed"`; treat like any other refusal. ### `mcp__kebab__fetch` — when you need raw text