Merge pull request 'feat(rag): fb-41 PR-3a HopRecord wire + RagCfg multi-hop knobs' (#168) from feat/fb-41-pr-3-dynamic-decide-loop into main

This commit was merged in pull request #168.
This commit is contained in:
2026-05-25 07:18:27 +00:00
8 changed files with 233 additions and 3 deletions

View File

@@ -1639,6 +1639,7 @@ mod tests {
created_at: OffsetDateTime::now_utc(),
conversation_id: None,
turn_index: None,
hops: None,
}
}

View File

@@ -181,6 +181,39 @@ pub struct RagCfg {
pub score_gate: f32,
pub explain_default: bool,
pub max_context_tokens: usize,
/// p9-fb-41: hard ceiling on the number of multi-hop iterations
/// (decompose iter + decide iters). When the LLM keeps returning
/// `continue` past this depth the pipeline cuts to `synthesize`
/// with `HopRecord.forced_stop = true`. Default `3` — enough for
/// most cross-doc reasoning, low enough to bound LLM cost.
#[serde(default = "default_multi_hop_max_depth")]
pub multi_hop_max_depth: u32,
/// p9-fb-41: cap on how many sub-queries the LLM may emit in a
/// single decompose / decide call. Mirrors
/// [`MULTI_HOP_MAX_SUB_QUERIES_DEFAULT`] in kebab-rag — the
/// const is the hard floor while this is the runtime knob.
/// Default `5`.
#[serde(default = "default_multi_hop_max_sub_queries_per_iter")]
pub multi_hop_max_sub_queries_per_iter: u32,
/// p9-fb-41: hard ceiling on the deduped chunk pool. When the
/// accumulated pool would exceed this many chunks the pipeline
/// stops accepting new retrieval results and forces synthesize
/// with `forced_stop = true`. Default `30` — generous for
/// 5-hop / 10-hits multi-hop runs while still bounded.
#[serde(default = "default_multi_hop_max_pool_chunks")]
pub multi_hop_max_pool_chunks: u32,
}
/// Fallback for `RagCfg::multi_hop_max_depth`: referenced by the field's
/// `#[serde(default = ...)]` attribute and by the built-in config
/// literal, so both paths agree on the same value.
fn default_multi_hop_max_depth() -> u32 {
    const MAX_DEPTH: u32 = 3;
    MAX_DEPTH
}
/// Fallback for `RagCfg::multi_hop_max_sub_queries_per_iter`: referenced
/// by the field's `#[serde(default = ...)]` attribute and by the
/// built-in config literal.
fn default_multi_hop_max_sub_queries_per_iter() -> u32 {
    const MAX_SUB_QUERIES_PER_ITER: u32 = 5;
    MAX_SUB_QUERIES_PER_ITER
}
/// Fallback for `RagCfg::multi_hop_max_pool_chunks`: referenced by the
/// field's `#[serde(default = ...)]` attribute and by the built-in
/// config literal.
fn default_multi_hop_max_pool_chunks() -> u32 {
    const MAX_POOL_CHUNKS: u32 = 30;
    MAX_POOL_CHUNKS
}
/// Settings for the image ingest pipeline (P6). `ocr` controls OCR
@@ -434,6 +467,10 @@ impl Config {
score_gate: 0.30,
explain_default: false,
max_context_tokens: 8000,
multi_hop_max_depth: default_multi_hop_max_depth(),
multi_hop_max_sub_queries_per_iter:
default_multi_hop_max_sub_queries_per_iter(),
multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(),
},
image: ImageCfg::defaults(),
ui: UiCfg::defaults(),
@@ -717,6 +754,21 @@ impl Config {
self.rag.max_context_tokens = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_DEPTH" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_depth = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_SUB_QUERIES_PER_ITER" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_sub_queries_per_iter = n;
}
}
"KEBAB_RAG_MULTI_HOP_MAX_POOL_CHUNKS" => {
if let Ok(n) = v.parse::<u32>() {
self.rag.multi_hop_max_pool_chunks = n;
}
}
// image.ocr
"KEBAB_IMAGE_OCR_ENABLED" => {
@@ -1092,6 +1144,61 @@ theme = "dark"
assert_eq!(c.image.ocr.request_timeout_secs, 300);
}
// ── p9-fb-41: multi-hop RAG knobs ────────────────────────────────────
/// The built-in configuration ships a multi-hop depth ceiling of 3.
#[test]
fn default_multi_hop_max_depth_is_3() {
    let cfg = Config::defaults();
    assert_eq!(cfg.rag.multi_hop_max_depth, 3);
}
/// The built-in configuration allows 5 sub-queries per iteration.
#[test]
fn default_multi_hop_max_sub_queries_per_iter_is_5() {
    let cfg = Config::defaults();
    assert_eq!(cfg.rag.multi_hop_max_sub_queries_per_iter, 5);
}
/// The built-in configuration caps the chunk pool at 30 entries.
#[test]
fn default_multi_hop_max_pool_chunks_is_30() {
    let cfg = Config::defaults();
    assert_eq!(cfg.rag.multi_hop_max_pool_chunks, 30);
}
/// Each `KEBAB_RAG_MULTI_HOP_*` variable replaces the matching default
/// when it holds a parseable `u32`.
#[test]
fn env_overrides_multi_hop_knobs() {
    // Variable name -> raw string value, exactly as the process
    // environment would supply them.
    let overrides = [
        ("KEBAB_RAG_MULTI_HOP_MAX_DEPTH", "5"),
        ("KEBAB_RAG_MULTI_HOP_MAX_SUB_QUERIES_PER_ITER", "7"),
        ("KEBAB_RAG_MULTI_HOP_MAX_POOL_CHUNKS", "50"),
    ];
    let env: HashMap<String, String> = overrides
        .iter()
        .map(|&(key, value)| (key.to_string(), value.to_string()))
        .collect();

    let cfg = Config::defaults().apply_env(&env);
    let rag = &cfg.rag;
    assert_eq!(rag.multi_hop_max_depth, 5);
    assert_eq!(rag.multi_hop_max_sub_queries_per_iter, 7);
    assert_eq!(rag.multi_hop_max_pool_chunks, 50);
}
/// Backwards-compat invariant (post-PR-3 fb-41): a config file written
/// before the multi-hop knobs existed must still parse, with every
/// `multi_hop_*` field falling back to its documented default. Reuses
/// the [`LEGACY_PRE_TIMEOUT_TOML`] fixture shared with the LLM / OCR
/// timeout invariants, since that fixture also predates these fields.
#[test]
fn legacy_config_without_multi_hop_knobs_uses_defaults() {
    let parsed: Result<Config, _> = toml::from_str(LEGACY_PRE_TIMEOUT_TOML);
    let cfg = parsed.expect("parse legacy config");
    let rag = &cfg.rag;
    assert_eq!(rag.multi_hop_max_depth, 3);
    assert_eq!(rag.multi_hop_max_sub_queries_per_iter, 5);
    assert_eq!(rag.multi_hop_max_pool_chunks, 30);
}
#[test]
fn image_ocr_env_overrides() {
let mut env = HashMap::new();

View File

@@ -29,6 +29,14 @@ pub struct Answer {
/// 이면 single-shot.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub turn_index: Option<u32>,
/// p9-fb-41: multi-hop hop trace. `None` for single-pass asks.
/// Each entry records one hop (`decompose` / `decide` / `synthesize`)
/// — the LLM call category, the sub-queries emitted, retrieval
/// counts, and a `forced_stop` flag for cap-driven termination.
/// Wire-additive: `answer.v1` schema_version unchanged; consumers
/// reading older single-pass answers see `hops: None` (or absent).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hops: Option<Vec<HopRecord>>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -55,6 +63,62 @@ pub struct Turn {
pub created_at: OffsetDateTime,
}
/// p9-fb-41: one entry in [`Answer::hops`] — the per-iteration trace
/// of a multi-hop ask. The pipeline is meant to append one `HopRecord`
/// per LLM call (decompose / decide / synthesize) so a `--multi-hop`
/// user can see what sub-queries the LLM emitted, how many chunks each
/// hop contributed, whether the iter stopped on the model's own
/// signal or hit a cap, and the per-hop LLM latency.
///
/// Wire shape: `sub_queries` is the only field carrying
/// `#[serde(default)]`, so it is the only one a payload may omit.
/// `iter`, `kind`, `context_chunks_added`, `forced_stop` and
/// `llm_call_ms` are required when deserializing.
///
/// NOTE(review): an earlier revision of this comment stated that every
/// field which could plausibly be omitted uses `#[serde(default)]`.
/// Confirm whether the counter / flag fields should default as well,
/// or whether requiring them is intentional.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct HopRecord {
/// 0-based hop index within this ask. `iter=0` is always the
/// initial decompose call; subsequent iters are decide calls;
/// the final iter is the synthesize call.
pub iter: u32,
/// Pipeline stage this record describes (see [`HopKind`]).
pub kind: HopKind,
/// Sub-queries the LLM emitted at this iter. For the synthesize
/// hop this is empty (no sub-queries — just the final answer).
/// Deserializes to an empty vector when the key is absent.
#[serde(default)]
pub sub_queries: Vec<String>,
/// Number of *new* chunks the retrieval round contributed to the
/// pool (dedup'd by `chunk_id` — repeated hits from a previous
/// iter do not count). `0` for the decompose hop (no retrieval
/// yet) and the synthesize hop.
pub context_chunks_added: u32,
/// `true` when the pipeline cut the iter loop short because a
/// safety cap fired (`max_depth` / `max_total_sub_queries` /
/// `max_pool_chunks`) rather than because the LLM signalled
/// stop. The user-visible answer still reflects all chunks
/// accumulated up to that point — `forced_stop` is a tracing
/// signal, not a refusal.
///
/// NOTE(review): the config knobs are named `multi_hop_max_depth`,
/// `multi_hop_max_sub_queries_per_iter` and
/// `multi_hop_max_pool_chunks`; confirm which cap
/// `max_total_sub_queries` refers to.
pub forced_stop: bool,
/// Wall-clock latency of the LLM call for this hop, in
/// milliseconds. Useful for cost / latency analysis when a
/// `kebab eval` run records `Answer.hops`.
pub llm_call_ms: u32,
}
/// p9-fb-41: which stage of the multi-hop pipeline a [`HopRecord`]
/// describes. `rename_all = "snake_case"` makes the variants travel
/// on the wire as the strings `"decompose"`, `"decide"` and
/// `"synthesize"`, so agents / CLIs can branch on the string without
/// referencing the Rust enum.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum HopKind {
/// First hop — LLM decomposed the user query into sub-queries.
/// Wire value: `"decompose"`.
Decompose,
/// Subsequent hop — LLM was asked whether more retrieval is
/// needed and either emitted new sub-queries (`continue`) or
/// returned an empty array (`stop`). Wire value: `"decide"`.
Decide,
/// Terminal hop — LLM produced the final user-visible answer
/// over the accumulated chunk pool. Wire value: `"synthesize"`.
Synthesize,
}
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RefusalReason {

View File

@@ -56,8 +56,8 @@ pub use search::{
TraceCandidate, TraceFusionInput, TraceTiming,
};
pub use answer::{
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason, TokenUsage,
TraceId, Turn,
Answer, AnswerCitation, AnswerRetrievalSummary, HopKind, HopRecord, ModelRef,
RefusalReason, TokenUsage, TraceId, Turn,
};
pub use ingest::{IngestItem, IngestItemKind, IngestReport, SkipExamples};
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};

View File

@@ -532,6 +532,7 @@ mod tests {
created_at: OffsetDateTime::UNIX_EPOCH,
conversation_id: None,
turn_index: None,
hops: None,
}
}

View File

@@ -531,6 +531,11 @@ impl RagPipeline {
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
// p9-fb-41 Step 2 of PR-3: every Answer literal carries
// `hops`. Single-pass + refusal paths leave it `None`;
// only the multi-hop happy path will set `Some(...)` in
// Step 5 once the decide loop populates a hop trace.
hops: None,
};
// Drop the moved `finish_reason` early into a tracing breadcrumb; the
@@ -843,6 +848,11 @@ impl RagPipeline {
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
// p9-fb-41 Step 2 of PR-3: every Answer literal carries
// `hops`. Single-pass + refusal paths leave it `None`;
// only the multi-hop happy path will set `Some(...)` in
// Step 5 once the decide loop populates a hop trace.
hops: None,
};
tracing::debug!(
@@ -979,6 +989,11 @@ impl RagPipeline {
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
// p9-fb-41 Step 2 of PR-3: every Answer literal carries
// `hops`. Single-pass + refusal paths leave it `None`;
// only the multi-hop happy path will set `Some(...)` in
// Step 5 once the decide loop populates a hop trace.
hops: None,
};
if let Some(sink) = &opts.stream_sink {
let _ = sink.send(StreamEvent::Final {
@@ -1100,6 +1115,11 @@ impl RagPipeline {
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
// p9-fb-41 Step 2 of PR-3: every Answer literal carries
// `hops`. Single-pass + refusal paths leave it `None`;
// only the multi-hop happy path will set `Some(...)` in
// Step 5 once the decide loop populates a hop trace.
hops: None,
};
if let Err(e) = self.docs.put_answer(&answer, query, None) {
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (NoChunks) failed");
@@ -1182,6 +1202,11 @@ impl RagPipeline {
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
// p9-fb-41 Step 2 of PR-3: every Answer literal carries
// `hops`. Single-pass + refusal paths leave it `None`;
// only the multi-hop happy path will set `Some(...)` in
// Step 5 once the decide loop populates a hop trace.
hops: None,
};
if let Err(e) = self.docs.put_answer(&answer, query, None) {
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (ScoreGate) failed");
@@ -1789,6 +1814,7 @@ mod stream_event_serde_tests {
created_at: datetime!(2026-05-09 12:00:00 UTC),
conversation_id: None,
turn_index: None,
hops: None,
};
let ev = StreamEvent::Final { answer };
let v = serde_json::to_value(&ev).unwrap();

View File

@@ -77,6 +77,7 @@ fn make_answer(grounded: bool, refusal: Option<RefusalReason>, body: &str) -> An
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
conversation_id: None,
turn_index: None,
hops: None,
}
}

View File

@@ -85,7 +85,37 @@ XL 작업 — 6 PR 분할 (각 머지 후 누적, 마지막 PR 후 v0.18.0 cut).
---
## PR-3: Dynamic iteration (decide loop + caps)
## PR-3 분할 (작업 양 측면, 2026-05-25 사용자 결정)
**원래 plan**: PR-3 가 wire additive (`Answer.hops`) + RagCfg 노브 + decide loop + ScriptedLm + helper refactor + 5+ tests 단일 PR.
**실제 분할** (~1500+ 줄 단일 PR → review 부담 + 회귀 위험 ↓):
- **PR-3a (본 PR)**: wire additive (HopRecord + HopKind + Answer.hops) + RagCfg 3 노브 + 모든 Answer literal 갱신 (hops:None). **RAG pipeline 동작 미변경** — additive only.
- **PR-3b (후속)**: dynamic decide loop + ScriptedLm helper + 5+ integration tests + format! named arg + 회차 1 carry-over (mirror refactor / history block helper).
## PR-3a: Wire additive + RagCfg 노브 (HopRecord type + Answer.hops field)
**Goal**: 후속 PR (PR-3b decide loop) 의 wire / config foundation. RAG pipeline 동작 변경 없음 — `Answer.hops` 가 모든 path 에서 `None`, RagCfg 새 3 노브가 default 만 적용. PR-3b 가 이 위에서 decide loop 구현.
**Files**:
- `crates/kebab-core/src/answer.rs`:
- `HopRecord` struct (`iter`, `kind`, `sub_queries`, `context_chunks_added`, `forced_stop`, `llm_call_ms`).
- `HopKind` enum (`Decompose` / `Decide` / `Synthesize`).
- `Answer.hops: Option<Vec<HopRecord>>` field — `#[serde(default, skip_serializing_if = "Option::is_none")]`.
- `crates/kebab-core/src/lib.rs`: `pub use answer::{HopKind, HopRecord, ...}`.
- `crates/kebab-config/src/lib.rs`:
- `RagCfg` 에 `multi_hop_max_depth: u32` (default 3), `multi_hop_max_sub_queries_per_iter: u32` (default 5), `multi_hop_max_pool_chunks: u32` (default 30). `#[serde(default)]` + env override + legacy parse.
- 모든 Answer literal site 갱신 (9 sites: kebab-rag/src/pipeline.rs ×6 + kebab-cli/src/main.rs + kebab-tui/tests/ask.rs + kebab-eval/src/metrics.rs): `hops: None` 명시. 향후 PR-3b 의 ask_multi_hop happy path 만 `Some(hops_trace)` 채움.
**Tests**:
- `default_multi_hop_max_depth_is_3`, `default_multi_hop_max_sub_queries_per_iter_is_5`, `default_multi_hop_max_pool_chunks_is_30`.
- `env_overrides_multi_hop_knobs`.
- `legacy_config_without_multi_hop_knobs_uses_defaults` (LEGACY_PRE_TIMEOUT_TOML 공유).
- 모든 기존 RAG / TUI / CLI / eval test 가 hops:None 추가 후도 통과 (회귀 핀).
**Wire 영향**: `answer.v1` JSON Schema 의 `hops` optional 필드 — `skip_serializing_if` 가 `None` 일 때 emit 안 함이라 옛 single-pass response 에 변동 없음. wire breaking 아님. JSON Schema 갱신은 PR-3b 또는 PR-4 (실제 emit 시점).
## PR-3b: Dynamic iteration (decide loop + caps) — 후속 PR
**Goal**: depth=2 fixed → dynamic N-hop. LLM 의 decide signal + max_depth / max_sub_queries / max_pool_chunks cap.