Merge pull request 'feat(rag): multi-turn ask — Turn + ask_with_history + token budget (p9-fb-15)' (#60) from feat/p9-fb-15-rag-multiturn into main

This commit was merged in pull request #60.
This commit is contained in:
2026-05-02 23:14:54 +00:00
13 changed files with 304 additions and 6 deletions

View File

@@ -44,6 +44,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
- **2026-05-02 P9 도그푸딩 후속 (spec PR #51 + p9-fb-01 + p9-fb-02)** — `kebab ingest` 진행 표시 도입. frozen design §2.4a 신설 (wire schema `ingest_progress.v1` line-delimited streaming) + §10 의 long-running 작업 절 추가. `kebab-app::ingest_with_config_progress(.., progress: Option<Sender<IngestEvent>>)` facade 추가, 기존 `_with_config` 는 `progress=None` forwarding wrapper. CLI 가 indicatif TTY 진행 바 (stderr) / non-TTY 한 줄씩 / `--json` 모드는 line-delimited stdout. p9-fb-03 (TUI background worker) + p9-fb-04 (cancel) 가 같은 stream 위에 build.
- **2026-05-02 P9 도그푸딩 후속 (p9-fb-03)** — TUI 의 background ingest worker. Library 의 `r` 키가 `kebab_app::ingest_with_config_progress` 를 spawned thread 에서 호출, run loop 가 매 frame 마다 progress channel drain → 화면 하단 status bar 1 줄 갱신. terminal event (`Completed`/`Aborted`) 후 3 초 final 라인 hold + 자동 hide + Library auto-refresh. spec: `tasks/p9/p9-fb-03-tui-ingest-background.md`. (cancel slot 은 p9-fb-04 가 추가하는 형태로 단일화 — 회차 1 review 결과.)
- **2026-05-02 P9 도그푸딩 후속 (p9-fb-04)** — ingest cooperative cancellation. `kebab-app::ingest_with_config_cancellable(.., cancel: Option<Arc<AtomicBool>>)` facade 추가, 기존 `_progress` 는 `cancel=None` forwarding. asset loop iter 시작 boundary 마다 cancel poll → true 면 break + `IngestEvent::Aborted { partial_counts }` + `Ok(IngestReport)` 정상 반환 (Err 아님). 부분 commit 보존, 다음 ingest 가 idempotent 재개. CLI Ctrl-C SIGINT handler (`ctrlc` crate) — 1회: cancel, 2회: hard exit (130). TUI Esc / Ctrl-C 가 cancel signal (in-flight 시), 그 외에는 quit. `IngestState` 에 `cancel: Arc<AtomicBool>` field 추가. spec: `tasks/p9/p9-fb-04-ingest-cancellation.md`.
- **2026-05-02 P9 도그푸딩 후속 (spec PR #59 + p9-fb-15)** — RAG multi-turn 도입. frozen design §3.8 갱신 — `Answer` 에 `conversation_id` / `turn_index` optional field, 신규 `Turn` struct, `RefusalReason::LlmStreamAborted` variant. `kebab-rag::AskOpts` 에 `history: Vec<Turn>` / `conversation_id` / `turn_index` 3 field 추가, 기존 caller 는 `Vec::new() / None` (single-shot 동작 동일). `RagPipeline::ask_with_history(query, history, conversation_id, turn_index, opts)` helper. prompt 빌드: `[이전 대화]` 블록을 user prompt 위에 prepend, newest-first, char budget (`cfg.rag.max_context_tokens * 4`) 안에서 oldest 부터 drop. retrieval query expansion: 직전 answer 첫 200 자 concat. wire schema `answer.v1` 에 두 필드 + `format: date-time` 추가. p9-fb-16 (TUI conversation UI) + p9-fb-17/18 (V004 storage + CLI session) 가 같은 facade 위에 build. spec: `tasks/p9/p9-fb-15-rag-multi-turn-core.md`.
## 다음 task 후보

View File

@@ -30,6 +30,9 @@ fn ask_lexical_smoke() {
temperature: Some(0.0),
seed: Some(0),
stream_sink: None,
history: Vec::new(),
conversation_id: None,
turn_index: None,
};
// The fixture workspace contains "ownership" content; the model's
// citation behavior depends on its training, so we don't assert on

View File

@@ -430,6 +430,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
// once on completion). The TUI ask pane (P9-3) is what
// wires up a real `mpsc::Sender` here.
stream_sink: None,
// p9-fb-15: CLI single-shot ask. p9-fb-18 adds
// `--session` / `--repl` for multi-turn over the same
// facade (passes a populated `history`).
history: Vec::new(),
conversation_id: None,
turn_index: None,
};
let ans = kebab_app::ask_with_config(cfg, query, opts)?;
if cli.json {

View File

@@ -20,6 +20,15 @@ pub struct Answer {
pub usage: TokenUsage,
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
/// p9-fb-15: same conversation 의 turn 들이 공유. CLI single-shot
/// (history 없음) / TUI 첫 turn 은 None. blake3 해시 또는 사용자
/// 명시 (`kebab ask --session <id>`, p9-fb-18).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub conversation_id: Option<String>,
/// p9-fb-15: 같은 conversation 안 0-based 순서. 첫 turn = 0. None
/// 이면 single-shot.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub turn_index: Option<u32>,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -28,6 +37,19 @@ pub struct AnswerCitation {
pub citation: Citation,
}
/// p9-fb-15: one prior turn of a conversation, in the form used when
/// history is placed into the prompt. The RAG facade takes a
/// `Vec<Turn>` and builds the prompt from system + history + retrieval
/// + new question. When the history does not fit in the token budget,
/// the oldest turns are dropped first (the newest are kept
/// preferentially).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Turn {
/// Question text of this turn.
pub question: String,
/// Answer text of this turn.
pub answer: String,
/// Citations attached to this turn's answer.
pub citations: Vec<AnswerCitation>,
/// Timestamp of the turn; (de)serialized as an RFC 3339 string.
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
}
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RefusalReason {
@@ -35,6 +57,10 @@ pub enum RefusalReason {
LlmSelfJudge,
NoIndex,
NoChunks,
/// p9-fb-15: ask 가 LLM 토큰 stream 도중 cancel 됨. partial answer
/// 가 채워져 있을 수 있음 (사용자가 본 부분까지). RAG retrieval
/// 자체는 정상 — 모델 generation 단계에서만 중단.
LlmStreamAborted,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]

View File

@@ -54,8 +54,8 @@ pub use search::{
SearchMode, SearchQuery,
};
pub use answer::{
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason,
TokenUsage, TraceId,
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason, TokenUsage,
TraceId, Turn,
};
pub use ingest::{IngestItem, IngestItemKind, IngestReport};
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};

View File

@@ -496,6 +496,8 @@ mod tests {
},
usage: TokenUsage { prompt_tokens: 1, completion_tokens: 1, latency_ms: 1 },
created_at: OffsetDateTime::UNIX_EPOCH,
conversation_id: None,
turn_index: None,
}
}

View File

@@ -174,6 +174,11 @@ fn execute_query(app: &App, gq: &GoldenQuery, opts: &EvalRunOpts) -> QueryResult
temperature: opts.temperature,
seed: opts.seed,
stream_sink: None,
// p9-fb-15: golden eval is single-shot per query; no
// conversational history.
history: Vec::new(),
conversation_id: None,
turn_index: None,
};
match app.ask(&gq.query, ask_opts) {
Ok(ans) => Some(ans),

View File

@@ -36,7 +36,7 @@ use anyhow::{Context, Result};
use kebab_core::{
Answer, AnswerCitation, AnswerRetrievalSummary, Citation, FinishReason,
GenerateRequest, LanguageModel, ModelRef, RefusalReason, Retriever, SearchFilters,
SearchHit, SearchMode, SearchQuery, TokenChunk, TokenUsage, TraceId,
SearchHit, SearchMode, SearchQuery, TokenChunk, TokenUsage, TraceId, Turn,
};
use kebab_core::versions::PromptTemplateVersion;
use kebab_store_sqlite::SqliteStore;
@@ -80,6 +80,22 @@ pub struct AskOpts {
/// pipeline — `SendError` is silently swallowed and generation
/// continues so the `Answer` row still gets persisted.
pub stream_sink: Option<std::sync::mpsc::Sender<String>>,
/// p9-fb-15: prior turns of the same conversation. Empty for
/// single-shot ask. The pipeline prepends a serialized `[이전
/// 대화]` block to the user prompt and uses the most-recent
/// answer's first 200 chars to expand the retrieval query
/// (cheap concat — LLM-based standalone-question rewriting is
/// out of scope per spec §3.8). Newest-first prepended; older
/// turns drop when the prompt would otherwise exceed
/// `cfg.rag.max_context_tokens`.
pub history: Vec<Turn>,
/// p9-fb-15: same conversation 의 turn 들이 공유. Filled into
/// `Answer.conversation_id`. None for single-shot ask.
pub conversation_id: Option<String>,
/// p9-fb-15: 0-based index within `conversation_id`. Caller
/// (TUI / CLI session) computes from `history.len()`. None for
/// single-shot ask.
pub turn_index: Option<u32>,
}
// ── RagPipeline ─────────────────────────────────────────────────────────────
@@ -111,6 +127,29 @@ impl RagPipeline {
}
}
/// p9-fb-15: multi-turn convenience wrapper around [`Self::ask`].
///
/// Takes the caller's `opts` and overrides exactly three fields —
/// `history`, `conversation_id` and `turn_index` — with the values
/// passed here; every other knob in `opts` is carried through
/// untouched. The `Answer` built by [`Self::ask`] copies
/// `conversation_id` / `turn_index` from those options, so the result
/// carries the same pair. CLI / TUI sessions call this once per
/// follow-up question.
pub fn ask_with_history(
    &self,
    query: &str,
    history: Vec<Turn>,
    conversation_id: String,
    turn_index: u32,
    opts: AskOpts,
) -> Result<Answer> {
    // Start from the caller's options and overwrite only the
    // conversation-related fields.
    let mut multi_turn = opts;
    multi_turn.history = history;
    multi_turn.conversation_id = Some(conversation_id);
    multi_turn.turn_index = Some(turn_index);
    self.ask(query, multi_turn)
}
/// Run one query through the full pipeline. Always persists an
/// `answers` row (including refusals); the row write is best-effort
/// — a persistence error is surfaced via `tracing::warn!` so the
@@ -121,8 +160,14 @@ impl RagPipeline {
// ── 1. Retrieve ────────────────────────────────────────────────────
// floor at config default — see `AskOpts::k` doc for rationale.
let k_effective = opts.k.max(self.config.search.default_k);
// p9-fb-15: query expansion when history is present.
// Concat the most-recent answer's first 200 chars so the
// retriever sees the full conversational context. Cheap —
// LLM-based standalone-question rewriting is out of scope
// (spec §3.8 marks it P+).
let expanded_query = expand_query_with_history(query, &opts.history);
let search_query = SearchQuery {
text: query.to_string(),
text: expanded_query,
mode: opts.mode,
k: k_effective,
filters: SearchFilters::default(),
@@ -171,7 +216,25 @@ impl RagPipeline {
// ── 4. Render prompt ───────────────────────────────────────────────
let system = SYSTEM_PROMPT_RAG_V1.to_string();
let user = format!("[질문]\n{query}\n\n[근거]\n{packed_text}");
// p9-fb-15: prepend `[이전 대화]` block when history is
// present. `serialize_history` enforces the spec §3.8
// priority — system+question stay untouched, retrieved
// chunks already fit (`pack_context` honoured the budget),
// so the budget remaining for history is what's left over.
let history_budget_chars = remaining_history_budget_chars(
self.config.rag.max_context_tokens,
&system,
query,
&packed_text,
);
let history_block = serialize_history(&opts.history, history_budget_chars);
let user = if history_block.is_empty() {
format!("[질문]\n{query}\n\n[근거]\n{packed_text}")
} else {
format!(
"{history_block}\n\n[질문]\n{query}\n\n[근거]\n{packed_text}"
)
};
// ── 5. Generate ────────────────────────────────────────────────────
// Completion budget is bounded only by what the LM context window
@@ -322,6 +385,8 @@ impl RagPipeline {
},
usage: usage_final,
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
};
// Drop the moved `finish_reason` early into a tracing breadcrumb; the
@@ -455,6 +520,8 @@ impl RagPipeline {
latency_ms: elapsed_ms,
},
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
};
if let Err(e) = self.docs.put_answer(&answer, query, None) {
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (NoChunks) failed");
@@ -530,6 +597,8 @@ impl RagPipeline {
latency_ms: elapsed_ms,
},
created_at: OffsetDateTime::now_utc(),
conversation_id: opts.conversation_id.clone(),
turn_index: opts.turn_index,
};
if let Err(e) = self.docs.put_answer(&answer, query, None) {
tracing::warn!(target: "kebab-rag", error = %e, "kb-rag: put_answer (ScoreGate) failed");
@@ -569,6 +638,80 @@ fn est_tokens(s: &str) -> usize {
s.chars().count().div_ceil(4)
}
/// p9-fb-15: build the retrieval query for a follow-up question.
///
/// With no history the query is returned unchanged. Otherwise the
/// first 200 chars (Unicode scalar values, not bytes) of the most
/// recent turn's answer are appended after a single space, yielding
/// `<question> <last answer prefix>`. This is the cheap concat from
/// spec §3.8 (LLM-based standalone-question rewriting is P+); it lets
/// embedding / FTS still hit on names that only appeared in the prior
/// turn ("Y" in "Y vs X 의 차이?"). An empty last answer adds nothing,
/// not even the separating space.
fn expand_query_with_history(query: &str, history: &[Turn]) -> String {
    let mut expanded = String::from(query);
    if let Some(latest) = history.last() {
        if !latest.answer.is_empty() {
            expanded.push(' ');
            expanded.extend(latest.answer.chars().take(200));
        }
    }
    expanded
}
/// p9-fb-15: number of *chars* still available for the history block.
///
/// The total prompt budget is `max_context_tokens * 4` chars (four
/// chars per token). From that we subtract the char counts of the
/// pieces that are already committed — system prompt, question and
/// packed retrieval text — plus a flat 32-char allowance for the
/// prompt template glue (`[질문]\n`, `\n\n[근거]\n`, and the `\n\n`
/// between history and question).
///
/// All arithmetic on the budget saturates: the result is 0 (history
/// fully dropped) when the committed pieces already exceed the budget.
fn remaining_history_budget_chars(
    max_context_tokens: usize,
    system: &str,
    question: &str,
    packed_text: &str,
) -> usize {
    const CHARS_PER_TOKEN: usize = 4;
    // Rounded-up allowance for the fixed format-string scaffolding.
    const TEMPLATE_OVERHEAD_CHARS: usize = 32;

    let committed = [system, question, packed_text]
        .iter()
        .map(|part| part.chars().count())
        .sum::<usize>()
        + TEMPLATE_OVERHEAD_CHARS;

    max_context_tokens
        .saturating_mul(CHARS_PER_TOKEN)
        .saturating_sub(committed)
}
/// p9-fb-15: render prior turns as the `[이전 대화]` prompt block.
///
/// Turns are *selected* newest-first (spec §3.8): walking `history`
/// from the back, each turn is rendered as `Q: …\nA: …\n` and kept as
/// long as header + kept turns stay within `budget_chars` (counted in
/// chars). Selection stops at the first turn that does not fit, so
/// older turns are the ones dropped. The kept turns are then *emitted*
/// in chronological order, so the newest turn sits at the bottom, just
/// above the current question.
///
/// Returns an empty string when `history` is empty, the budget is 0,
/// or not even the newest turn fits next to the header.
fn serialize_history(history: &[Turn], budget_chars: usize) -> String {
    const HEADER: &str = "[이전 대화]\n";

    if budget_chars == 0 || history.is_empty() {
        return String::new();
    }

    // Running char count of everything accepted so far, header included.
    let mut spent = HEADER.chars().count();
    let mut kept: Vec<String> = history
        .iter()
        .rev()
        .map(|turn| format!("Q: {}\nA: {}\n", turn.question, turn.answer))
        .take_while(|rendered| {
            let next = spent + rendered.chars().count();
            let fits = next <= budget_chars;
            if fits {
                spent = next;
            }
            fits
        })
        .collect();

    if kept.is_empty() {
        return String::new();
    }

    // Collected newest-first; flip back to chronological for output.
    kept.reverse();
    let mut out = String::with_capacity(spent);
    out.push_str(HEADER);
    out.extend(kept);
    out
}
/// Strict marker regex per design §1 / spec line 107: `[#1]` … `[#999]`.
/// Matches without `#`, with whitespace, or with non-digit content are
/// intentionally ignored (see test plan rows 56).
@@ -634,4 +777,104 @@ mod tests {
// 8 chars → 2 tokens
assert_eq!(est_tokens("abcdefgh"), 2);
}
// ── p9-fb-15: multi-turn helpers ───────────────────────────────────────
/// Test fixture: a `Turn` with the given question / answer text, no
/// citations, and `created_at` set to the current UTC time.
fn fake_turn(question: &str, answer: &str) -> Turn {
    let (question, answer) = (question.to_owned(), answer.to_owned());
    Turn {
        question,
        answer,
        citations: vec![],
        created_at: OffsetDateTime::now_utc(),
    }
}
#[test]
fn expand_query_with_history_empty_returns_query_unchanged() {
    // No history: the retrieval query must pass through untouched.
    let expanded = expand_query_with_history("hi", &[]);
    assert_eq!(expanded, "hi");
}
#[test]
fn expand_query_with_history_concats_last_answer_prefix() {
    // One prior turn: the query comes first, then the prior answer.
    let history = [fake_turn("Q1", "first answer body")];
    let expanded = expand_query_with_history("follow-up", &history);

    let leads_with_query = expanded.starts_with("follow-up ");
    let carries_answer = expanded.contains("first answer body");
    assert!(leads_with_query, "got: {expanded}");
    assert!(carries_answer, "got: {expanded}");
}
#[test]
fn expand_query_caps_last_answer_at_200_chars() {
    // A 500-char answer must contribute only its first 200 chars.
    let oversized_answer = "x".repeat(500);
    let history = [fake_turn("Q", &oversized_answer)];
    let expanded_len = expand_query_with_history("q", &history)
        .chars()
        .count();
    // query (1 char) + joining space (1 char) + 200 retained 'x' chars.
    assert_eq!(expanded_len, 1 + 1 + 200);
}
#[test]
fn expand_query_uses_last_turn_only() {
    // Two prior turns: only the most recent answer may leak into the
    // retrieval query.
    let history = [
        fake_turn("Q1", "FIRST ANSWER"),
        fake_turn("Q2", "LATEST ANSWER"),
    ];
    let expanded = expand_query_with_history("q3", &history);

    let has_latest = expanded.contains("LATEST ANSWER");
    let has_first = expanded.contains("FIRST ANSWER");
    assert!(has_latest, "got: {expanded}");
    assert!(!has_first, "got: {expanded}");
}
#[test]
fn serialize_history_empty_returns_empty_string() {
    // Case 1: nothing to serialize, generous budget.
    let from_no_history = serialize_history(&[], 1000);
    assert_eq!(from_no_history, "");

    // Case 2: one turn available, but a zero budget.
    let single = [fake_turn("q", "a")];
    let from_zero_budget = serialize_history(&single, 0);
    assert_eq!(from_zero_budget, "");
}
#[test]
fn serialize_history_chronological_order_with_header() {
    // With an ample budget all turns are emitted, header first and
    // oldest turn before newest.
    let history = [
        fake_turn("Q1", "A1"),
        fake_turn("Q2", "A2"),
        fake_turn("Q3", "A3"),
    ];
    let s = serialize_history(&history, 1000);

    let has_header = s.starts_with("[이전 대화]\n");
    assert!(has_header, "got: {s:?}");

    let oldest_at = s.find("Q1").unwrap();
    let newest_at = s.find("Q3").unwrap();
    assert!(oldest_at < newest_at, "chronological: oldest first; got: {s:?}");
}
#[test]
fn serialize_history_drops_oldest_when_budget_tight() {
    // Budget tight enough that only 1 of 3 turns fits.
    let h = vec![
        fake_turn("Q1", "A1"),
        fake_turn("Q2", "A2"),
        fake_turn("Q3", "A3"),
    ];
    // Header "[이전 대화]\n" is 8 chars and one turn ("Q: Q3\nA: A3\n")
    // is 12 chars, so one turn needs 20 chars and two turns need 32.
    // A budget of 25 therefore admits exactly the newest turn.
    let s = serialize_history(&h, 25);
    assert!(s.contains("Q3"), "newest must be kept: {s:?}");
    // Without this check a budget overrun that also kept Q2 would go
    // unnoticed, contradicting the "only 1 of 3" premise above.
    assert!(!s.contains("Q2"), "middle turn must not fit: {s:?}");
    assert!(!s.contains("Q1"), "oldest dropped: {s:?}");
    // Pin the exact rendering so header or format drift is caught too.
    assert_eq!(s, "[이전 대화]\nQ: Q3\nA: A3\n");
}
#[test]
fn remaining_history_budget_subtracts_known_pieces() {
    // Total budget: 100 tokens * 4 chars = 400 chars.
    // Committed: system 100 + question 50 + packed 150 + 32 overhead
    // = 332 chars, leaving 68 for history.
    let system = "x".repeat(100);
    let question = "y".repeat(50);
    let packed = "z".repeat(150);
    let expected = 400 - 100 - 50 - 150 - 32;
    let left = remaining_history_budget_chars(100, &system, &question, &packed);
    assert_eq!(left, expected);
}
#[test]
fn remaining_history_budget_clamps_to_zero_when_overrun() {
    // A 1000-char system prompt against a 10-token (40-char) budget:
    // the subtraction must saturate at 0 rather than underflow.
    let oversized_system = "x".repeat(1000);
    let left = remaining_history_budget_chars(10, &oversized_system, "q", "p");
    assert_eq!(left, 0);
}
}

View File

@@ -72,6 +72,9 @@ fn default_opts() -> AskOpts {
temperature: Some(0.0),
seed: Some(0),
stream_sink: None,
history: Vec::new(),
conversation_id: None,
turn_index: None,
}
}

View File

@@ -98,6 +98,7 @@ fn refusal_reason_label(r: &RefusalReason) -> &'static str {
RefusalReason::LlmSelfJudge => "llm_self_judge",
RefusalReason::NoIndex => "no_index",
RefusalReason::NoChunks => "no_chunks",
RefusalReason::LlmStreamAborted => "llm_stream_aborted",
}
}

View File

@@ -141,6 +141,7 @@ fn render_status(f: &mut Frame, area: Rect, s: &AskState) {
Some(RefusalReason::LlmSelfJudge) => " refusal=llm_self_judge",
Some(RefusalReason::NoIndex) => " refusal=no_index",
Some(RefusalReason::NoChunks) => " refusal=no_chunks",
Some(RefusalReason::LlmStreamAborted) => " refusal=llm_stream_aborted",
None => "",
};
vec![
@@ -300,6 +301,11 @@ fn spawn_ask_worker(state: &mut App) {
temperature: None,
seed: None,
stream_sink: Some(tx),
// p9-fb-15: TUI ask is single-shot in this task; multi-turn
// conversation UI lands in p9-fb-16.
history: Vec::new(),
conversation_id: None,
turn_index: None,
};
let handle =
thread::spawn(move || kebab_app::ask_with_config(cfg, &query, opts));

View File

@@ -66,6 +66,8 @@ fn make_answer(grounded: bool, refusal: Option<RefusalReason>, body: &str) -> An
latency_ms: 1200,
},
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
conversation_id: None,
turn_index: None,
}
}

View File

@@ -3,7 +3,7 @@ phase: P9
component: kebab-rag + kebab-app
task_id: p9-fb-15
title: "RAG multi-turn — history-aware prompt + token budget"
status: planned
status: in_progress
depends_on: []
unblocks: [p9-fb-16, p9-fb-17, p9-fb-18]
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md