From 610d29f0534440939950d2720d41af7a38032b10 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Sat, 9 May 2026 23:44:51 +0900
Subject: [PATCH] feat(app): App::fetch chunk mode + markdown serializer
(fb-35)
Chunk mode with ±N context. doc / span modes return placeholder
errors (filled by subsequent tasks). fmt_canonical_to_markdown
helper introduced now since doc mode (Task 4) consumes it.
Errors are typed StructuredError so classify preserves
chunk_not_found / doc_not_found through the wire layer.
Adds SqliteStore::list_chunk_ids_for_doc so the facade can derive
±N neighbors without leaking direct rusqlite usage into kebab-app.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
crates/kebab-app/src/fetch.rs | 264 ++++++++++++++++++++
crates/kebab-app/src/lib.rs | 2 +
crates/kebab-app/tests/fetch_integration.rs | 85 +++++++
crates/kebab-store-sqlite/src/documents.rs | 36 +++
4 files changed, 387 insertions(+)
create mode 100644 crates/kebab-app/src/fetch.rs
create mode 100644 crates/kebab-app/tests/fetch_integration.rs
diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs
new file mode 100644
index 0000000..5d1dfeb
--- /dev/null
+++ b/crates/kebab-app/src/fetch.rs
@@ -0,0 +1,264 @@
+//! p9-fb-35 verbatim fetch implementation.
+//!
+//! [`App::fetch`] is the facade entry point. It dispatches on
+//! [`FetchQuery`] variants:
+//!
+//! - `Chunk(id)` — return the chunk row from `chunks.text`, optionally
+//! with ±N surrounding chunks (`FetchOpts::context`).
+//! - `Doc(id)` — return the entire document re-serialized to markdown.
+//! (Implemented in Task 4.)
+//! - `Span { doc_id, line_start, line_end }` — return a contiguous line
+//! slice. (Implemented in Task 5.)
+//!
+//! Errors are surfaced as [`StructuredError`] (anyhow-friendly wrapper
+//! around `ErrorV1`) so the CLI / MCP wire layer's `classify` keeps the
+//! typed `code` (`chunk_not_found` / `doc_not_found` /
+//! `span_not_supported`) instead of falling through to `code =
+//! "generic"`.
+
+use anyhow::Result;
+use time::OffsetDateTime;
+
+use kebab_core::{
+ Block, CanonicalDocument, Chunk, ChunkId, DocumentId, DocumentStore, FetchKind, FetchOpts,
+ FetchQuery, FetchResult,
+};
+
+use crate::App;
+use crate::error_wire::{ERROR_V1_ID, ErrorV1, StructuredError};
+use crate::staleness::compute_stale;
+
+impl App {
+    /// p9-fb-35: verbatim fetch facade. Dispatches on the [`FetchQuery`]
+    /// variant and returns text from `chunks.text` / `CanonicalDocument`.
+    /// Chunk-mode failures surface as `StructuredError(ErrorV1)`
+    /// (`chunk_not_found` / `doc_not_found`); doc and span modes are
+    /// still placeholders that fail with an untyped error.
+    pub fn fetch(&self, query: FetchQuery, opts: FetchOpts) -> Result<FetchResult> {
+        match query {
+            FetchQuery::Chunk(id) => fetch_chunk(self, id, opts),
+            FetchQuery::Doc(id) => fetch_doc(self, id, opts),
+            FetchQuery::Span {
+                doc_id,
+                line_start,
+                line_end,
+            } => fetch_span(self, doc_id, line_start, line_end, opts),
+        }
+    }
+}
+
+/// p9-fb-35 chunk mode: returns chunk `id` plus up to `opts.context` neighbors
+/// per side; a missing chunk / parent doc yields a typed `StructuredError`.
+fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
+    let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
+        .ok_or_else(|| {
+            anyhow::Error::new(StructuredError(ErrorV1 {
+                schema_version: ERROR_V1_ID.to_string(),
+                code: "chunk_not_found".to_string(),
+                message: format!("chunk_id '{}' not found", id.0),
+                details: serde_json::Value::Null,
+                hint: None,
+            }))
+        })?;
+
+    let doc_id = target.doc_id.clone();
+    let doc =
+        <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &doc_id)?
+            .ok_or_else(|| {
+                anyhow::Error::new(StructuredError(ErrorV1 {
+                    schema_version: ERROR_V1_ID.to_string(),
+                    code: "doc_not_found".to_string(),
+                    message: format!(
+                        "doc_id '{}' (parent of chunk '{}') not found",
+                        doc_id.0, id.0
+                    ),
+                    details: serde_json::Value::Null,
+                    hint: None,
+                }))
+            })?;
+
+    let (context_before, context_after) = match opts.context {
+        Some(n) if n > 0 => surrounding_chunks(app, &doc_id, &id, n)?,
+        _ => (Vec::new(), Vec::new()),
+    };
+
+    // `updated_at` doubles as the reported indexed-at stamp and the staleness input.
+    let indexed_at = doc_metadata_updated_at(&doc);
+    let now = OffsetDateTime::now_utc();
+    let stale = compute_stale(indexed_at, now, app.config.search.stale_threshold_days);
+
+    Ok(FetchResult {
+        kind: FetchKind::Chunk,
+        doc_id: doc.doc_id.clone(),
+        doc_path: doc.workspace_path.clone(),
+        indexed_at,
+        stale,
+        chunk: Some(target),
+        context_before,
+        context_after,
+        text: None,
+        line_start: None,
+        line_end: None,
+        effective_end: None,
+        truncated: false,
+    })
+}
+
+/// Doc mode placeholder: always fails with an untyped error until Task 4 lands.
+fn fetch_doc(_app: &App, _id: DocumentId, _opts: FetchOpts) -> Result<FetchResult> {
+    anyhow::bail!("fetch_doc not yet implemented")
+}
+
+/// Span mode placeholder: always fails with an untyped error until Task 5 lands.
+fn fetch_span(
+    _app: &App,
+    _id: DocumentId,
+    _line_start: u32,
+    _line_end: u32,
+    _opts: FetchOpts,
+) -> Result<FetchResult> {
+    anyhow::bail!("fetch_span not yet implemented")
+}
+
+/// p9-fb-35: return the `(before, after)` neighbors of `target` within
+/// `doc_id`, in the order produced by `list_chunks_in_order`. `n` caps each
+/// side independently, so at most `2n` neighbors come back in total.
+fn surrounding_chunks(
+    app: &App,
+    doc_id: &DocumentId,
+    target: &ChunkId,
+    n: u32,
+) -> Result<(Vec<Chunk>, Vec<Chunk>)> {
+    let mut chunks = list_chunks_in_order(app, doc_id)?;
+    let idx = chunks.iter().position(|c| c.chunk_id == *target);
+    let idx = idx.ok_or_else(|| anyhow::anyhow!("chunk not found in doc chunk list"))?;
+    let n = usize::try_from(n).unwrap_or(usize::MAX);
+    let lo = idx.saturating_sub(n);
+    let hi = idx.saturating_add(n).saturating_add(1).min(chunks.len());
+    // Split the owned list instead of cloning slices: tail first, then drop the target.
+    chunks.truncate(hi);
+    let after = chunks.split_off(idx + 1);
+    chunks.truncate(idx);
+    let before = chunks.split_off(lo);
+    Ok((before, after))
+}
+
+/// p9-fb-35: load every chunk of `doc_id` in the order returned by
+/// `SqliteStore::list_chunk_ids_for_doc`; ids with no matching row are
+/// skipped. NOTE(review): that order is `(created_at, chunk_id)` — confirm
+/// `chunk_id` sorts in chunker-emit order, else neighbors are not adjacent.
+fn list_chunks_in_order(app: &App, doc_id: &DocumentId) -> Result<Vec<Chunk>> {
+    let chunk_ids = app.sqlite.list_chunk_ids_for_doc(doc_id)?;
+    let mut out: Vec<Chunk> = Vec::with_capacity(chunk_ids.len());
+    for cid in chunk_ids {
+        if let Some(chunk) =
+            <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &cid)?
+        {
+            out.push(chunk);
+        }
+    }
+    Ok(out)
+}
+
+/// Timestamp used for both `indexed_at` and staleness: the doc's `metadata.updated_at`.
+fn doc_metadata_updated_at(doc: &CanonicalDocument) -> OffsetDateTime {
+    doc.metadata.updated_at
+}
+
+/// p9-fb-35: serialize a `CanonicalDocument` back to markdown, blocks
+/// separated by a blank line. Best-effort round-trip: inline styling is
+/// lost because only the flattened `text` fields are emitted, and audio
+/// refs become a stub marker. Code fences grow past any backtick run in
+/// the code. Intended for Task 4 (doc mode) and Task 5 (span mode).
+//
+// The first caller lands in Task 4 (`fetch_doc`); silence the
+// stop-gap dead-code warning until then so this Task 3 commit lands
+// with a clean clippy run.
+#[allow(dead_code)]
+pub(crate) fn fmt_canonical_to_markdown(doc: &CanonicalDocument) -> String {
+    let mut out = String::with_capacity(1024);
+    for (i, block) in doc.blocks.iter().enumerate() {
+        if i > 0 {
+            out.push_str("\n\n");
+        }
+        match block {
+            Block::Heading(h) => {
+                let level = h.level.clamp(1, 6) as usize;
+                out.push_str(&"#".repeat(level));
+                out.push(' ');
+                out.push_str(&h.text);
+            }
+            Block::Paragraph(t) => out.push_str(&t.text),
+            Block::Quote(t) => {
+                // Prefix every line with `> ` so block-quote round-trips.
+                for (li, line) in t.text.split('\n').enumerate() {
+                    if li > 0 {
+                        out.push('\n');
+                    }
+                    out.push_str("> ");
+                    out.push_str(line);
+                }
+            }
+            Block::List(l) => {
+                for (idx, item) in l.items.iter().enumerate() {
+                    if idx > 0 {
+                        out.push('\n');
+                    }
+                    if l.ordered {
+                        out.push_str(&format!("{}. {}", idx + 1, item.text));
+                    } else {
+                        out.push_str(&format!("- {}", item.text));
+                    }
+                }
+            }
+            Block::Code(c) => {
+                // The fence must be longer than any backtick run inside the code.
+                let run = c.code.split(|ch: char| ch != '`').map(str::len).max().unwrap_or(0);
+                let fence = "`".repeat(run.max(2) + 1);
+                out.push_str(&fence);
+                if let Some(lang) = &c.lang {
+                    out.push_str(lang);
+                }
+                out.push('\n');
+                out.push_str(&c.code);
+                if !c.code.ends_with('\n') {
+                    out.push('\n');
+                }
+                out.push_str(&fence);
+            }
+            Block::Table(t) => {
+                out.push_str(&t.headers.join(" | "));
+                out.push('\n');
+                // Markdown table separator — N copies of `---|` (renderer
+                // tolerates trailing pipe).
+                out.push_str(&"---|".repeat(t.headers.len()));
+                for row in &t.rows {
+                    out.push('\n');
+                    out.push_str(&row.join(" | "));
+                }
+            }
+            Block::ImageRef(img) => {
+                out.push_str(&format!("![{}]({})", img.alt, img.src));
+            }
+            Block::AudioRef(_a) => {
+                // Canonical doc carries the transcript on AudioRefBlock,
+                // but markdown has no native audio embed. Emit a stub
+                // marker so the agent sees something ran here.
+                out.push_str("(audio reference)");
+            }
+        }
+    }
+    out
+}
+
+/// p9-fb-35: free-function entry for CLI / MCP. Mirrors the
+/// `*_with_config` pattern documented in the kebab-app crate root —
+/// `kebab-cli` calls this so a `--config <path>` flag is honored.
+#[doc(hidden)]
+pub fn fetch_with_config(
+    config: kebab_config::Config,
+    query: FetchQuery,
+    opts: FetchOpts,
+) -> Result<FetchResult> {
+    App::open_with_config(config)?.fetch(query, opts)
+}
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 66c38ad..a6035a6 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -60,6 +60,7 @@ pub mod doctor_signal;
pub mod error_signal;
pub mod error_wire;
pub mod external;
+pub mod fetch;
pub mod ingest_progress;
pub mod logging;
pub mod reset;
@@ -70,6 +71,7 @@ pub use app::{App, SearchResponse};
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
pub use reset::{ResetReport, ResetScope};
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
+pub use fetch::fetch_with_config;
pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config};
pub use staleness::{compute_stale, mark_stale_in_place};
diff --git a/crates/kebab-app/tests/fetch_integration.rs b/crates/kebab-app/tests/fetch_integration.rs
new file mode 100644
index 0000000..8ca9a57
--- /dev/null
+++ b/crates/kebab-app/tests/fetch_integration.rs
@@ -0,0 +1,85 @@
+//! p9-fb-35 App::fetch integration tests.
+
+mod common;
+
+use kebab_app::App;
+use kebab_core::{FetchKind, FetchOpts, FetchQuery};
+
+fn open(env: &common::TestEnv) -> App {
+ env.app()
+}
+
+#[test]
+fn fetch_chunk_returns_target_only_when_no_context() {
+    let env = common::TestEnv::new();
+    common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
+    let app = open(&env);
+
+    // Find a chunk via search to obtain its id.
+    let q = kebab_core::SearchQuery {
+        text: "First".to_string(),
+        mode: kebab_core::SearchMode::Lexical,
+        k: 1,
+        filters: kebab_core::SearchFilters::default(),
+    };
+    let hits = app.search(q).unwrap();
+    let chunk_id = hits[0].chunk_id.clone();
+
+    let result = app
+        .fetch(FetchQuery::Chunk(chunk_id.clone()), FetchOpts::default())
+        .unwrap();
+    assert_eq!(result.kind, FetchKind::Chunk);
+    assert!(result.chunk.as_ref().is_some_and(|c| c.chunk_id == chunk_id), "wrong chunk");
+    assert!(result.context_before.is_empty());
+    assert!(result.context_after.is_empty());
+    assert!(!result.truncated);
+}
+
+#[test]
+fn fetch_chunk_with_context_returns_neighbors() {
+    let env = common::TestEnv::new();
+    let body = "# H1\n\nA1\n\n# H2\n\nA2\n\n# H3\n\nA3\n\n# H4\n\nA4\n\n# H5\n\nA5\n";
+    common::ingest_md(&env, "multi.md", body);
+    let app = env.app();
+
+    let q = kebab_core::SearchQuery {
+        text: "A3".to_string(),
+        mode: kebab_core::SearchMode::Lexical,
+        k: 1,
+        filters: kebab_core::SearchFilters::default(),
+    };
+    let hits = app.search(q).unwrap();
+    let chunk_id = hits[0].chunk_id.clone();
+
+    let opts = FetchOpts { context: Some(2), max_tokens: None };
+    let result = app.fetch(FetchQuery::Chunk(chunk_id.clone()), opts).unwrap();
+    assert_eq!(result.kind, FetchKind::Chunk);
+    assert!(result.chunk.is_some());
+
+    // `context` caps each side independently, and the target itself is
+    // returned in `chunk`, never echoed among its neighbors.
+    let (before, after) = (&result.context_before, &result.context_after);
+    assert!(!before.is_empty() || !after.is_empty(), "at least one neighbor expected");
+    assert!(before.len() <= 2 && after.len() <= 2, "context capped at +-2 per side");
+    assert!(
+        before.iter().chain(after).all(|c| c.chunk_id != chunk_id),
+        "target chunk must not appear among its own neighbors"
+    );
+}
+
+/// An unknown chunk id must fail with the typed `StructuredError`, not just
+/// with a message that happens to mention the id.
+#[test]
+fn fetch_chunk_unknown_id_returns_chunk_not_found() {
+    let env = common::TestEnv::new();
+    let app = env.app();
+    let missing = FetchQuery::Chunk(kebab_core::ChunkId("nonexistent-id".to_string()));
+    let err = app.fetch(missing, FetchOpts::default()).unwrap_err();
+    // Check the error type first; the message check below passes on the id alone.
+    assert!(err.downcast_ref::<kebab_app::StructuredError>().is_some(), "untyped error: {err}");
+    let msg = err.to_string();
+    assert!(
+        msg.contains("chunk_not_found") || msg.contains("nonexistent-id"),
+        "expected chunk_not_found error, got: {msg}"
+    );
+}
diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs
index ac59939..ac6b44f 100644
--- a/crates/kebab-store-sqlite/src/documents.rs
+++ b/crates/kebab-store-sqlite/src/documents.rs
@@ -375,6 +375,42 @@ impl kebab_core::DocumentStore for SqliteStore {
}
}
+impl SqliteStore {
+    /// p9-fb-35: list the `chunk_id`s stored for `doc_id`, ordered by
+    /// `created_at` and then `chunk_id`. `put_chunks` writes one
+    /// transaction with a single `created_at` snapshot, so the secondary
+    /// `chunk_id` sort is what actually orders neighbors within a single
+    /// re-ingest; the primary `created_at` sort distinguishes successive
+    /// re-ingests if they ever co-exist in the table (they shouldn't —
+    /// `put_chunks` deletes the old rows first — but the ordering is
+    /// still well-defined under that scenario).
+    ///
+    /// Used by `kebab-app::fetch::surrounding_chunks` to derive ±N
+    /// neighbors around a target chunk without leaking SQL into the
+    /// facade crate. Any rusqlite failure is returned as a `StoreError`.
+    pub fn list_chunk_ids_for_doc(
+        &self,
+        doc_id: &kebab_core::DocumentId,
+    ) -> Result<Vec<kebab_core::ChunkId>> {
+        let conn = self.read_conn();
+        let mut stmt = conn
+            .prepare(
+                "SELECT chunk_id FROM chunks
+                 WHERE doc_id = ?
+                 ORDER BY created_at ASC, chunk_id ASC",
+            )
+            .map_err(StoreError::from)?;
+        let rows = stmt
+            .query_map(params![doc_id.0], |r| r.get::<_, String>(0))
+            .map_err(StoreError::from)?;
+        let ids: Vec<kebab_core::ChunkId> = rows
+            .map(|r| r.map(kebab_core::ChunkId))
+            .collect::<rusqlite::Result<Vec<_>>>()
+            .map_err(StoreError::from)?;
+        Ok(ids)
+    }
+}
+
// ── Internal row + (de)serialization helpers ─────────────────────────────
struct DocumentRow {