From 610d29f0534440939950d2720d41af7a38032b10 Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Sat, 9 May 2026 23:44:51 +0900 Subject: [PATCH] feat(app): App::fetch chunk mode + markdown serializer (fb-35) Chunk mode + +-N context. doc / span modes return placeholder errors (filled by subsequent tasks). fmt_canonical_to_markdown helper introduced now since doc mode (Task 4) consumes it. Errors are typed StructuredError so classify preserves chunk_not_found / doc_not_found through the wire layer. Adds SqliteStore::list_chunk_ids_for_doc so the facade can derive +-N neighbors without leaking direct rusqlite usage into kebab-app. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/fetch.rs | 264 ++++++++++++++++++++ crates/kebab-app/src/lib.rs | 2 + crates/kebab-app/tests/fetch_integration.rs | 85 +++++++ crates/kebab-store-sqlite/src/documents.rs | 36 +++ 4 files changed, 387 insertions(+) create mode 100644 crates/kebab-app/src/fetch.rs create mode 100644 crates/kebab-app/tests/fetch_integration.rs diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs new file mode 100644 index 0000000..5d1dfeb --- /dev/null +++ b/crates/kebab-app/src/fetch.rs @@ -0,0 +1,264 @@ +//! p9-fb-35 verbatim fetch implementation. +//! +//! [`App::fetch`] is the facade entry point. It dispatches on +//! [`FetchQuery`] variants: +//! +//! - `Chunk(id)` — return the chunk row from `chunks.text`, optionally +//! with ±N surrounding chunks (`FetchOpts::context`). +//! - `Doc(id)` — return the entire document re-serialized to markdown. +//! (Implemented in Task 4.) +//! - `Span { doc_id, line_start, line_end }` — return a contiguous line +//! slice. (Implemented in Task 5.) +//! +//! Errors are surfaced as [`StructuredError`] (anyhow-friendly wrapper +//! around `ErrorV1`) so the CLI / MCP wire layer's `classify` keeps the +//! typed `code` (`chunk_not_found` / `doc_not_found` / +//! `span_not_supported`) instead of falling through to `code = +//! "generic"`. 
+ +use anyhow::Result; +use time::OffsetDateTime; + +use kebab_core::{ + Block, CanonicalDocument, Chunk, ChunkId, DocumentId, DocumentStore, FetchKind, FetchOpts, + FetchQuery, FetchResult, +}; + +use crate::App; +use crate::error_wire::{ERROR_V1_ID, ErrorV1, StructuredError}; +use crate::staleness::compute_stale; + +impl App { + /// p9-fb-35: verbatim fetch facade. Returns text from + /// `chunks.text` / `CanonicalDocument` based on the requested + /// mode. Chunk mode reports a missing chunk or parent document as + /// `StructuredError(ErrorV1)` with code `chunk_not_found` / `doc_not_found` + /// so the wire-layer classifier can preserve the typed code; doc and span modes are placeholders that currently fail with an untyped error. + pub fn fetch(&self, query: FetchQuery, opts: FetchOpts) -> Result { + match query { + FetchQuery::Chunk(id) => fetch_chunk(self, id, opts), + FetchQuery::Doc(id) => fetch_doc(self, id, opts), + FetchQuery::Span { + doc_id, + line_start, + line_end, + } => fetch_span(self, doc_id, line_start, line_end, opts), + } + } +} + +fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result { + let target = ::get_chunk(&app.sqlite, &id)? + .ok_or_else(|| { + anyhow::Error::new(StructuredError(ErrorV1 { + schema_version: ERROR_V1_ID.to_string(), + code: "chunk_not_found".to_string(), + message: format!("chunk_id '{}' not found", id.0), + details: serde_json::Value::Null, + hint: None, + })) + })?; + + let doc_id = target.doc_id.clone(); + let doc = + ::get_document(&app.sqlite, &doc_id)?
+ .ok_or_else(|| { + anyhow::Error::new(StructuredError(ErrorV1 { + schema_version: ERROR_V1_ID.to_string(), + code: "doc_not_found".to_string(), + message: format!( + "doc_id '{}' (parent of chunk '{}') not found", + doc_id.0, id.0 + ), + details: serde_json::Value::Null, + hint: None, + })) + })?; + + let (context_before, context_after) = match opts.context { + Some(n) if n > 0 => surrounding_chunks(app, &doc_id, &id, n)?, + _ => (Vec::new(), Vec::new()), + }; + + let now = OffsetDateTime::now_utc(); + let stale = compute_stale( + doc_metadata_updated_at(&doc), + now, + app.config.search.stale_threshold_days, + ); + + Ok(FetchResult { + kind: FetchKind::Chunk, + doc_id: doc.doc_id.clone(), + doc_path: doc.workspace_path.clone(), + indexed_at: doc_metadata_updated_at(&doc), + stale, + chunk: Some(target), + context_before, + context_after, + text: None, + line_start: None, + line_end: None, + effective_end: None, + truncated: false, + }) +} + +fn fetch_doc(_app: &App, _id: DocumentId, _opts: FetchOpts) -> Result { + // Placeholder until Task 4 implements doc mode: always fails with an untyped `anyhow` error. + anyhow::bail!("fetch_doc not yet implemented") +} + +fn fetch_span( + _app: &App, + _id: DocumentId, + _line_start: u32, + _line_end: u32, + _opts: FetchOpts, +) -> Result { + // Placeholder until Task 5 implements span mode: always fails with an untyped `anyhow` error. + anyhow::bail!("fetch_span not yet implemented") +} + +/// p9-fb-35: list chunks for a document in ordinal order, return +/// `(before, after)` slices around the target chunk_id. `n` caps each +/// side independently — the worst case is `2n` total neighbors when +/// the target sits in the middle of the doc. Fails with an untyped `anyhow` error if the target id is absent from the doc's chunk list.
+fn surrounding_chunks( + app: &App, + doc_id: &DocumentId, + target: &ChunkId, + n: u32, +) -> Result<(Vec, Vec)> { + let chunks = list_chunks_in_order(app, doc_id)?; + let target_idx = chunks + .iter() + .position(|c| c.chunk_id == *target) + .ok_or_else(|| anyhow::anyhow!("chunk not found in doc chunk list"))?; + let n = n as usize; + let lo = target_idx.saturating_sub(n); + let hi = (target_idx + n + 1).min(chunks.len()); + let before: Vec = chunks[lo..target_idx].to_vec(); + let after: Vec = chunks[target_idx + 1..hi].to_vec(); + Ok((before, after)) +} + +/// p9-fb-35: chunks have no explicit ordinal column, so the underlying +/// helper sorts by `(created_at, chunk_id)`, which is intended to match the +/// order produced by the chunker (NOTE(review): confirm `chunk_id` sorts in document order). The actual SQL lives +/// inside `kebab-store-sqlite` (`SqliteStore::list_chunk_ids_for_doc`) +/// to keep the facade crate free of direct rusqlite usage. Each listed id is then loaded with its own `get_chunk` call; ids with no row are skipped. +fn list_chunks_in_order(app: &App, doc_id: &DocumentId) -> Result> { + let chunk_ids = app.sqlite.list_chunk_ids_for_doc(doc_id)?; + let mut out: Vec = Vec::with_capacity(chunk_ids.len()); + for cid in chunk_ids { + if let Some(chunk) = + ::get_chunk(&app.sqlite, &cid)? + { + out.push(chunk); + } + } + Ok(out) +} + +fn doc_metadata_updated_at(doc: &CanonicalDocument) -> OffsetDateTime { + doc.metadata.updated_at +} + +/// p9-fb-35: serialize a `CanonicalDocument` back to markdown. Best- +/// effort round-trip — inline-styled spans (Strong/Emph children) +/// flatten to plain text via the already-flattened `TextBlock.text` +/// field. Good enough for an agent reading verbatim context. Used by +/// Task 4 (doc mode) and Task 5 (span mode). +// +// The first caller lands in Task 4 (`fetch_doc`); silence the +// stop-gap dead-code warning until then so this Task 3 commit lands +// with a clean clippy run.
+#[allow(dead_code)] +pub(crate) fn fmt_canonical_to_markdown(doc: &CanonicalDocument) -> String { + let mut out = String::with_capacity(1024); + for (i, block) in doc.blocks.iter().enumerate() { + if i > 0 { + out.push_str("\n\n"); + } + match block { + Block::Heading(h) => { + let level = h.level.clamp(1, 6) as usize; + for _ in 0..level { + out.push('#'); + } + out.push(' '); + out.push_str(&h.text); + } + Block::Paragraph(t) => out.push_str(&t.text), + Block::Quote(t) => { + // Prefix every `\n`-separated line with `> ` so a multi-line block quote round-trips. + for (li, line) in t.text.split('\n').enumerate() { + if li > 0 { + out.push('\n'); + } + out.push_str("> "); + out.push_str(line); + } + } + Block::List(l) => { + for (idx, item) in l.items.iter().enumerate() { + if idx > 0 { + out.push('\n'); + } + if l.ordered { + out.push_str(&format!("{}. {}", idx + 1, item.text)); + } else { + out.push_str(&format!("- {}", item.text)); + } + } + } + Block::Code(c) => { + out.push_str("```"); + if let Some(lang) = &c.lang { + out.push_str(lang); + } + out.push('\n'); + out.push_str(&c.code); + if !c.code.ends_with('\n') { + out.push('\n'); + } + out.push_str("```"); + } + Block::Table(t) => { + out.push_str(&t.headers.join(" | ")); + out.push('\n'); + // Markdown table separator — one `---|` per header column is + // acceptable for a verbatim re-serialization (renderer + // tolerates trailing pipe). + out.push_str(&"---|".repeat(t.headers.len())); + for row in &t.rows { + out.push('\n'); + out.push_str(&row.join(" | ")); + } + } + Block::ImageRef(img) => { + out.push_str(&format!("![{}]({})", img.alt, img.src)); + } + Block::AudioRef(_a) => { + // Canonical doc carries the transcript on AudioRefBlock, + // but markdown has no native audio embed. Emit a fixed stub + // marker so the reader can see an audio block was present; none of the block's fields are emitted. + out.push_str("(audio reference)"); + } + } + } + out +} + +/// p9-fb-35: free-function entry for CLI / MCP.
Mirrors the +/// `*_with_config` pattern documented in the kebab-app crate root — +/// `kebab-cli` calls this so a `--config` flag is honored. +#[doc(hidden)] +pub fn fetch_with_config( + config: kebab_config::Config, + query: FetchQuery, + opts: FetchOpts, +) -> Result { + App::open_with_config(config)?.fetch(query, opts) +} diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 66c38ad..a6035a6 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -60,6 +60,7 @@ pub mod doctor_signal; pub mod error_signal; pub mod error_wire; pub mod external; +pub mod fetch; pub mod ingest_progress; pub mod logging; pub mod reset; @@ -70,6 +71,7 @@ pub use app::{App, SearchResponse}; pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown}; pub use reset::{ResetReport, ResetScope}; pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify}; +pub use fetch::fetch_with_config; pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config}; pub use staleness::{compute_stale, mark_stale_in_place}; diff --git a/crates/kebab-app/tests/fetch_integration.rs b/crates/kebab-app/tests/fetch_integration.rs new file mode 100644 index 0000000..8ca9a57 --- /dev/null +++ b/crates/kebab-app/tests/fetch_integration.rs @@ -0,0 +1,85 @@ +//! p9-fb-35 App::fetch integration tests. + +mod common; + +use kebab_app::App; +use kebab_core::{FetchKind, FetchOpts, FetchQuery}; + +fn open(env: &common::TestEnv) -> App { + env.app() +} + +#[test] +fn fetch_chunk_returns_target_only_when_no_context() { + let env = common::TestEnv::new(); + common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n"); + let app = open(&env); + + // Find a chunk via search to obtain its id.
+ let q = kebab_core::SearchQuery { + text: "First".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 1, + filters: kebab_core::SearchFilters::default(), + }; + let hits = app.search(q).unwrap(); + let chunk_id = hits[0].chunk_id.clone(); + + let result = app + .fetch(FetchQuery::Chunk(chunk_id), FetchOpts::default()) + .unwrap(); + assert_eq!(result.kind, FetchKind::Chunk); + assert!(result.chunk.is_some(), "target chunk populated"); + assert!(result.context_before.is_empty()); + assert!(result.context_after.is_empty()); + assert!(!result.truncated); +} + +#[test] +fn fetch_chunk_with_context_returns_neighbors() { + let env = common::TestEnv::new(); + let body = "# H1\n\nA1\n\n# H2\n\nA2\n\n# H3\n\nA3\n\n# H4\n\nA4\n\n# H5\n\nA5\n"; + common::ingest_md(&env, "multi.md", body); + let app = env.app(); + + let q = kebab_core::SearchQuery { + text: "A3".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 1, + filters: kebab_core::SearchFilters::default(), + }; + let hits = app.search(q).unwrap(); + let chunk_id = hits[0].chunk_id.clone(); + + let result = app + .fetch( + FetchQuery::Chunk(chunk_id), + FetchOpts { + context: Some(2), + max_tokens: None, + }, + ) + .unwrap(); + assert_eq!(result.kind, FetchKind::Chunk); + assert!(result.chunk.is_some()); + let total = result.context_before.len() + result.context_after.len(); + assert!(total >= 1, "at least one neighbor expected"); + assert!(total <= 4, "context capped at +-2 ⇒ max 4 neighbors"); +} + +#[test] +fn fetch_chunk_unknown_id_returns_chunk_not_found() { + let env = common::TestEnv::new(); + let app = env.app(); + let err = app + .fetch( + FetchQuery::Chunk(kebab_core::ChunkId("nonexistent-id".to_string())), + FetchOpts::default(), + ) + .unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("chunk_not_found") || msg.contains("nonexistent-id"), + "expected chunk_not_found error, got: {msg}" + ); +} diff --git a/crates/kebab-store-sqlite/src/documents.rs
b/crates/kebab-store-sqlite/src/documents.rs index ac59939..ac6b44f 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -375,6 +375,42 @@ impl kebab_core::DocumentStore for SqliteStore { } } +impl SqliteStore { + /// p9-fb-35: list `chunk_id`s for a document sorted by `(created_at, chunk_id)`, + /// intended to be chunker-emit order (NOTE(review): holds only if `chunk_id` sorts in emit order). `put_chunks` writes one transaction with a + /// single `created_at` snapshot, so the secondary `chunk_id` sort + /// is what actually orders neighbors within a single re-ingest; + /// the primary `created_at` sort distinguishes successive + /// re-ingests if they ever co-exist in the table (they shouldn't — + /// `put_chunks` deletes the old rows first — but the ordering is + /// still well-defined under that scenario). + /// + /// Called by `kebab-app::fetch::list_chunks_in_order` (on behalf of `surrounding_chunks`) to derive ±N + /// neighbors around a target chunk without leaking SQL into the + /// facade crate. + pub fn list_chunk_ids_for_doc( + &self, + doc_id: &kebab_core::DocumentId, + ) -> Result> { + let conn = self.read_conn(); + let mut stmt = conn + .prepare( + "SELECT chunk_id FROM chunks + WHERE doc_id = ? + ORDER BY created_at ASC, chunk_id ASC", + ) + .map_err(StoreError::from)?; + let rows = stmt + .query_map(params![doc_id.0], |r| r.get::<_, String>(0)) + .map_err(StoreError::from)?; + let ids: Vec = rows + .map(|r| r.map(kebab_core::ChunkId)) + .collect::>>() + .map_err(StoreError::from)?; + Ok(ids) + } +} + // ── Internal row + (de)serialization helpers ───────────────────────────── struct DocumentRow {