diff --git a/Cargo.lock b/Cargo.lock index ee07125..4815907 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4166,6 +4166,7 @@ dependencies = [ "tracing-appender", "tracing-subscriber", "unicode-normalization", + "uuid", "wiremock", ] diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 5f7a42f..445b687 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -46,9 +46,8 @@ use kebab_core::{ use kebab_embed_local::FastembedEmbedder; use kebab_llm_local::OllamaLanguageModel; use kebab_parse_code::{ - CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, - JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, - TypescriptAstExtractor, + CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, + KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor, }; use kebab_parse_image::ImageExtractor; use kebab_parse_pdf::PdfTextExtractor; @@ -242,15 +241,15 @@ impl App { // kebab-nli construction. Failure (`?`) surfaces as a user- // facing error at App boot — never a panic in the pipeline's // `expect("verifier must be Some when nli_threshold > 0.0")`. - let pipeline_verifier: Option> = - if config.rag.nli_threshold > 0.0 { - let v = kebab_nli::OnnxNliVerifier::new(&config).context( - "kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)", - )?; - Some(Arc::new(v)) - } else { - None - }; + let pipeline_verifier: Option> = if config.rag.nli_threshold + > 0.0 + { + let v = kebab_nli::OnnxNliVerifier::new(&config) + .context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?; + Some(Arc::new(v)) + } else { + None + }; Ok(Self { config, sqlite: Arc::new(sqlite), @@ -350,7 +349,9 @@ impl App { // so other in-flight searches can use the cache concurrently. 
drop(guard); let hits = self.search_uncached(query)?; - let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let mut guard = cache + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); guard.put(key, hits.clone()); Ok(hits) } @@ -430,11 +431,7 @@ impl App { /// /// `SearchResponse.next_cursor` and `truncated` are independent /// signals — see `SearchResponse` doc for details. - pub fn search_with_opts( - &self, - query: SearchQuery, - opts: SearchOpts, - ) -> Result { + pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result { use crate::cursor; let corpus_revision = self.sqlite.corpus_revision().to_string(); @@ -519,8 +516,7 @@ impl App { // Apply offset + k_effective truncation (mirrors non-trace path). let drop_n = offset.min(traced_hits.len()); traced_hits.drain(..drop_n); - let mut hits: Vec = - traced_hits.into_iter().take(k_effective).collect(); + let mut hits: Vec = traced_hits.into_iter().take(k_effective).collect(); // Snippet truncation if opts.snippet_chars set (mirror non-trace path). if opts.snippet_chars.is_some() { @@ -551,8 +547,7 @@ impl App { // Skip offset. let drop_n = offset.min(all_hits.len()); all_hits.drain(..drop_n); - let mut hits: Vec = - all_hits.into_iter().take(k_effective).collect(); + let mut hits: Vec = all_hits.into_iter().take(k_effective).collect(); // Apply snippet_chars override if shorter than what the // retriever returned (retriever already honored @@ -573,15 +568,11 @@ impl App { // Step 1: shorten snippets progressively to a 60-char floor. 
const SNIPPET_FLOOR: usize = 60; let mut current_snippet_cap = snippet_chars; - while estimate_chars(&hits) > max_chars - && current_snippet_cap > SNIPPET_FLOOR - { - current_snippet_cap = - (current_snippet_cap / 2).max(SNIPPET_FLOOR); + while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR { + current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR); for h in &mut hits { if h.snippet.chars().count() > current_snippet_cap { - h.snippet = - trim_to_chars(&h.snippet, current_snippet_cap); + h.snippet = trim_to_chars(&h.snippet, current_snippet_cap); truncated = true; } } @@ -651,8 +642,7 @@ impl App { retriever: Arc, llm: Arc, ) -> RagPipeline { - let pipeline = - RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone()); + let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone()); match &self.pipeline_verifier { Some(v) => pipeline.with_verifier(v.clone()), None => pipeline, @@ -723,12 +713,7 @@ impl App { /// returns; on persistence error, the answer is still returned /// (don't lose the user's compute) but the error is logged so /// the operator notices. - pub fn ask_with_session( - &self, - session_id: &str, - query: &str, - opts: AskOpts, - ) -> Result { + pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result { use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow}; use std::time::{SystemTime, UNIX_EPOCH}; @@ -766,13 +751,8 @@ impl App { let retriever = self.build_retriever(opts.mode)?; let llm = self.llm()?; let pipeline = self.build_pipeline(retriever, llm); - let answer = pipeline.ask_with_history( - query, - history, - session_id.to_string(), - next_index, - opts, - )?; + let answer = + pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?; // Auto-create the session header on first use. Title from // the first question (≤40 chars after trim). 
@@ -813,7 +793,8 @@ impl App { turn_index: next_index, question: query.to_string(), answer: answer.answer.clone(), - citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()), + citations_json: serde_json::to_string(&answer.citations) + .unwrap_or_else(|_| "[]".to_string()), created_at: now_unix, }; if let Err(e) = self.sqlite.append_turn(&turn_row) { @@ -848,8 +829,7 @@ impl App { return Ok(Some(e.clone())); } let emb: Arc = Arc::new( - FastembedEmbedder::new(&self.config) - .context("kb-app: load FastembedEmbedder")?, + FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?, ); // `set` returns Err if another thread won the race; in that case // the loser still returns the (now-cached) winner via `get()`. @@ -925,7 +905,9 @@ impl App { /// clear` admin command). No-op when the cache is disabled. pub fn clear_search_cache(&self) { if let Some(cache) = self.search_cache.as_ref() { - let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let mut guard = cache + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); guard.clear(); } } @@ -946,8 +928,8 @@ impl App { /// git tree) correctly keep `repo: None` — `Metadata.repo` is already /// `None` for those, so the assignment is a no-op. fn backfill_repo(&self, hits: &mut [SearchHit]) { - use std::collections::HashMap; use kebab_core::DocumentId; + use std::collections::HashMap; // doc_id → Option where None means "not found / no repo" let mut cache: HashMap> = HashMap::new(); @@ -956,26 +938,24 @@ impl App { if hit.repo.is_some() { continue; } - let repo_val = cache - .entry(hit.doc_id.clone()) - .or_insert_with(|| { - // Deliberately non-aborting: a failed store lookup for - // one hit must not abort the whole search response. Log - // the error so it's observable rather than silently - // dropped (review #140 round 1). 
- match self.sqlite.get_document(&hit.doc_id) { - Ok(opt) => opt.and_then(|doc| doc.metadata.repo), - Err(e) => { - tracing::warn!( - target: "kebab-app", - doc_id = %hit.doc_id, - error = %e, - "backfill_repo: get_document failed; leaving hit.repo = None" - ); - None - } + let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| { + // Deliberately non-aborting: a failed store lookup for + // one hit must not abort the whole search response. Log + // the error so it's observable rather than silently + // dropped (review #140 round 1). + match self.sqlite.get_document(&hit.doc_id) { + Ok(opt) => opt.and_then(|doc| doc.metadata.repo), + Err(e) => { + tracing::warn!( + target: "kebab-app", + doc_id = %hit.doc_id, + error = %e, + "backfill_repo: get_document failed; leaving hit.repo = None" + ); + None } - }); + } + }); if let Some(r) = repo_val { hit.repo = Some(r.clone()); } @@ -986,10 +966,7 @@ impl App { /// "switch to --mode lexical" error when embeddings are disabled. fn require_embeddings( &self, - ) -> Result<( - Arc, - Arc, - )> { + ) -> Result<(Arc, Arc)> { let emb = self.embedder()?.ok_or_else(|| { anyhow!( "embeddings disabled (config.models.embedding.provider == \"none\" \ @@ -1278,8 +1255,8 @@ mod tests_extractor_dispatch { MediaType::Code("kotlin".into()), MediaType::Code("c".into()), MediaType::Code("cpp".into()), - MediaType::Code("yaml".into()), // registry NOT cover - MediaType::Code("shell".into()), // registry NOT cover + MediaType::Code("yaml".into()), // registry NOT cover + MediaType::Code("shell".into()), // registry NOT cover MediaType::Audio(AudioType::Wav), // registry NOT cover ]; for sample in &samples { diff --git a/crates/kebab-app/src/bulk.rs b/crates/kebab-app/src/bulk.rs index 1971682..8954fbe 100644 --- a/crates/kebab-app/src/bulk.rs +++ b/crates/kebab-app/src/bulk.rs @@ -215,7 +215,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> { .and_then(serde_json::Value::as_u64) .map(|n| n as usize), 
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from), - trace: obj.get("trace").and_then(serde_json::Value::as_bool).unwrap_or(false), + trace: obj + .get("trace") + .and_then(serde_json::Value::as_bool) + .unwrap_or(false), }; Ok(( diff --git a/crates/kebab-app/src/error_signal.rs b/crates/kebab-app/src/error_signal.rs index 7d7ab6b..a1e0907 100644 --- a/crates/kebab-app/src/error_signal.rs +++ b/crates/kebab-app/src/error_signal.rs @@ -10,6 +10,6 @@ pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal}; -pub use kebab_llm_local::LlmError; pub use kebab_config::{ConfigInvalid, ConfigNotFound}; +pub use kebab_llm_local::LlmError; pub use kebab_store_sqlite::NotIndexed; diff --git a/crates/kebab-app/src/error_wire.rs b/crates/kebab-app/src/error_wire.rs index c0397cf..d2d5e32 100644 --- a/crates/kebab-app/src/error_wire.rs +++ b/crates/kebab-app/src/error_wire.rs @@ -172,7 +172,10 @@ mod tests { }); let v1 = classify(&err, false); assert_eq!(v1.code, "config_invalid"); - assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml")); + assert_eq!( + v1.details.get("path").and_then(|p| p.as_str()), + Some("/tmp/x.toml") + ); assert!(v1.hint.is_some()); } @@ -196,7 +199,8 @@ mod tests { // the resulting LlmError::Unreachable maps to "model_unreachable". 
let client = reqwest::blocking::Client::builder() .timeout(std::time::Duration::from_millis(500)) - .build().unwrap(); + .build() + .unwrap(); let err = client.get("http://127.0.0.1:1").send().unwrap_err(); let llm = LlmError::Unreachable { endpoint: "http://127.0.0.1:1".to_string(), @@ -212,7 +216,10 @@ mod tests { let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string()); let v1 = classify(&anyhow::Error::new(llm), false); assert_eq!(v1.code, "model_not_pulled"); - assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b")); + assert_eq!( + v1.details.get("model").and_then(|p| p.as_str()), + Some("gemma4:e4b") + ); } #[test] @@ -249,7 +256,10 @@ mod tests { // (single source of truth). classify must not pattern-match on // anyhow string contents — that would create two sources of // truth. The bare anyhow string falls through to "generic". - assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string"); + assert_ne!( + v1.code, "stale_cursor", + "classify must not produce stale_cursor from bare anyhow string" + ); } #[test] diff --git a/crates/kebab-app/src/external.rs b/crates/kebab-app/src/external.rs index 48d396d..0f421aa 100644 --- a/crates/kebab-app/src/external.rs +++ b/crates/kebab-app/src/external.rs @@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> { } else { String::new() }; - let already = existing - .lines() - .any(|line| line.trim() == KEBABIGNORE_LINE); + let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE); if already { return Ok(()); } @@ -57,11 +55,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> { /// Copy bytes to `/.`. Idempotent — if the /// destination file already exists with the expected hash, the existing /// file is reused (no second write). Returns the destination path. 
-pub fn copy_to_external( - external_dir: &Path, - bytes: &[u8], - ext: &str, -) -> Result { +pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result { let hash = blake3::hash(bytes); let hex = hash.to_hex(); let prefix = &hex.as_str()[..12]; @@ -82,11 +76,7 @@ pub fn copy_to_external( /// Internal `yaml_quote` always uses double-quoted YAML form with backslash /// escapes for `"` / `\` / control chars — agent-supplied titles with /// special characters are safe. -pub fn inject_frontmatter( - body: &str, - title: &str, - source_uri: Option<&str>, -) -> Result { +pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result { let head = body.trim_start(); if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") { anyhow::bail!( diff --git a/crates/kebab-app/src/fetch.rs b/crates/kebab-app/src/fetch.rs index d0f1eea..5d378b1 100644 --- a/crates/kebab-app/src/fetch.rs +++ b/crates/kebab-app/src/fetch.rs @@ -50,14 +50,14 @@ impl App { fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result { let target = ::get_chunk(&app.sqlite, &id)? .ok_or_else(|| { - anyhow::Error::new(StructuredError(ErrorV1 { - schema_version: ERROR_V1_ID.to_string(), - code: "chunk_not_found".to_string(), - message: format!("chunk_id '{}' not found", id.0), - details: serde_json::Value::Null, - hint: None, - })) - })?; + anyhow::Error::new(StructuredError(ErrorV1 { + schema_version: ERROR_V1_ID.to_string(), + code: "chunk_not_found".to_string(), + message: format!("chunk_id '{}' not found", id.0), + details: serde_json::Value::Null, + hint: None, + })) + })?; let doc_id = target.doc_id.clone(); let doc = @@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result { fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result { let doc = ::get_document(&app.sqlite, &id)? 
.ok_or_else(|| { - anyhow::Error::new(StructuredError(ErrorV1 { - schema_version: ERROR_V1_ID.to_string(), - code: "doc_not_found".to_string(), - message: format!("doc_id '{}' not found", id.0), - details: serde_json::Value::Null, - hint: None, - })) - })?; + anyhow::Error::new(StructuredError(ErrorV1 { + schema_version: ERROR_V1_ID.to_string(), + code: "doc_not_found".to_string(), + message: format!("doc_id '{}' not found", id.0), + details: serde_json::Value::Null, + hint: None, + })) + })?; let mut text = fmt_canonical_to_markdown(&doc); let mut truncated = false; @@ -176,14 +176,14 @@ fn fetch_span( ) -> Result { let doc = ::get_document(&app.sqlite, &id)? .ok_or_else(|| { - anyhow::Error::new(StructuredError(ErrorV1 { - schema_version: ERROR_V1_ID.to_string(), - code: "doc_not_found".to_string(), - message: format!("doc_id '{}' not found", id.0), - details: serde_json::Value::Null, - hint: None, - })) - })?; + anyhow::Error::new(StructuredError(ErrorV1 { + schema_version: ERROR_V1_ID.to_string(), + code: "doc_not_found".to_string(), + message: format!("doc_id '{}' not found", id.0), + details: serde_json::Value::Null, + hint: None, + })) + })?; // Reject line-incompatible media types (PDF / audio). 
`SourceType` // (markdown / note / paper / reference / inbox) is the *user-facing* diff --git a/crates/kebab-app/src/ingest_log.rs b/crates/kebab-app/src/ingest_log.rs index 0ce0ef5..3702793 100644 --- a/crates/kebab-app/src/ingest_log.rs +++ b/crates/kebab-app/src/ingest_log.rs @@ -81,7 +81,9 @@ fn generate_run_id() -> String { use time::macros::format_description; let now = time::OffsetDateTime::now_utc(); let ts = now - .format(format_description!("[year][month][day]T[hour][minute][second]Z")) + .format(format_description!( + "[year][month][day]T[hour][minute][second]Z" + )) .unwrap_or_else(|_| "19700101T000000Z".to_string()); let uid = uuid::Uuid::now_v7().simple().to_string(); let suffix = &uid[uid.len() - 8..]; @@ -211,8 +213,8 @@ pub(crate) fn percentiles(samples: &[u64]) -> (Option, Option, Option< #[cfg(test)] mod tests { use super::*; - use tempfile::TempDir; use kebab_config::LoggingCfg; + use tempfile::TempDir; #[test] fn generate_run_id_has_iso_prefix_and_8_hex_suffix() { @@ -224,7 +226,10 @@ mod tests { assert!(prefix.contains('T'), "prefix should contain T: {prefix}"); assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}"); assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}"); - assert!(suffix.chars().all(|c| c.is_ascii_hexdigit()), "suffix should be hex: {suffix}"); + assert!( + suffix.chars().all(|c| c.is_ascii_hexdigit()), + "suffix should be hex: {suffix}" + ); } #[test] @@ -256,31 +261,43 @@ mod tests { let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); - writer.write_event(&LogEvent::Skip { - ts: now_ts(), - doc_path: "a.zip", - reason: "builtin_blacklist", - detail: Some(".zip extension"), - }).unwrap(); - writer.write_event(&LogEvent::Error { - ts: now_ts(), - code: "ingest_fatal", - message: "something bad", - }).unwrap(); - writer.write_event(&LogEvent::ParseError { - ts: now_ts(), - doc_path: "weird.pdf", - reason: "lopdf_error", - message: "unexpected EOF", - 
}).unwrap(); + writer + .write_event(&LogEvent::Skip { + ts: now_ts(), + doc_path: "a.zip", + reason: "builtin_blacklist", + detail: Some(".zip extension"), + }) + .unwrap(); + writer + .write_event(&LogEvent::Error { + ts: now_ts(), + code: "ingest_fatal", + message: "something bad", + }) + .unwrap(); + writer + .write_event(&LogEvent::ParseError { + ts: now_ts(), + doc_path: "weird.pdf", + reason: "lopdf_error", + message: "unexpected EOF", + }) + .unwrap(); writer.flush().unwrap(); let contents = std::fs::read_to_string(&path).unwrap(); let lines: Vec<&str> = contents.lines().collect(); assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len()); for line in &lines { - assert!(line.starts_with('{'), "each line should be JSON object: {line}"); - assert!(line.contains("\"kind\""), "each line should have 'kind': {line}"); + assert!( + line.starts_with('{'), + "each line should be JSON object: {line}" + ); + assert!( + line.contains("\"kind\""), + "each line should have 'kind': {line}" + ); } } @@ -293,14 +310,19 @@ mod tests { }; let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap(); let path = writer.path().to_path_buf(); - writer.write_event(&LogEvent::Error { - ts: now_ts(), - code: "test", - message: "drop flush test", - }).unwrap(); + writer + .write_event(&LogEvent::Error { + ts: now_ts(), + code: "test", + message: "drop flush test", + }) + .unwrap(); // Drop without explicit flush — Drop impl should flush BufWriter. 
drop(writer); let contents = std::fs::read_to_string(&path).unwrap(); - assert!(contents.lines().count() >= 1, "file should have at least 1 line after drop"); + assert!( + contents.lines().count() >= 1, + "file should have at least 1 line after drop" + ); } } diff --git a/crates/kebab-app/src/ingest_progress.rs b/crates/kebab-app/src/ingest_progress.rs index 05284a0..b877a37 100644 --- a/crates/kebab-app/src/ingest_progress.rs +++ b/crates/kebab-app/src/ingest_progress.rs @@ -145,10 +145,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap) - /// Best-effort send into an optional `mpsc::Sender`. A dropped receiver /// is silently absorbed — the ingest hot path must not stall on a slow /// consumer. Logged at `trace` for diagnostics. -pub(crate) fn emit( - progress: Option<&std::sync::mpsc::Sender>, - event: IngestEvent, -) { +pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender>, event: IngestEvent) { if let Some(tx) = progress { if tx.send(event).is_err() { tracing::trace!( @@ -192,7 +189,10 @@ mod tests { media: "markdown".into(), }; let v = serde_json::to_value(&ev).unwrap(); - assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started")); + assert_eq!( + v.get("kind").and_then(|s| s.as_str()), + Some("asset_started") + ); assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1)); assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10)); assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md")); @@ -211,8 +211,14 @@ mod tests { let v = serde_json::to_value(&ev).unwrap(); assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed")); let counts = v.get("counts").unwrap(); - assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(5)); - assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2)); + assert_eq!( + counts.get("scanned").and_then(serde_json::Value::as_u64), + Some(5) + ); + assert_eq!( + 
counts.get("new").and_then(serde_json::Value::as_u64), + Some(2) + ); } #[test] diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 8f983aa..328e219 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,13 +39,17 @@ use std::sync::{Arc, Mutex}; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{ + CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, + CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, + CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, + K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker, +}; use kebab_core::{ - Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, - DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, - EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType, - ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope, - SourceUri, VectorRecord, VectorStore, + Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion, + DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind, + ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset, + SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore, }; use kebab_llm_local::OllamaLanguageModel; use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr}; @@ -69,15 +73,17 @@ pub mod schema; mod staleness; pub use 
app::{App, SearchResponse, short_query_hint}; -pub use ingest_log::{IngestLogWriter, IngestSummary, LogEvent}; -pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown}; -pub use reset::{ResetReport, ResetScope, enumerate_orphans}; -pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify}; -pub use kebab_config::{ConfigInvalid, ConfigNotFound}; -pub use fetch::fetch_with_config; #[doc(hidden)] pub use bulk::{BULK_QUERIES_MAX, bulk_search_with_config}; -pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config}; +pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify}; +pub use fetch::fetch_with_config; +pub use ingest_log::{IngestLogWriter, IngestSummary, LogEvent}; +pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown}; +pub use kebab_config::{ConfigInvalid, ConfigNotFound}; +pub use reset::{ResetReport, ResetScope, enumerate_orphans}; +pub use schema::{ + Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config, +}; pub use staleness::{compute_stale, mark_stale_in_place}; /// p9-fb-25: sentinel for files without an extension in @@ -322,8 +328,8 @@ pub fn ingest_with_config_opts( root: scope.root.to_string_lossy().into_owned(), }, ); - let connector = FsSourceConnector::new(&app.config) - .context("kb-app::ingest: build FsSourceConnector")?; + let connector = + FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?; let (assets, fs_skips) = connector .scan_with_skips(&scope) .context("kb-app::ingest: scan workspace")?; @@ -372,18 +378,14 @@ pub fn ingest_with_config_opts( // endpoint) aborts ingest fail-fast — better than silently disabling // OCR/caption mid-run. 
let ocr_engine: Option = if app.config.image.ocr.enabled { - Some( - OllamaVisionOcr::new(&app.config) - .context("kb-app::ingest: build OllamaVisionOcr")?, - ) + Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?) } else { None }; let caption_llm: Option> = if app.config.image.caption.enabled { - Some(Box::new( - OllamaLanguageModel::new(&app.config) - .context("kb-app::ingest: build OllamaLanguageModel for caption")?, - )) + Some(Box::new(OllamaLanguageModel::new(&app.config).context( + "kb-app::ingest: build OllamaLanguageModel for caption", + )?)) } else { None }; @@ -440,10 +442,8 @@ pub fn ingest_with_config_opts( // current walker scope (config narrowing / include-glob change) is // NOT purged — we leave it in place to protect against accidental // data loss via config edits. - let scanned_paths: std::collections::HashSet = assets - .iter() - .map(|a| a.workspace_path.clone()) - .collect(); + let scanned_paths: std::collections::HashSet = + assets.iter().map(|a| a.workspace_path.clone()).collect(); let purged_deleted_files = sweep_deleted_files( &app, &scanned_paths, @@ -659,8 +659,7 @@ pub fn ingest_with_config_opts( } } - let duration_ms = u32::try_from(started_instant.elapsed().as_millis()) - .unwrap_or(u32::MAX); + let duration_ms = u32::try_from(started_instant.elapsed().as_millis()).unwrap_or(u32::MAX); let finished_at = time::OffsetDateTime::now_utc(); // Record the ingest_runs row with aggregate counts. @@ -941,8 +940,8 @@ fn try_skip_unchanged( if stored_is_tier3_fallback { // Embedder version still must match. 
- let embedder_match = existing_doc.last_embedding_version.as_ref() - == current_embedding_version; + let embedder_match = + existing_doc.last_embedding_version.as_ref() == current_embedding_version; if !embedder_match { return Ok(None); } @@ -986,23 +985,17 @@ fn try_skip_unchanged( // sentinel removes every doc at this path (the new doc_id is // not yet known here — it's computed downstream from the new // PARSER_VERSION). - purge_workspace_path_for_parser_bump(app, asset).with_context(|| { - format!( - "parser-bump orphan purge at {}", - asset.workspace_path.0 - ) - })?; + purge_workspace_path_for_parser_bump(app, asset) + .with_context(|| format!("parser-bump orphan purge at {}", asset.workspace_path.0))?; return Ok(None); } // 3. Chunker unchanged. - let chunker_match = existing_doc.last_chunker_version.as_ref() - == Some(current_chunker_version); + let chunker_match = existing_doc.last_chunker_version.as_ref() == Some(current_chunker_version); if !chunker_match { return Ok(None); } // 4. Embedder unchanged. - let embedder_match = existing_doc.last_embedding_version.as_ref() - == current_embedding_version; + let embedder_match = existing_doc.last_embedding_version.as_ref() == current_embedding_version; if !embedder_match { return Ok(None); } @@ -1038,7 +1031,8 @@ fn try_skip_unchanged( fn ext_for_skip_warning(path: &str) -> String { std::path::Path::new(path) .extension() - .and_then(|s| s.to_str()).map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase) + .and_then(|s| s.to_str()) + .map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase) } /// p9-fb-25: render the `IngestItem.warnings` line for a Skipped @@ -1121,10 +1115,26 @@ fn ingest_one_asset( } // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added. 
MediaType::Code(lang) - if matches!(lang.as_str(), - "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" - | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" - | "shell" | "c" | "cpp") => + if matches!( + lang.as_str(), + "rust" + | "python" + | "typescript" + | "javascript" + | "go" + | "java" + | "kotlin" + | "yaml" + | "dockerfile" + | "toml" + | "json" + | "xml" + | "groovy" + | "go-mod" + | "shell" + | "c" + | "cpp" + ) => { return ingest_one_code_asset( app, @@ -1204,16 +1214,17 @@ fn ingest_one_asset( // Frontmatter — `parse_frontmatter` returns Ok even on malformed // frontmatter (warnings are surfaced through the `Vec`). - let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints) - .context("kb-parse-md::parse_frontmatter")?; + let (metadata, fm_span, fm_warns) = + parse_frontmatter(&bytes, &body_hints).context("kb-parse-md::parse_frontmatter")?; let body_offset_lines = match fm_span { Some(span) => count_lines_in(&bytes[..span.end]), None => 0, }; - let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines) - .context("kb-parse-md::parse_blocks")?; + let (parsed_blocks, blk_warns) = + parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines) + .context("kb-parse-md::parse_blocks")?; let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len()); all_warnings.extend(fm_warns); @@ -1226,14 +1237,9 @@ fn ingest_one_asset( .map(|w| format!("{:?}: {}", w.kind, w.note)) .collect(); - let mut canonical = build_canonical_document( - asset, - metadata, - parsed_blocks, - parser_version, - all_warnings, - ) - .context("kb-parse-md::build_canonical_document")?; + let mut canonical = + build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings) + .context("kb-parse-md::build_canonical_document")?; let chunks = MdHeadingV1Chunker .chunk(&canonical, chunk_policy) @@ -1300,9 +1306,7 @@ fn ingest_one_asset( dimensions, }) 
.collect(); - vec_store - .upsert(&records) - .context("VectorStore::upsert")?; + vec_store.upsert(&records).context("VectorStore::upsert")?; } } @@ -1367,9 +1371,7 @@ fn ingest_one_image_asset( chunk_count: None, parser_version: None, chunker_version: None, - warnings: vec![ - "kb:// URI not yet supported".to_string(), - ], + warnings: vec!["kb:// URI not yet supported".to_string()], pdf_ocr_pages: None, pdf_ocr_ms_total: None, error: None, @@ -1481,17 +1483,19 @@ fn ingest_one_image_asset( "image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})", other.map(|b| std::mem::discriminant(b)) ); - canonical.provenance.events.push(kebab_core::ProvenanceEvent { - at: now, - agent: "kb-app".to_string(), - kind: kebab_core::ProvenanceKind::Warning, - note: Some( - "image document missing leading ImageRef block — OCR/caption skipped" - .to_string(), - ), - }); - warning_notes - .push("ImageDispatchAnomaly: missing ImageRef block".to_string()); + canonical + .provenance + .events + .push(kebab_core::ProvenanceEvent { + at: now, + agent: "kb-app".to_string(), + kind: kebab_core::ProvenanceKind::Warning, + note: Some( + "image document missing leading ImageRef block — OCR/caption skipped" + .to_string(), + ), + }); + warning_notes.push("ImageDispatchAnomaly: missing ImageRef block".to_string()); } } @@ -1639,10 +1643,7 @@ fn record_image_analysis_failure( /// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` / /// `chunks` / `embedding_records`). The `assets` row stays — same /// bytes, same asset_id, only the derived `doc_id` changed. -fn purge_workspace_path_for_parser_bump( - app: &App, - asset: &RawAsset, -) -> anyhow::Result<()> { +fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow::Result<()> { let path = &asset.workspace_path.0; let stale = app .sqlite @@ -1777,21 +1778,19 @@ fn sweep_deleted_files( } // File is truly absent → purge. 
- let chunk_ids = match kebab_store_sqlite::purge_deleted_workspace_path( - &app.sqlite, - &stored_path, - ) { - Ok(ids) => ids, - Err(e) => { - tracing::warn!( - target: "kebab-app", - path = %stored_path.0, - error = %e, - "sweep_deleted_files: purge failed; skipping this path" - ); - continue; - } - }; + let chunk_ids = + match kebab_store_sqlite::purge_deleted_workspace_path(&app.sqlite, &stored_path) { + Ok(ids) => ids, + Err(e) => { + tracing::warn!( + target: "kebab-app", + path = %stored_path.0, + error = %e, + "sweep_deleted_files: purge failed; skipping this path" + ); + continue; + } + }; // Purge associated vectors (best-effort; partial failure // acceptable — orphan vectors get cleaned by `kebab reset @@ -1875,9 +1874,7 @@ fn ingest_one_pdf_asset( chunk_count: None, parser_version: None, chunker_version: None, - warnings: vec![ - "kb:// URI not yet supported".to_string(), - ], + warnings: vec!["kb:// URI not yet supported".to_string()], pdf_ocr_pages: None, pdf_ocr_ms_total: None, error: None, @@ -1946,9 +1943,7 @@ fn ingest_one_pdf_asset( crate::pdf_ocr_apply::PdfOcrProgress::Started { page } => { if let Some(sender) = progress { let _ = sender.send( - crate::ingest_progress::IngestEvent::PdfOcrStarted { - page, - }, + crate::ingest_progress::IngestEvent::PdfOcrStarted { page }, ); } } @@ -1996,9 +1991,13 @@ fn ingest_one_pdf_asset( }); } } - if let Ok(mut p) = pages_for_ocr.lock() { *p += 1; } + if let Ok(mut p) = pages_for_ocr.lock() { + *p += 1; + } if success { - if let Ok(mut s) = samples_for_ocr.lock() { s.push(ms); } + if let Ok(mut s) = samples_for_ocr.lock() { + s.push(ms); + } } else if let Ok(mut f) = failures_for_ocr.lock() { *f += 1; } @@ -2053,9 +2052,7 @@ fn ingest_one_pdf_asset( kind: EmbeddingKind::Document, }) .collect(); - let vectors = emb - .embed(&inputs) - .context("Embedder::embed (pdf chunks)")?; + let vectors = emb.embed(&inputs).context("Embedder::embed (pdf chunks)")?; let model_id = emb.model_id(); let model_version = 
emb.model_version(); let dimensions = emb.dimensions(); @@ -2139,7 +2136,7 @@ fn ingest_one_code_asset( vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, force_reingest: bool, - code_lang: &str, // <-- NEW (p10-1b Task D) + code_lang: &str, // <-- NEW (p10-1b Task D) ) -> anyhow::Result { let path = match &asset.source_uri { SourceUri::File(p) => p.clone(), @@ -2154,9 +2151,7 @@ fn ingest_one_code_asset( chunk_count: None, parser_version: None, chunker_version: None, - warnings: vec![ - "kb:// URI not yet supported".to_string(), - ], + warnings: vec!["kb:// URI not yet supported".to_string()], pdf_ocr_pages: None, pdf_ocr_ms_total: None, error: None, @@ -2166,43 +2161,43 @@ fn ingest_one_code_asset( // p10-1b Task D/G/J: parser_version per-lang. let parser_version = match code_lang { - "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), - "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()), + "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), + "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()), "typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()), "javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()), "go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()), "java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()), "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()), // p10-2: Tier 2 has no parse step — sentinel "none-v1". - "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" - => ParserVersion("none-v1".to_string()), + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { + ParserVersion("none-v1".to_string()) + } // p10-3: shell direct routes to Tier 3 (no parse step). "shell" => ParserVersion("none-v1".to_string()), // p10-1D: C + C++ AST extractors. 
- "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; // p10-1b Task D/G/J/L: chunker_version per-lang. let mut chunker_version = match code_lang { - "rust" => CodeRustAstV1Chunker.chunker_version(), - "python" => CodePythonAstV1Chunker.chunker_version(), + "rust" => CodeRustAstV1Chunker.chunker_version(), + "python" => CodePythonAstV1Chunker.chunker_version(), "typescript" => CodeTsAstV1Chunker.chunker_version(), "javascript" => CodeJsAstV1Chunker.chunker_version(), "go" => CodeGoAstV1Chunker.chunker_version(), "java" => CodeJavaAstV1Chunker.chunker_version(), - "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), // p10-2 Tier 2: - "yaml" => K8sManifestResourceV1Chunker.chunker_version(), + "yaml" => K8sManifestResourceV1Chunker.chunker_version(), "dockerfile" => DockerfileFileV1Chunker.chunker_version(), - "toml" | "json" | "xml" | "groovy" | "go-mod" - => ManifestFileV1Chunker.chunker_version(), + "toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker.chunker_version(), // p10-3: - "shell" => CodeTextParagraphV1Chunker.chunker_version(), + "shell" => CodeTextParagraphV1Chunker.chunker_version(), // p10-1D: C + C++ AST chunkers. - "c" => CodeCAstV1Chunker.chunker_version(), - "cpp" => CodeCppAstV1Chunker.chunker_version(), + "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -2265,8 +2260,12 @@ fn ingest_one_code_asset( // Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate. 
let mut canonical = match canonical_result { Ok(d) => d, - Err(e) if code_lang == "shell" - || matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") => + Err(e) + if code_lang == "shell" + || matches!( + code_lang, + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + ) => { return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang"); } @@ -2290,7 +2289,10 @@ fn ingest_one_code_asset( // Tier 2 langs already have "none-v1" parser_version normally, so exclude them // from the extract_fell_back guard with the !matches! exclusion. let extract_fell_back = canonical.parser_version.0 == "none-v1" - && !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"); + && !matches!( + code_lang, + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell" + ); let chunks_result: anyhow::Result> = if extract_fell_back { // Tier 1 lang whose extractor errored — go straight to Tier 3 chunker. @@ -2349,7 +2351,7 @@ fn ingest_one_code_asset( // "shell" direct path is already Tier 3 — don't retry-double-up. let chunks: Vec = match chunks_result { Ok(v) if !v.is_empty() => v, - other if code_lang == "shell" => other?, // shell propagates directly + other if code_lang == "shell" => other?, // shell propagates directly Ok(_empty) => { tracing::warn!( workspace_path = %asset.workspace_path.0, @@ -2373,7 +2375,9 @@ fn ingest_one_code_asset( canonical.parser_version = ParserVersion("none-v1".to_string()); CodeTextParagraphV1Chunker .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")? + .context( + "kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)", + )? 
} }; @@ -2501,13 +2505,7 @@ fn synthesize_tier2_document( symbol: Some("".to_string()), lang: Some(code_lang.to_string()), }; - let block_id: BlockId = id_for_block( - &doc_id, - "code", - &[], - 0, - &span, - ); + let block_id: BlockId = id_for_block(&doc_id, "code", &[], 0, &span); let block = kebab_core::Block::Code(CodeBlock { common: CommonBlock { block_id, @@ -2553,7 +2551,9 @@ fn synthesize_tier2_document( }; let title = { - let fname = asset.workspace_path.0 + let fname = asset + .workspace_path + .0 .rsplit('/') .next() .unwrap_or(&asset.workspace_path.0); @@ -2799,7 +2799,9 @@ pub fn ask_with_session_with_config( /// `data_dir_writable` check probes the resolved `storage.data_dir` /// from that config (so `--config` users see their custom paths /// reflected in the report rather than the XDG defaults). -pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow::Result { +pub fn doctor_with_config_path( + config_path: Option<&std::path::Path>, +) -> anyhow::Result { tracing::debug!("doctor() invoked"); let mut checks = Vec::new(); @@ -2817,11 +2819,7 @@ pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow: } else if config_path.is_some() { // Explicit `--config ` that doesn't exist is a hard error // — defaults would silently mask the user's intent. - ( - false, - format!("{} (not found)", cfg_path.display()), - None, - ) + (false, format!("{} (not found)", cfg_path.display()), None) } else { // No `--config` and no XDG file: defaults are always loadable. 
(true, format!("{} (defaults)", cfg_path.display()), None) @@ -2907,16 +2905,18 @@ pub fn ingest_file_with_config( path: &std::path::Path, ) -> anyhow::Result { if !path.exists() { - anyhow::bail!("ingest-file: source path does not exist: {}", path.display()); + anyhow::bail!( + "ingest-file: source path does not exist: {}", + path.display() + ); } if !path.is_file() { anyhow::bail!("ingest-file: not a regular file: {}", path.display()); } - let ext_raw = path - .extension() - .and_then(|e| e.to_str()) - .ok_or_else(|| anyhow::anyhow!("ingest-file: source has no extension: {}", path.display()))?; + let ext_raw = path.extension().and_then(|e| e.to_str()).ok_or_else(|| { + anyhow::anyhow!("ingest-file: source has no extension: {}", path.display()) + })?; let ext = ext_raw.to_lowercase(); const SUPPORTED_EXTS: &[&str] = &["md", "pdf", "png", "jpg", "jpeg"]; @@ -2993,11 +2993,7 @@ pub fn ingest_stdin_with_config( let external_dir = crate::external::ensure_external_dir(&workspace_root)?; crate::external::ensure_kebabignore_entry(&workspace_root)?; - let dest = crate::external::copy_to_external( - &external_dir, - wrapped.as_bytes(), - "md", - )?; + let dest = crate::external::copy_to_external(&external_dir, wrapped.as_bytes(), "md")?; ingest_file_with_config(config, &dest) } @@ -3005,7 +3001,10 @@ pub fn ingest_stdin_with_config( /// Returns true if `source_path` matches any `.kebabignore` pattern /// rooted at `workspace_root`. Used by `ingest_file_with_config` to /// emit a stderr warn before bypassing the ignore. 
-fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::path::Path) -> bool { +fn check_kebabignore_match( + workspace_root: &std::path::Path, + source_path: &std::path::Path, +) -> bool { let kebabignore = workspace_root.join(".kebabignore"); if !kebabignore.exists() { return false; @@ -3026,5 +3025,7 @@ fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std:: Ok(m) => m, Err(_) => return false, }; - matcher.matched(source_path, source_path.is_dir()).is_ignore() + matcher + .matched(source_path, source_path.is_dir()) + .is_ignore() } diff --git a/crates/kebab-app/src/logging.rs b/crates/kebab-app/src/logging.rs index 1b4baeb..3d7c05a 100644 --- a/crates/kebab-app/src/logging.rs +++ b/crates/kebab-app/src/logging.rs @@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result { let (nb, guard) = tracing_appender::non_blocking(file_appender); let env_filter = match level { - LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")), + LogLevel::Default => { + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")) + } LogLevel::Verbose => EnvFilter::new("info"), LogLevel::Debug => EnvFilter::new("debug"), }; diff --git a/crates/kebab-app/src/pdf_ocr_apply.rs b/crates/kebab-app/src/pdf_ocr_apply.rs index 129da6d..26240e8 100644 --- a/crates/kebab-app/src/pdf_ocr_apply.rs +++ b/crates/kebab-app/src/pdf_ocr_apply.rs @@ -13,8 +13,8 @@ use std::time::Instant; use anyhow::{Context, Result}; use kebab_core::{ - Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, - ProvenanceKind, SourceSpan, TextBlock, id_for_block, + Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind, + SourceSpan, TextBlock, id_for_block, }; use kebab_parse_image::OcrEngine; use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image}; @@ -88,7 +88,10 @@ where F: FnMut(PdfOcrProgress), { if !opts.enabled { - return Ok(PdfOcrSummary 
{ pages_ocrd: 0, ms_total: 0 }); + return Ok(PdfOcrSummary { + pages_ocrd: 0, + ms_total: 0, + }); } let pdf_doc = LopdfDocument::load_mem(pdf_bytes) .context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?; @@ -117,8 +120,7 @@ where }; let chars = text.chars().count() as u32; let valid_ratio = compute_valid_char_ratio(&text); - let needs_ocr = - chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold; + let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold; // 결정 matrix: // always_on=true → 모든 page OCR (dual-block). @@ -131,7 +133,9 @@ where emit_progress(PdfOcrProgress::Started { page: page_num }); - let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? { b } else { + let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? { + b + } else { let note = format!( "page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)" ); @@ -266,7 +270,10 @@ where canonical.blocks.extend(ocr_blocks); canonical.provenance.events.extend(new_events); - Ok(PdfOcrSummary { pages_ocrd, ms_total }) + Ok(PdfOcrSummary { + pages_ocrd, + ms_total, + }) } fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize { diff --git a/crates/kebab-app/src/reset.rs b/crates/kebab-app/src/reset.rs index a9cfc27..87f8b4a 100644 --- a/crates/kebab-app/src/reset.rs +++ b/crates/kebab-app/src/reset.rs @@ -85,8 +85,7 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec { ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir], ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir], ResetScope::VectorOnly => { - let vector_dir = - expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy()); + let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy()); vec![vector_dir] } 
ResetScope::ConfigOnly => vec![cfg_dir], @@ -137,8 +136,8 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 { /// the double scan is acceptable for a rare destructive operation. pub fn enumerate_orphans(cfg: &Config) -> Result> { use kebab_core::DocumentStore as _; - use kebab_source_fs::FsSourceConnector; use kebab_core::SourceScope; + use kebab_source_fs::FsSourceConnector; let store = kebab_store_sqlite::SqliteStore::open(cfg) .context("enumerate_orphans: open SqliteStore")?; @@ -160,16 +159,13 @@ pub fn enumerate_orphans(cfg: &Config) -> Result> { ..Default::default() }; - let connector = FsSourceConnector::new(cfg) - .context("enumerate_orphans: build FsSourceConnector")?; + let connector = + FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?; let (assets, _skips) = connector .scan_with_skips(&scope) .context("enumerate_orphans: scan workspace")?; - let scanned: HashSet = assets - .into_iter() - .map(|a| a.workspace_path) - .collect(); + let scanned: HashSet = assets.into_iter().map(|a| a.workspace_path).collect(); let mut orphans: Vec = stored .into_iter() @@ -206,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result { if !p.exists() { continue; } - std::fs::remove_dir_all(p) - .with_context(|| format!("remove {}", p.display()))?; + std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?; removed.push(p.clone()); } @@ -229,8 +224,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result { /// Execute the `OrphansOnly` variant: reconcile stored docs against the /// current walker scope without touching any filesystem directory. 
fn execute_orphans_only(cfg: &Config) -> Result { - let orphans = enumerate_orphans(cfg) - .context("execute_orphans_only: enumerate orphans")?; + let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?; if orphans.is_empty() { return Ok(ResetReport { diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index aeca125..8982f62 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -168,12 +168,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result anyhow::Result { - let counts = store - .count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?; +fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result { + let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?; let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, ""); let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir) .map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?; @@ -298,6 +294,9 @@ mod tests_capabilities { // Bug #9: kebab ingest-file + kebab ingest-stdin --title 양쪽 모두 // ingest_report.v1 정상 emit → capabilities.single_file_ingest 가 true 여야 함. let caps = capabilities_snapshot(); - assert!(caps.single_file_ingest, "single_file_ingest must be true (Bug #9)"); + assert!( + caps.single_file_ingest, + "single_file_ingest must be true (Bug #9)" + ); } } diff --git a/crates/kebab-app/src/staleness.rs b/crates/kebab-app/src/staleness.rs index c90d548..85d42c5 100644 --- a/crates/kebab-app/src/staleness.rs +++ b/crates/kebab-app/src/staleness.rs @@ -10,11 +10,7 @@ use kebab_core::SearchHit; /// /// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary /// rule prevents `kebab-rag → kebab-app`). Update both together. 
-pub fn compute_stale( - indexed_at: OffsetDateTime, - now: OffsetDateTime, - threshold_days: u32, -) -> bool { +pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool { if threshold_days == 0 { return false; } @@ -23,11 +19,7 @@ pub fn compute_stale( } /// Sets `stale` on each hit in place using `compute_stale`. -pub fn mark_stale_in_place( - hits: &mut [SearchHit], - now: OffsetDateTime, - threshold_days: u32, -) { +pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) { for h in hits { h.stale = compute_stale(h.indexed_at, now, threshold_days); } diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 6c01119..534e3fd 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -29,9 +29,8 @@ fn rust_file_ingests_and_searches_as_code_citation() { ) .unwrap(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); assert_eq!(report.errors, 0, "no errors expected: {report:?}"); let items = report.items.as_ref().expect("items present"); @@ -127,9 +126,8 @@ fn rust_code_search_hit_has_repo() { ) .unwrap(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul")) @@ -147,8 +145,7 @@ fn rust_code_search_hit_has_repo() { .and_then(|n| n.to_str()) .map(str::to_owned); assert_eq!( - h.repo, - expected_repo, + h.repo, expected_repo, "SearchHit.repo must match the workspace dir name 
(detect_repo result)" ); // Also sanity-check code_lang is still filled. @@ -177,9 +174,8 @@ fn python_file_ingests_and_searches_as_code_citation() { ) .unwrap(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); assert!(report.new >= 1, "python file ingested: {report:?}"); @@ -254,9 +250,8 @@ fn typescript_file_ingests_and_searches_as_code_citation() { ) .unwrap(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); assert!(report.new >= 1, "ts file ingested: {report:?}"); @@ -331,9 +326,8 @@ fn javascript_file_ingests_and_searches_as_code_citation() { ) .unwrap(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); assert!(report.new >= 1, "js file ingested: {report:?}"); @@ -515,7 +509,11 @@ fn java_file_ingests_and_searches_as_code_citation() { line_start, .. } => { - assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'"); + assert_eq!( + lang.as_deref(), + Some("java"), + "citation.lang must be 'java'" + ); assert_eq!( symbol.as_deref(), Some("com.foo.Foo.bar"), @@ -586,7 +584,11 @@ fn kotlin_file_ingests_and_searches_as_code_citation() { line_start, .. 
} => { - assert_eq!(lang.as_deref(), Some("kotlin"), "citation.lang must be 'kotlin'"); + assert_eq!( + lang.as_deref(), + Some("kotlin"), + "citation.lang must be 'kotlin'" + ); assert_eq!( symbol.as_deref(), Some("com.foo.Foo.bar"), @@ -651,8 +653,8 @@ fn tier2_k8s_yaml_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -666,7 +668,11 @@ fn tier2_k8s_yaml_ingest_searchable() { line_start, .. } => { - assert_eq!(lang.as_deref(), Some("yaml"), "citation.lang must be 'yaml'"); + assert_eq!( + lang.as_deref(), + Some("yaml"), + "citation.lang must be 'yaml'" + ); assert_eq!( symbol.as_deref(), Some("Deployment/prod/api"), @@ -730,8 +736,8 @@ fn tier2_dockerfile_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -813,8 +819,8 @@ fn tier2_cargo_toml_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -896,8 +902,8 @@ fn tier3_shell_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -987,8 +993,8 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must 
succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -1031,14 +1037,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { fn rust_file_re_ingest_is_unchanged() { let env = TestEnv::lexical_only(); - std::fs::write( - env.workspace_root.join("stable.rs"), - "pub fn noop() {}\n", - ) - .unwrap(); + std::fs::write(env.workspace_root.join("stable.rs"), "pub fn noop() {}\n").unwrap(); - let r1 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); let item1 = r1 .items .as_ref() @@ -1049,8 +1050,7 @@ fn rust_file_re_ingest_is_unchanged() { .unwrap(); assert_eq!(item1.kind, IngestItemKind::New); - let r2 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); let item2 = r2 .items .unwrap() @@ -1081,9 +1081,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() { ) .unwrap(); - let report1 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("first ingest"); + let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); let item1 = report1 .items .as_ref() @@ -1093,7 +1092,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() { .expect("docker-compose.yml in first report"); assert!( matches!(item1.kind, IngestItemKind::New), - "first ingest must be New, got {:?}", item1.kind + "first ingest must be New, got {:?}", + item1.kind ); assert_eq!( item1.chunker_version.as_ref().map(|c| c.0.as_str()), @@ -1101,9 +1101,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() { "first ingest must use Tier 3 fallback chunker" ); - let report2 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("second ingest"); + let report2 = 
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); let item2 = report2 .items .as_ref() @@ -1113,7 +1112,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() { .expect("docker-compose.yml in second report"); assert!( matches!(item2.kind, IngestItemKind::Unchanged), - "second ingest must be Unchanged, got {:?}", item2.kind + "second ingest must be Unchanged, got {:?}", + item2.kind ); } @@ -1163,8 +1163,8 @@ fn tier1_c_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -1247,8 +1247,8 @@ fn tier1_cpp_ingest_searchable() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); let h = hits .iter() @@ -1266,7 +1266,9 @@ fn tier1_cpp_ingest_searchable() { // Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar" // (method) depending on which chunk ranks first. 
assert!( - symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")), + symbol + .as_deref() + .is_some_and(|s| s.starts_with("kebab::chunk::Foo")), "C++ symbol must start with namespace::Class prefix, got {symbol:?}" ); assert!(*line_start >= 1, "line_start must be >=1"); @@ -1335,8 +1337,8 @@ fn tier2_k8s_multi_resource_yaml_ingests_without_collision() { ..Default::default() }, }; - let hits = kebab_app::search_with_config(env.config.clone(), query) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed"); assert!( hits.len() >= 2, "expected ≥2 hits (Deployment + Service), got {}", @@ -1359,9 +1361,8 @@ fn tier3_shell_reingest_is_unchanged() { ) .unwrap(); - let report1 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("first ingest"); + let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); let item1 = report1 .items .as_ref() @@ -1371,12 +1372,12 @@ fn tier3_shell_reingest_is_unchanged() { .expect("deploy.sh in first report"); assert!( matches!(item1.kind, IngestItemKind::New), - "first ingest must be New, got {:?}", item1.kind + "first ingest must be New, got {:?}", + item1.kind ); - let report2 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("second ingest"); + let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); let item2 = report2 .items .as_ref() @@ -1386,6 +1387,7 @@ fn tier3_shell_reingest_is_unchanged() { .expect("deploy.sh in second report"); assert!( matches!(item2.kind, IngestItemKind::Unchanged), - "shell reingest must be Unchanged, got {:?}", item2.kind + "shell reingest must be Unchanged, got {:?}", + item2.kind ); } diff --git a/crates/kebab-app/tests/common/mod.rs b/crates/kebab-app/tests/common/mod.rs index ee85663..25ae766 100644 --- a/crates/kebab-app/tests/common/mod.rs 
+++ b/crates/kebab-app/tests/common/mod.rs @@ -93,8 +93,7 @@ impl TestEnv { /// directly. Caller can invoke this multiple times to simulate /// re-opening the binary after a corpus revision bump. pub fn app(&self) -> kebab_app::App { - kebab_app::App::open_with_config(self.config.clone()) - .expect("App::open_with_config") + kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config") } } diff --git a/crates/kebab-app/tests/fetch_integration.rs b/crates/kebab-app/tests/fetch_integration.rs index 0fd40a4..5937973 100644 --- a/crates/kebab-app/tests/fetch_integration.rs +++ b/crates/kebab-app/tests/fetch_integration.rs @@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App { #[test] fn fetch_chunk_returns_target_only_when_no_context() { let env = common::TestEnv::new(); - common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n"); + common::ingest_md( + &env, + "a.md", + "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n", + ); let app = open(&env); // Find a chunk via search to obtain its id. @@ -42,7 +46,8 @@ fn fetch_chunk_with_context_returns_neighbors() { // match. The earlier fixture used 2-char tokens like `A1`/`A3` for // section bodies — those zero-hit under trigram. Use 5-char unique // words per section so the query can pin one chunk deterministically. 
- let body = "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n"; + let body = + "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n"; common::ingest_md(&env, "multi.md", body); let app = env.app(); @@ -110,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() { .unwrap(); assert_eq!(result.kind, FetchKind::Doc); let text = result.text.expect("doc text"); - assert!(text.contains("Heading One"), "doc text contains heading: {text:?}"); + assert!( + text.contains("Heading One"), + "doc text contains heading: {text:?}" + ); assert!(text.contains("First paragraph"), "doc text contains body"); assert!(!result.truncated); } @@ -155,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() { .unwrap(); assert!(result.truncated); let text = result.text.expect("doc text"); - assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count()); + assert!( + text.chars().count() <= 100, + "trimmed text len {}", + text.chars().count() + ); } #[test] @@ -292,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() { fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() { let env = common::TestEnv::new(); // Multi-chunk markdown so context ±N has neighbors. 
- let body = - "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n"; + let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n"; common::ingest_md(&env, "boundary.md", body); let app = env.app(); let q = kebab_core::SearchQuery { diff --git a/crates/kebab-app/tests/file_deletion_auto_purge.rs b/crates/kebab-app/tests/file_deletion_auto_purge.rs index 5e16456..37789ca 100644 --- a/crates/kebab-app/tests/file_deletion_auto_purge.rs +++ b/crates/kebab-app/tests/file_deletion_auto_purge.rs @@ -16,8 +16,8 @@ mod common; use common::TestEnv; -use kebab_app::ingest_with_config_opts; use kebab_app::IngestOpts; +use kebab_app::ingest_with_config_opts; use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope}; /// Helper: open the store via `TestEnv` and run `list_documents`. @@ -125,17 +125,10 @@ fn include_scope_narrowing_does_not_purge() { include: vec!["**/*.rs".to_string()], exclude: env.config.workspace.exclude.clone(), }; - let first = ingest_with_config_opts( - env.config.clone(), - wide_scope, - false, - IngestOpts::default(), - ) - .expect("first ingest (wide) must succeed"); - assert!( - first.new >= 2, - "expected at least 2 new docs: {first:?}" - ); + let first = + ingest_with_config_opts(env.config.clone(), wide_scope, false, IngestOpts::default()) + .expect("first ingest (wide) must succeed"); + assert!(first.new >= 2, "expected at least 2 new docs: {first:?}"); assert_eq!( first.purged_deleted_files, 0, "no purges on first ingest: {first:?}" diff --git a/crates/kebab-app/tests/image_pipeline.rs b/crates/kebab-app/tests/image_pipeline.rs index 9e2b1fe..8772c7b 100644 --- a/crates/kebab-app/tests/image_pipeline.rs +++ b/crates/kebab-app/tests/image_pipeline.rs @@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate}; /// inspectable in stored DB rows. 
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf { use image::{ImageBuffer, Rgb}; - let img: ImageBuffer, _> = - ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); + let img: ImageBuffer, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); let path = root.join(name); img.save(&path).expect("write PNG fixture"); path @@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() { // Counters: scanned should include the PNG; new ≥ 1 (markdown // fixtures from the workspace tree may also count). - assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items); + assert!( + report.scanned >= 1, + "scanned={}, items={:?}", + report.scanned, + report.items + ); assert_eq!(report.errors, 0, "no errors on lenient OCR path"); // Locate the image doc in the report items. @@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() { kebab_core::IngestItemKind::New, "image asset must be classified New on first ingest" ); - assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk"); + assert_eq!( + img_item.chunk_count, + Some(1), + "image emits exactly one chunk" + ); // Inspect the stored chunk text via kb-app's inspect_chunk facade. let doc_id = img_item.doc_id.clone().expect("image doc id"); @@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() { // Sanity: the doc was actually persisted into SQLite (kb-app's // list_docs facade reads the same store the chunker writes to). 
- let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()) - .expect("list_docs"); + let summaries = + kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs"); assert!( - summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")), + summaries + .iter() + .any(|s| s.doc_path.0.ends_with("diagram.png")), "image doc must appear in list_docs" ); @@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() { .iter() .find(|i| i.doc_path.0.ends_with("diagram.png")) .unwrap(); - let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()) - .unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap(); let block = match &doc.blocks[0] { kebab_core::Block::ImageRef(b) => b, _ => unreachable!(), @@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() { let cfg_clone = cfg.clone(); let scope = env.scope(); let report = spawn_blocking(move || { - kebab_app::ingest_with_config(cfg_clone, scope, false) - .expect("ingest with no OCR/caption") + kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption") }) .await .expect("task"); @@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() { .find(|i| i.doc_path.0.ends_with("raw.png")) .unwrap(); assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk"); - let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()) - .unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap(); let block = match &doc.blocks[0] { kebab_core::Block::ImageRef(b) => b, _ => unreachable!(), @@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() { let scope1 = scope.clone(); let scope2 = scope.clone(); - let r1 = spawn_blocking(move || { - kebab_app::ingest_with_config(cfg1, scope1, 
false).unwrap() - }) - .await - .unwrap(); - let r2 = spawn_blocking(move || { - kebab_app::ingest_with_config(cfg2, scope2, false).unwrap() - }) - .await - .unwrap(); + let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()) + .await + .unwrap(); + let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()) + .await + .unwrap(); let id1 = r1 .items diff --git a/crates/kebab-app/tests/incremental_ingest.rs b/crates/kebab-app/tests/incremental_ingest.rs index f103a16..cf9d44c 100644 --- a/crates/kebab-app/tests/incremental_ingest.rs +++ b/crates/kebab-app/tests/incremental_ingest.rs @@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() { // First ingest — populates the DB. Use the legacy entry so the // assertions cover the "previously ingested" set without needing // IngestOpts::default() to behave identically. - let first = - ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); assert_eq!(first.errors, 0, "first ingest must not error: {first:?}"); - assert!(first.new >= 1, "first ingest must create new docs: {first:?}"); - assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}"); + assert!( + first.new >= 1, + "first ingest must create new docs: {first:?}" + ); + assert_eq!( + first.unchanged, 0, + "first ingest cannot have unchanged: {first:?}" + ); let scanned = first.scanned; @@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() { IngestOpts::default(), ) .unwrap(); - assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}"); + assert_eq!( + second.scanned, scanned, + "second scanned matches first: {second:?}" + ); assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}"); - assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}"); + assert_eq!( + second.updated, 0, + 
"nothing should be marked updated: {second:?}" + ); assert_eq!( second.unchanged, scanned, "every doc must be Unchanged: {second:?}" @@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() { fn force_reingest_bypasses_skip() { let env = TestEnv::lexical_only(); - let first = - ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); assert_eq!(first.errors, 0, "first ingest must not error: {first:?}"); - assert!(first.new >= 1, "first ingest must create new docs: {first:?}"); + assert!( + first.new >= 1, + "first ingest must create new docs: {first:?}" + ); let scanned = first.scanned; let second = ingest_with_config_opts( diff --git a/crates/kebab-app/tests/ingest_cancel.rs b/crates/kebab-app/tests/ingest_cancel.rs index bab94d8..ddf2894 100644 --- a/crates/kebab-app/tests/ingest_cancel.rs +++ b/crates/kebab-app/tests/ingest_cancel.rs @@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() { // ingest_with_config_progress (no cancel) runs to completion. 
let env = TestEnv::lexical_only(); let (tx, rx) = mpsc::channel::(); - let report = kebab_app::ingest_with_config_progress( - env.config.clone(), - env.scope(), - true, - Some(tx), - ) - .unwrap(); + let report = + kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx)) + .unwrap(); assert_eq!(report.scanned, 3); assert_eq!(report.new, 3); diff --git a/crates/kebab-app/tests/ingest_file.rs b/crates/kebab-app/tests/ingest_file.rs index 2b9696d..f87a9f3 100644 --- a/crates/kebab-app/tests/ingest_file.rs +++ b/crates/kebab-app/tests/ingest_file.rs @@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() { let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err(); assert!(err.to_string().contains("unsupported extension"), "{err}"); - assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}"); + assert!( + err.to_string().contains(".docx") || err.to_string().contains("docx"), + "{err}" + ); } diff --git a/crates/kebab-app/tests/ingest_lexical.rs b/crates/kebab-app/tests/ingest_lexical.rs index 2fbd293..cf16a9f 100644 --- a/crates/kebab-app/tests/ingest_lexical.rs +++ b/crates/kebab-app/tests/ingest_lexical.rs @@ -8,8 +8,7 @@ use common::TestEnv; #[test] fn ingest_then_list_inspects_round_trip() { let env = TestEnv::lexical_only(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); // The fixture has 3 markdown files; first ingest should label them // all as New. @@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() { } // list_docs returns the 3 docs. 
- let docs = kebab_app::list_docs_with_config( - env.config.clone(), - kebab_core::DocFilter::default(), - ) - .unwrap(); + let docs = + kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default()) + .unwrap(); assert_eq!(docs.len(), 3, "docs: {docs:?}"); // inspect_doc round-trips one of them. let any_doc_id = docs[0].doc_id.clone(); - let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id) - .unwrap(); + let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap(); assert_eq!(canonical.doc_id, any_doc_id); assert!(!canonical.blocks.is_empty(), "blocks empty"); } @@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() { fn ingest_idempotent_on_second_run() { let env = TestEnv::lexical_only(); - let r1 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); assert_eq!(r1.new, 3); - let r2 = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); // Same files re-ingested — p9-fb-23 task 7 introduced the early-skip // path: when checksum + parser/chunker/embedding versions all match, // the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23 @@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() { assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}"); // list_docs still has 3 docs (no duplicates). 
- let docs = kebab_app::list_docs_with_config( - env.config.clone(), - kebab_core::DocFilter::default(), - ) - .unwrap(); + let docs = + kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default()) + .unwrap(); assert_eq!(docs.len(), 3); } #[test] fn ingest_summary_only_drops_items() { let env = TestEnv::lexical_only(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); assert_eq!(report.scanned, 3); assert!(report.items.is_none(), "summary-only should null items"); } @@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() { // of every run. `summary_only=true` writes `items_json=NULL`; the // counts MUST still be present. let env = TestEnv::lexical_only(); - let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true) - .unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); assert_eq!(report.scanned, 3); - let db_path = std::path::PathBuf::from(&env.config.storage.data_dir) - .join("kebab.sqlite"); + let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite"); let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite"); let (scanned, new_c, updated, skipped, errors, items_json): ( i64, @@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() { // tree shape (no `/lancedb` directory, or no `*.lance` // tables under it). 
let env = TestEnv::lexical_only(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); assert_eq!(report.errors, 0, "lexical-only run must not error"); assert_eq!(report.new, 3); - let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir) - .join("lancedb"); + let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb"); if lance_dir.exists() { // If the dir was created (e.g., by an earlier consumer touching // the path), it MUST contain no `.lance` tables. let mut had_lance_table = false; for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") { let entry = entry.unwrap(); - if entry - .path() - .extension() - .and_then(|s| s.to_str()) - == Some("lance") - { + if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") { had_lance_table = true; break; } @@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() { tags_any: vec!["rust".to_string()], ..Default::default() }; - let rust_docs = - kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap(); + let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap(); // intro.md and notes/cargo.md both tag "rust". 
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}"); } @@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() { #[test] fn inspect_doc_not_found_returns_actionable_error() { let env = TestEnv::lexical_only(); - let bogus = - kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string()); + let bogus = kebab_core::DocumentId( + "0000000000000000000000000000000000000000000000000000000000000000".to_string(), + ); let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err(); let msg = format!("{err:#}"); assert!( @@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() { let bogus = kebab_core::ChunkId( "0000000000000000000000000000000000000000000000000000000000000000".to_string(), ); - let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus) - .unwrap_err(); + let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err(); let msg = format!("{err:#}"); assert!(msg.contains("not found"), "got: {msg}"); } @@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() { #[test] fn ingest_stamps_chunker_version_on_document() { let env = TestEnv::lexical_only(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); assert!(report.new >= 1, "expected at least one new doc: {report:?}"); assert_eq!(report.errors, 0, "no errors expected: {report:?}"); - let docs = kebab_app::list_docs_with_config( - env.config.clone(), - kebab_core::DocFilter::default(), - ) - .unwrap(); + let docs = + kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default()) + .unwrap(); assert!(!docs.is_empty(), "no docs after ingest"); for doc_entry in &docs { let canonical = - kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id) - .unwrap(); + 
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap(); assert!( canonical.last_chunker_version.is_some(), "last_chunker_version must be stamped for doc {}: got {:?}", diff --git a/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs b/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs index 5927e94..8198a56 100644 --- a/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs +++ b/crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs @@ -17,8 +17,7 @@ use std::sync::atomic::AtomicBool; use common::TestEnv; fn ollama_endpoint() -> String { - std::env::var("KEBAB_PDF_OCR_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:11434".to_string()) + std::env::var("KEBAB_PDF_OCR_ENDPOINT").unwrap_or_else(|_| "http://localhost:11434".to_string()) } fn make_ocr_env_real() -> TestEnv { @@ -43,8 +42,8 @@ fn make_ocr_env_real() -> TestEnv { fn ingest_with_mock_ocr_yields_pdf_ocr_summary() { let env = make_ocr_env_real(); - let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest"); + let report = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest"); assert!(report.new >= 1, "at least one PDF ingested: {report:?}"); @@ -72,15 +71,13 @@ fn ingest_with_mock_ocr_yields_pdf_ocr_summary() { fn ocr_text_indexed_and_searchable() { let env = make_ocr_env_real(); - kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest"); + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest"); // Search for a Korean morpheme expected to appear in qwen2.5vl:3b OCR // output of the PoC ground-truth page. "다음" is a high-frequency token // in page1.txt truth file. 
let query = common::lexical_query("다음"); - let hits = - kebab_app::search_with_config(env.config.clone(), query).expect("search"); + let hits = kebab_app::search_with_config(env.config.clone(), query).expect("search"); assert!( !hits.is_empty(), diff --git a/crates/kebab-app/tests/ingest_progress.rs b/crates/kebab-app/tests/ingest_progress.rs index 89eb4b9..0ceb00a 100644 --- a/crates/kebab-app/tests/ingest_progress.rs +++ b/crates/kebab-app/tests/ingest_progress.rs @@ -13,13 +13,9 @@ use kebab_core::IngestItemKind; fn run_with_progress() -> Vec { let env = TestEnv::lexical_only(); let (tx, rx) = mpsc::channel::(); - let report = kebab_app::ingest_with_config_progress( - env.config.clone(), - env.scope(), - false, - Some(tx), - ) - .unwrap(); + let report = + kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx)) + .unwrap(); assert_eq!(report.scanned, 3); assert_eq!(report.new, 3); @@ -116,13 +112,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() { // `ingest_with_config_progress(..., None)` must produce identical // reports modulo wall-clock duration. 
let env = TestEnv::lexical_only(); - let r_none = kebab_app::ingest_with_config_progress( - env.config.clone(), - env.scope(), - true, - None, - ) - .unwrap(); + let r_none = + kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None) + .unwrap(); assert_eq!(r_none.scanned, 3); assert_eq!(r_none.new, 3); } @@ -134,13 +126,9 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() { let env = TestEnv::lexical_only(); let (tx, rx) = mpsc::channel::(); drop(rx); - let report = kebab_app::ingest_with_config_progress( - env.config.clone(), - env.scope(), - true, - Some(tx), - ) - .unwrap(); + let report = + kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx)) + .unwrap(); assert_eq!(report.scanned, 3); } @@ -185,13 +173,8 @@ fn pdf_ocr_progress_emits_started_finished_events() { }; let (tx, rx) = mpsc::channel::(); - let _report = kebab_app::ingest_with_config_progress( - config, - scope, - false, - Some(tx), - ) - .expect("ingest_with_config_progress"); + let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx)) + .expect("ingest_with_config_progress"); let events: Vec<_> = rx.iter().collect(); @@ -204,7 +187,16 @@ fn pdf_ocr_progress_emits_started_finished_events() { .filter(|e| matches!(e, IngestEvent::PdfOcrFinished { .. 
})) .count(); - assert!(started_count >= 1, "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"); - assert!(finished_count >= 1, "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"); - assert_eq!(started_count, finished_count, "Started 와 Finished 의 count 일치"); + assert!( + started_count >= 1, + "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})" + ); + assert!( + finished_count >= 1, + "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})" + ); + assert_eq!( + started_count, finished_count, + "Started 와 Finished 의 count 일치" + ); } diff --git a/crates/kebab-app/tests/ingest_stdin.rs b/crates/kebab-app/tests/ingest_stdin.rs index 21b5c3e..3e2e478 100644 --- a/crates/kebab-app/tests/ingest_stdin.rs +++ b/crates/kebab-app/tests/ingest_stdin.rs @@ -29,12 +29,14 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() { "## Body content\n\nMore.", "Article X", Some("https://example.com/x"), - ).unwrap(); + ) + .unwrap(); assert_eq!(report.new, 1, "{report:?}"); // _external/ contains exactly one .md file with frontmatter. 
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external"); - let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap() + let entries: Vec<_> = fs::read_dir(&ext_dir) + .unwrap() .filter_map(std::result::Result::ok) .collect(); assert_eq!(entries.len(), 1); @@ -50,16 +52,13 @@ fn ingest_stdin_without_source_uri() { let dir = tempfile::tempdir().unwrap(); let cfg = fresh_cfg(dir.path()); - let report = kebab_app::ingest_stdin_with_config( - cfg.clone(), - "## Body", - "Title", - None, - ).unwrap(); + let report = + kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap(); assert_eq!(report.new, 1); let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external"); - let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap() + let entries: Vec<_> = fs::read_dir(&ext_dir) + .unwrap() .filter_map(std::result::Result::ok) .collect(); let content = fs::read_to_string(entries[0].path()).unwrap(); diff --git a/crates/kebab-app/tests/init_template.rs b/crates/kebab-app/tests/init_template.rs index d991992..3319751 100644 --- a/crates/kebab-app/tests/init_template.rs +++ b/crates/kebab-app/tests/init_template.rs @@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() { } kebab_app::init_workspace(true).expect("init_workspace"); let cfg_path = kebab_config::Config::xdg_config_path(); - let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| { - panic!("read config at {}: {e}", cfg_path.display()) - }); + let body = std::fs::read_to_string(&cfg_path) + .unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display())); assert!( body.contains("처리 가능한 형식"), "header lists supported types section: body=\n{body}" diff --git a/crates/kebab-app/tests/pdf_ocr_apply.rs b/crates/kebab-app/tests/pdf_ocr_apply.rs index 3f8eb4d..e361674 100644 --- a/crates/kebab-app/tests/pdf_ocr_apply.rs +++ b/crates/kebab-app/tests/pdf_ocr_apply.rs @@ -9,9 +9,8 @@ use std::sync::atomic::AtomicBool; use 
common::mock_ocr::MockOcrEngine; use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages}; use kebab_core::{ - AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, - Extractor, Inline, Lang, MediaType, RawAsset, SourceSpan, - SourceUri, WorkspacePath, id_for_asset, + AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor, + Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset, }; use kebab_parse_pdf::PdfTextExtractor; use time::OffsetDateTime; @@ -258,8 +257,8 @@ fn f6_flatedecode_skipped_with_warning() { // Test 7: F7 CCITTFax → skip + warning (verifier M-4 split) #[test] fn f7_ccittfax_skipped_with_warning() { - let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf") - .expect("F7 fixture missing"); + let bytes = + std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing"); let mut canonical = canonical_with_empty_block(); // page-1 block from F1 let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false); let opts = default_opts(true); diff --git a/crates/kebab-app/tests/pdf_pipeline.rs b/crates/kebab-app/tests/pdf_pipeline.rs index 4ae7e13..07fb1f7 100644 --- a/crates/kebab-app/tests/pdf_pipeline.rs +++ b/crates/kebab-app/tests/pdf_pipeline.rs @@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec { operations: vec![ Operation::new("BT", vec![]), Operation::new("Tf", vec!["F1".into(), 24.into()]), - Operation::new( - "Td", - vec![Object::Integer(100), Object::Integer(700)], - ), + Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]), Operation::new("Tj", vec![Object::string_literal(*text)]), Operation::new("ET", vec![]), ], }; let stream_data = content.encode().expect("content encode"); - let content_id = - doc.add_object(Stream::new(dictionary! {}, stream_data)); + let content_id = doc.add_object(Stream::new(dictionary! 
{}, stream_data)); page_dict.set("Contents", content_id); } let page_id = doc.add_object(page_dict); @@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec { Object::Integer(842), ], }; - doc.objects - .insert(pages_id, Object::Dictionary(pages_dict)); + doc.objects.insert(pages_id, Object::Dictionary(pages_dict)); let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", @@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { write_pdf(&env.workspace_root, "three.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false) - .expect("PDF ingest must succeed"); + let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false) + .expect("PDF ingest must succeed"); assert_eq!(report.errors, 0); let items = report.items.as_ref().expect("items present"); @@ -157,8 +151,16 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { .find(|i| i.doc_path.0.ends_with("three.pdf")) .expect("PDF item present"); assert_eq!(pdf_item.kind, IngestItemKind::New); - assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page"); - assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page"); + assert_eq!( + pdf_item.block_count, + Some(3), + "one Block::Paragraph per page" + ); + assert_eq!( + pdf_item.chunk_count, + Some(3), + "one chunk per non-empty page" + ); assert_eq!( pdf_item.parser_version.as_ref().map(|p| p.0.as_str()), Some("pdf-text-v1") @@ -169,11 +171,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() { ); // Inspect the stored doc to confirm SourceSpan::Page round-trip. 
- let doc = kebab_app::inspect_doc_with_config( - cfg, - pdf_item.doc_id.as_ref().unwrap(), - ) - .expect("inspect_doc returns the PDF document"); + let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()) + .expect("inspect_doc returns the PDF document"); assert_eq!(doc.blocks.len(), 3); for (i, block) in doc.blocks.iter().enumerate() { let want_page = (i as u32) + 1; @@ -202,8 +201,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() { write_pdf(&env.workspace_root, "stable.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report1 = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); let item1 = report1 .items .as_ref() @@ -214,8 +212,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() { .unwrap(); assert_eq!(item1.kind, IngestItemKind::New); - let report2 = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); let item2 = report2 .items .unwrap() @@ -239,8 +236,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() { std::fs::write(&path, &bytes_v1).unwrap(); let cfg = cfg_with_pdf(&env); - let report_v1 = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); let id_v1 = report_v1 .items .as_ref() @@ -252,12 +248,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() { .clone() .unwrap(); - let bytes_v2 = - build_text_pdf(&[Some("VERSION TWO entirely different body content.")]); + let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]); std::fs::write(&path, &bytes_v2).unwrap(); - let report_v2 = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), 
false).unwrap(); let item_v2 = report_v2 .items .as_ref() @@ -282,9 +276,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() { write_pdf(&env.workspace_root, "secret.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); - assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once"); + let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); + assert_eq!( + report.errors, 1, + "encrypted PDF must increment errors exactly once" + ); let items = report.items.as_ref().unwrap(); let pdf_item = items .iter() @@ -310,9 +306,11 @@ fn corrupt_pdf_fails_without_storing() { write_pdf(&env.workspace_root, "corrupt.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); - assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once"); + let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + assert_eq!( + report.errors, 1, + "corrupt PDF must increment errors exactly once" + ); let items = report.items.as_ref().unwrap(); let pdf_item = items .iter() @@ -322,11 +320,8 @@ fn corrupt_pdf_fails_without_storing() { // Confirm the doc was NOT stored — list_docs returns nothing for // this path. 
- let summaries = kebab_app::list_docs_with_config( - cfg, - kebab_core::DocFilter::default(), - ) - .unwrap(); + let summaries = + kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap(); assert!( !summaries .iter() @@ -341,14 +336,15 @@ fn corrupt_pdf_fails_without_storing() { #[test] fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() { let env = TestEnv::lexical_only(); - let bytes = - build_text_pdf(&[Some("first page"), None, Some("third page")]); + let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]); write_pdf(&env.workspace_root, "mixed.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); - assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error"); + let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + assert_eq!( + report.errors, 0, + "scanned candidate is a Warning, not Error" + ); let pdf_item = report .items .as_ref() @@ -368,11 +364,7 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() { "pdf-page-v1.1 emits 0 chunks for the empty page; total = 2" ); - let doc = kebab_app::inspect_doc_with_config( - cfg, - pdf_item.doc_id.as_ref().unwrap(), - ) - .unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap(); let warnings: Vec<_> = doc .provenance .events @@ -419,8 +411,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() { write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf()); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap(); let total = report.new + report.updated + report.skipped + report.errors; assert_eq!( report.scanned, total, @@ -441,14 +432,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() { let pages: Vec = (1..=50) .map(|i| 
format!("Page {i} body — lorem ipsum dolor sit amet.")) .collect(); - let page_refs: Vec> = - pages.iter().map(|s| Some(s.as_str())).collect(); + let page_refs: Vec> = pages.iter().map(|s| Some(s.as_str())).collect(); let bytes = build_text_pdf(&page_refs); write_pdf(&env.workspace_root, "long.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); assert_eq!(report.errors, 0); let pdf_item = report .items @@ -466,8 +455,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() { // Round-trip: list_docs sees the long PDF. let summaries = - kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()) - .unwrap(); + kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap(); assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf"))); } @@ -476,13 +464,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() { #[test] fn inspect_doc_surfaces_page_spans() { let env = TestEnv::lexical_only(); - let bytes = - build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]); + let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]); write_pdf(&env.workspace_root, "inspect.pdf", &bytes); let cfg = cfg_with_pdf(&env); - let report = - kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); + let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap(); let pdf_item = report .items .as_ref() @@ -490,19 +476,12 @@ fn inspect_doc_surfaces_page_spans() { .iter() .find(|i| i.doc_path.0.ends_with("inspect.pdf")) .unwrap(); - let doc = kebab_app::inspect_doc_with_config( - cfg, - pdf_item.doc_id.as_ref().unwrap(), - ) - .unwrap(); + let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap(); assert_eq!(doc.parser_version.0, "pdf-text-v1"); 
assert_eq!(doc.blocks.len(), 3); for block in &doc.blocks { match block { - Block::Paragraph(p) => assert!(matches!( - p.common.source_span, - SourceSpan::Page { .. } - )), + Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })), other => panic!("expected Paragraph, got {other:?}"), } } diff --git a/crates/kebab-app/tests/reset_orphans.rs b/crates/kebab-app/tests/reset_orphans.rs index 6f51f1a..100aa16 100644 --- a/crates/kebab-app/tests/reset_orphans.rs +++ b/crates/kebab-app/tests/reset_orphans.rs @@ -78,19 +78,15 @@ fn reset_orphans_only_purges_out_of_scope_docs() { narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()]; // Run orphans-only reset. - let report = execute(ResetScope::OrphansOnly, &narrow_cfg) - .expect("orphans-only reset must succeed"); + let report = + execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed"); assert_eq!( report.orphans_purged, 2, "expected 2 orphans purged (b.rs + c.rs): {report:?}" ); - let mut purged: Vec = report - .purged_paths - .iter() - .map(|p| p.0.clone()) - .collect(); + let mut purged: Vec = report.purged_paths.iter().map(|p| p.0.clone()).collect(); purged.sort(); assert_eq!( purged, diff --git a/crates/kebab-app/tests/schema_active_versions.rs b/crates/kebab-app/tests/schema_active_versions.rs index bd9d118..b39288a 100644 --- a/crates/kebab-app/tests/schema_active_versions.rs +++ b/crates/kebab-app/tests/schema_active_versions.rs @@ -37,8 +37,14 @@ fn schema_models_active_arrays_empty_on_empty_corpus() { drop(store); let s = schema_with_config(&cfg).unwrap(); - assert!(s.models.active_parsers.is_empty(), "empty corpus → no parsers"); - assert!(s.models.active_chunkers.is_empty(), "empty corpus → no chunkers"); + assert!( + s.models.active_parsers.is_empty(), + "empty corpus → no parsers" + ); + assert!( + s.models.active_chunkers.is_empty(), + "empty corpus → no chunkers" + ); // backward compat: 기존 단일 field 는 markdown default 보존. 
assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION); } @@ -55,10 +61,19 @@ fn schema_emits_active_parsers_and_chunkers_array_after_ingest() { kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap(); let s = schema_with_config(&cfg).unwrap(); - assert!(!s.models.active_parsers.is_empty(), "active_parsers populated after ingest"); - assert!(!s.models.active_chunkers.is_empty(), "active_chunkers populated after ingest"); + assert!( + !s.models.active_parsers.is_empty(), + "active_parsers populated after ingest" + ); + assert!( + !s.models.active_chunkers.is_empty(), + "active_chunkers populated after ingest" + ); // active arrays must be sorted (ORDER BY in SQL). let mut sorted = s.models.active_parsers.clone(); sorted.sort(); - assert_eq!(s.models.active_parsers, sorted, "active_parsers must be sorted"); + assert_eq!( + s.models.active_parsers, sorted, + "active_parsers must be sorted" + ); } diff --git a/crates/kebab-app/tests/search_budget_integration.rs b/crates/kebab-app/tests/search_budget_integration.rs index c309b69..d6b30b9 100644 --- a/crates/kebab-app/tests/search_budget_integration.rs +++ b/crates/kebab-app/tests/search_budget_integration.rs @@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() { assert_eq!(resp.hits.len(), baseline.len()); assert!(!resp.truncated); - assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page"); + assert!( + resp.next_cursor.is_none(), + "k=5 against 1 doc → no next page" + ); } #[test] @@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() { fn cursor_paginates_to_next_page() { let env = common::TestEnv::new(); for i in 0..6 { - common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n")); + common::ingest_md( + &env, + &format!("d{i}.md"), + &format!("# T{i}\n\nrust topic {i}\n"), + ); } let app = env.app(); @@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() { page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect(); let p2_ids: 
std::collections::HashSet<_> = page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect(); - assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits"); + assert!( + p1_ids.is_disjoint(&p2_ids), + "page 2 must not repeat page 1 hits" + ); } #[test] diff --git a/crates/kebab-app/tests/search_korean.rs b/crates/kebab-app/tests/search_korean.rs index eaff918..05646f0 100644 --- a/crates/kebab-app/tests/search_korean.rs +++ b/crates/kebab-app/tests/search_korean.rs @@ -75,11 +75,9 @@ fn lexical_multi_token_korean_query_hits() { kebab_app::ingest_with_config(env.config.clone(), env.scope(), true) .expect("ingest must succeed"); - let hits = kebab_app::search_with_config( - env.config.clone(), - common::lexical_query("해시 충돌"), - ) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌")) + .expect("search must succeed"); assert!( !hits.is_empty(), @@ -113,11 +111,9 @@ fn lexical_mixed_korean_english_multi_token_query_hits() { kebab_app::ingest_with_config(env.config.clone(), env.scope(), true) .expect("ingest must succeed"); - let hits = kebab_app::search_with_config( - env.config.clone(), - common::lexical_query("Rust 충돌은"), - ) - .expect("search must succeed"); + let hits = + kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은")) + .expect("search must succeed"); assert!( !hits.is_empty(), diff --git a/crates/kebab-app/tests/search_lexical.rs b/crates/kebab-app/tests/search_lexical.rs index 36c7d3e..5180d28 100644 --- a/crates/kebab-app/tests/search_lexical.rs +++ b/crates/kebab-app/tests/search_lexical.rs @@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() { fn lexical_search_empty_query_returns_empty() { let env = TestEnv::lexical_only(); kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); - let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")) - .unwrap(); + let hits = + 
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap(); assert!(hits.is_empty(), "blank query must short-circuit empty"); } @@ -107,17 +107,17 @@ fn search_uncached_returns_same_hits_as_cached() { #[test] fn first_ingest_bumps_corpus_revision() { let env = TestEnv::lexical_only(); - let store_before = - kebab_store_sqlite::SqliteStore::open(&env.config).unwrap(); + let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap(); store_before.run_migrations().unwrap(); assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0"); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); - assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc"); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + assert!( + report.new + report.updated > 0, + "first ingest must commit ≥1 doc" + ); - let store_after = - kebab_store_sqlite::SqliteStore::open(&env.config).unwrap(); + let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap(); assert!( store_after.corpus_revision() >= 1, "ingest commit must bump corpus_revision (got {})", diff --git a/crates/kebab-app/tests/search_stale_integration.rs b/crates/kebab-app/tests/search_stale_integration.rs index dff1639..c3020dd 100644 --- a/crates/kebab-app/tests/search_stale_integration.rs +++ b/crates/kebab-app/tests/search_stale_integration.rs @@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() { assert!( hits.iter().all(|h| !h.stale), "freshly-ingested doc must not be stale at default 30d threshold: {:?}", - hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::>() + hits.iter() + .map(|h| (h.doc_path.0.clone(), h.stale)) + .collect::>() ); } @@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() { assert!( hits.iter().all(|h| !h.stale), "threshold=0 disables staleness even for year-old docs: {:?}", - hits.iter().map(|h| 
(h.doc_path.0.clone(), h.stale)).collect::>() + hits.iter() + .map(|h| (h.doc_path.0.clone(), h.stale)) + .collect::>() ); } diff --git a/crates/kebab-app/tests/search_vector.rs b/crates/kebab-app/tests/search_vector.rs index 6fec251..ffaf90a 100644 --- a/crates/kebab-app/tests/search_vector.rs +++ b/crates/kebab-app/tests/search_vector.rs @@ -14,7 +14,8 @@ use common::TestEnv; fn require_avx_or_panic() { #[cfg(target_arch = "x86_64")] { - assert!(std::is_x86_feature_detected!("avx"), + assert!( + std::is_x86_feature_detected!("avx"), "kb-app vector integration test requires AVX-capable hardware; \ host CPU lacks AVX. Run on an AVX-capable machine." ); @@ -28,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() { require_avx_or_panic(); let env = TestEnv::with_embeddings(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); assert_eq!(report.errors, 0, "no per-file errors: {report:?}"); assert_eq!(report.new, 3); @@ -55,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() { require_avx_or_panic(); let env = TestEnv::with_embeddings(); - let report = - kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap(); assert_eq!(report.errors, 0, "no per-file errors: {report:?}"); assert_eq!(report.new, 3); diff --git a/crates/kebab-app/tests/skip_reason.rs b/crates/kebab-app/tests/skip_reason.rs index 0bd9340..3db5613 100644 --- a/crates/kebab-app/tests/skip_reason.rs +++ b/crates/kebab-app/tests/skip_reason.rs @@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() { std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap(); std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap(); - let report = kebab_app::ingest_with_config( - 
env.config.clone(), - env.scope(), - false, - ).unwrap(); + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap(); let items = report.items.as_ref().expect("items array populated"); let docx_item = items @@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() { vec!["unsupported media type: ".to_string()], ); assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1)); - assert_eq!(report.skipped_by_extension.get("").copied(), Some(1)); + assert_eq!( + report.skipped_by_extension.get("").copied(), + Some(1) + ); } diff --git a/crates/kebab-app/tests/twin_files_fetch_span.rs b/crates/kebab-app/tests/twin_files_fetch_span.rs index b8c1d9b..620740e 100644 --- a/crates/kebab-app/tests/twin_files_fetch_span.rs +++ b/crates/kebab-app/tests/twin_files_fetch_span.rs @@ -44,8 +44,8 @@ fn twin_files_fetch_span_uses_correct_asset() { std::fs::write(dir_b.join("note.md"), content).unwrap(); // Ingest all files (fixture workspace + our two new twins). - let report = ingest_with_config(env.config.clone(), env.scope(), false) - .expect("ingest must succeed"); + let report = + ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed"); assert_eq!(report.errors, 0, "no ingest errors; report={report:?}"); // Both twin paths must appear as New in the report. @@ -53,8 +53,7 @@ fn twin_files_fetch_span_uses_correct_asset() { let twin_items: Vec<_> = items .iter() .filter(|i| { - i.doc_path.0.ends_with("src_a/note.md") - || i.doc_path.0.ends_with("src_b/note.md") + i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md") }) .collect(); assert_eq!( @@ -149,7 +148,10 @@ fn twin_files_fetch_span_uses_correct_asset() { // at either twin, making one twin's span fetch behave incorrectly. 
let report2 = ingest_with_config(env.config.clone(), env.scope(), false) .expect("second ingest must succeed"); - assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}"); + assert_eq!( + report2.errors, 0, + "no ingest errors on second run; report={report2:?}" + ); // Re-open app after second ingest and verify span still works on both. let app2 = env.app(); diff --git a/crates/kebab-app/tests/twin_files_idempotent.rs b/crates/kebab-app/tests/twin_files_idempotent.rs index 9b08bf4..1eb92a2 100644 --- a/crates/kebab-app/tests/twin_files_idempotent.rs +++ b/crates/kebab-app/tests/twin_files_idempotent.rs @@ -43,9 +43,7 @@ fn twin_files_second_ingest_is_unchanged() { let items = first.items.as_ref().expect("items must be present"); let twin_items: Vec<_> = items .iter() - .filter(|i| { - i.doc_path.0.ends_with("__init__.py") - }) + .filter(|i| i.doc_path.0.ends_with("__init__.py")) .collect(); assert_eq!( twin_items.len(), @@ -63,8 +61,14 @@ fn twin_files_second_ingest_is_unchanged() { // Second ingest — same files, same content → both must be Unchanged. 
let second = ingest_with_config(env.config.clone(), env.scope(), false) .expect("second ingest must succeed"); - assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}"); - assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}"); + assert_eq!( + second.errors, 0, + "second ingest: no errors; report={second:?}" + ); + assert_eq!( + second.new, 0, + "second ingest: no new docs; report={second:?}" + ); assert_eq!( second.updated, 0, "second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}" diff --git a/crates/kebab-chunk/src/code_c_ast_v1.rs b/crates/kebab-chunk/src/code_c_ast_v1.rs index 24a89b8..4e97059 100644 --- a/crates/kebab-chunk/src/code_c_ast_v1.rs +++ b/crates/kebab-chunk/src/code_c_ast_v1.rs @@ -39,17 +39,11 @@ impl Chunker for CodeCAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, - _ => anyhow::bail!( - "CodeCAstV1Chunker only handles code docs (got non-Code block)" - ), + _ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"), }; if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { anyhow::bail!( @@ -68,9 +62,12 @@ impl Chunker for CodeCAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +81,13 @@ impl Chunker for CodeCAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +95,7 @@ impl Chunker for CodeCAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +103,13 @@ impl Chunker for CodeCAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +188,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, 
+ CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,39 +211,60 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("c".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("c".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("c".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: 
ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_c_ast_v1() { - assert_eq!(CodeCAstV1Chunker.chunker_version(), - ChunkerVersion("code-c-ast-v1".into())); + assert_eq!( + CodeCAstV1Chunker.chunker_version(), + ChunkerVersion("code-c-ast-v1".into()) + ); } #[test] @@ -256,7 +282,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-c-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +297,32 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::(); + let body = (0..500) + .map(|i| format!("\tx{i} = {i};\n")) + .collect::(); let code = format!("int big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +336,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeCAstV1Chunker")); @@ -304,11 +346,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); - let base: Vec = CodeCAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeCAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeCAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeCAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +366,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeCAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeCAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_cpp_ast_v1.rs b/crates/kebab-chunk/src/code_cpp_ast_v1.rs index f69b22d..942eb8e 100644 --- a/crates/kebab-chunk/src/code_cpp_ast_v1.rs +++ 
b/crates/kebab-chunk/src/code_cpp_ast_v1.rs @@ -39,17 +39,13 @@ impl Chunker for CodeCppAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, - _ => anyhow::bail!( - "CodeCppAstV1Chunker only handles code docs (got non-Code block)" - ), + _ => { + anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)") + } }; if !matches!(c.common.source_span, SourceSpan::Code { .. }) { anyhow::bail!( @@ -68,9 +64,12 @@ impl Chunker for CodeCppAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeCppAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeCppAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeCppAstV1Chunker { lang: 
lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,39 +213,60 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("cpp".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + 
user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("cpp".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_cpp_ast_v1() { - assert_eq!(CodeCppAstV1Chunker.chunker_version(), - ChunkerVersion("code-cpp-ast-v1".into())); + assert_eq!( + CodeCppAstV1Chunker.chunker_version(), + ChunkerVersion("code-cpp-ast-v1".into()) + ); } #[test] @@ -256,7 +284,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-cpp-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. 
+ } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +299,32 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::(); + let body = (0..500) + .map(|i| format!("\tx{i} = {i};\n")) + .collect::(); let code = format!("int big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. } => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +338,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeCppAstV1Chunker")); @@ -304,11 +348,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); - let base: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeCppAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let 
again: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeCppAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +368,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeCppAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeCppAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_go_ast_v1.rs b/crates/kebab-chunk/src/code_go_ast_v1.rs index 16023ee..e9d8b76 100644 --- a/crates/kebab-chunk/src/code_go_ast_v1.rs +++ b/crates/kebab-chunk/src/code_go_ast_v1.rs @@ -39,17 +39,13 @@ impl Chunker for CodeGoAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, - _ => anyhow::bail!( - "CodeGoAstV1Chunker only handles code docs (got non-Code block)" - ), + _ => { + anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)") + } }; if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { anyhow::bail!( @@ -68,9 +64,12 @@ impl Chunker for CodeGoAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeGoAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeGoAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeGoAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, 
CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,46 +213,72 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("go".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("go".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("go".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - 
chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_go_ast_v1() { - assert_eq!(CodeGoAstV1Chunker.chunker_version(), - ChunkerVersion("code-go-ast-v1".into())); + assert_eq!( + CodeGoAstV1Chunker.chunker_version(), + ChunkerVersion("code-go-ast-v1".into()) + ); } #[test] fn one_chunk_per_unit_preserves_code_span() { let doc = code_doc(&[ ("parse", 1, 3, "func parse() {\n\t// x\n}"), - ("Foo.double", 5, 7, "func double() int {\n\t//\n\treturn 0\n}"), + ( + "Foo.double", + 5, + 7, + "func double() int {\n\t//\n\treturn 0\n}", + ), ]); let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap(); assert_eq!(chunks.len(), 2); @@ -256,7 +289,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-go-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +304,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!("\tx{i} := {i}")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!("\tx{i} := {i}")) + .collect::>() + .join("\n"); let code = format!("func big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +344,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeGoAstV1Chunker")); @@ -304,11 +354,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]); - let base: Vec = CodeGoAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeGoAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeGoAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeGoAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +374,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeGoAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeGoAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_java_ast_v1.rs b/crates/kebab-chunk/src/code_java_ast_v1.rs index 8ebe86d..0f47540 100644 --- a/crates/kebab-chunk/src/code_java_ast_v1.rs +++ 
b/crates/kebab-chunk/src/code_java_ast_v1.rs @@ -39,11 +39,7 @@ impl Chunker for CodeJavaAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, @@ -68,9 +64,12 @@ impl Chunker for CodeJavaAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeJavaAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeJavaAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeJavaAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) 
-> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,39 +213,60 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("java".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("java".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("java".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, 
- last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_java_ast_v1() { - assert_eq!(CodeJavaAstV1Chunker.chunker_version(), - ChunkerVersion("code-java-ast-v1".into())); + assert_eq!( + CodeJavaAstV1Chunker.chunker_version(), + ChunkerVersion("code-java-ast-v1".into()) + ); } #[test] @@ -256,7 +284,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-java-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +299,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!("\tint x{i} = {i};")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!("\tint x{i} = {i};")) + .collect::>() + .join("\n"); let code = format!("void big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +339,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeJavaAstV1Chunker")); @@ -304,11 +349,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]); - let base: Vec = CodeJavaAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeJavaAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeJavaAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeJavaAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +369,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeJavaAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeJavaAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_js_ast_v1.rs b/crates/kebab-chunk/src/code_js_ast_v1.rs index 2019768..ae0bc2e 100644 --- a/crates/kebab-chunk/src/code_js_ast_v1.rs 
+++ b/crates/kebab-chunk/src/code_js_ast_v1.rs @@ -39,17 +39,13 @@ impl Chunker for CodeJsAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, - _ => anyhow::bail!( - "CodeJsAstV1Chunker only handles code docs (got non-Code block)" - ), + _ => { + anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)") + } }; if !matches!(c.common.source_span, SourceSpan::Code { .. }) { anyhow::bail!( @@ -68,9 +64,12 @@ impl Chunker for CodeJsAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeJsAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeJsAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeJsAstV1Chunker { lang: lang.clone(), 
}; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,46 +213,72 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("javascript".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + 
user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("javascript".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_js_ast_v1() { - assert_eq!(CodeJsAstV1Chunker.chunker_version(), - ChunkerVersion("code-js-ast-v1".into())); + assert_eq!( + CodeJsAstV1Chunker.chunker_version(), + ChunkerVersion("code-js-ast-v1".into()) + ); } #[test] fn one_chunk_per_unit_preserves_code_span() { let doc = code_doc(&[ ("parse", 1, 3, "function parse() {\n // x\n}"), - ("Foo.double", 5, 7, "function double() {\n //\n return 0;\n}"), + ( + "Foo.double", + 5, + 7, + "function double() {\n //\n return 0;\n}", + ), ]); let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap(); assert_eq!(chunks.len(), 2); @@ -256,7 +289,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-js-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. 
+ } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +304,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!(" const x{i} = {i};")) + .collect::>() + .join("\n"); let code = format!("function big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. } => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +344,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeJsAstV1Chunker")); @@ -304,11 +354,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]); - let base: Vec = CodeJsAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeJsAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + 
.collect(); for _ in 0..1000 { - let again: Vec = CodeJsAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeJsAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +374,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeJsAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeJsAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs index d416f5e..c992699 100644 --- a/crates/kebab-chunk/src/code_kotlin_ast_v1.rs +++ b/crates/kebab-chunk/src/code_kotlin_ast_v1.rs @@ -39,11 +39,7 @@ impl Chunker for CodeKotlinAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, @@ -68,9 +64,12 @@ impl Chunker for CodeKotlinAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeKotlinAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } 
else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeKotlinAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeKotlinAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,46 +213,72 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("kotlin".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - 
aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("kotlin".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_kotlin_ast_v1() { - assert_eq!(CodeKotlinAstV1Chunker.chunker_version(), - ChunkerVersion("code-kotlin-ast-v1".into())); + assert_eq!( + CodeKotlinAstV1Chunker.chunker_version(), + ChunkerVersion("code-kotlin-ast-v1".into()) + ); } #[test] fn one_chunk_per_unit_preserves_code_span() { let doc = code_doc(&[ ("parse", 1, 3, "fun parse() {\n\t// x\n}"), - ("Foo.double", 5, 7, "fun double(): Int {\n\t//\n\treturn 0\n}"), + ( + "Foo.double", + 5, + 7, + "fun double(): Int {\n\t//\n\treturn 0\n}", + ), ]); let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap(); assert_eq!(chunks.len(), 2); @@ -256,7 +289,12 @@ mod tests { 
assert_eq!(c.chunker_version.0, "code-kotlin-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +304,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!("\tval x{i} = {i}")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!("\tval x{i} = {i}")) + .collect::>() + .join("\n"); let code = format!("fun big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +344,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeKotlinAstV1Chunker")); @@ -304,11 +354,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]); - let base: Vec = CodeKotlinAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeKotlinAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeKotlinAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeKotlinAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +374,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeKotlinAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeKotlinAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_python_ast_v1.rs b/crates/kebab-chunk/src/code_python_ast_v1.rs index aa5f41d..246a3e0 100644 --- 
a/crates/kebab-chunk/src/code_python_ast_v1.rs +++ b/crates/kebab-chunk/src/code_python_ast_v1.rs @@ -39,11 +39,7 @@ impl Chunker for CodePythonAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, @@ -68,9 +64,12 @@ impl Chunker for CodePythonAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodePythonAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodePythonAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodePythonAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + 
text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,39 +213,60 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("python".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("python".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("python".into()), }, provenance: Provenance { 
events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_python_ast_v1() { - assert_eq!(CodePythonAstV1Chunker.chunker_version(), - ChunkerVersion("code-python-ast-v1".into())); + assert_eq!( + CodePythonAstV1Chunker.chunker_version(), + ChunkerVersion("code-python-ast-v1".into()) + ); } #[test] @@ -256,7 +284,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-python-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +299,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!(" x{i} = {i}")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!(" x{i} = {i}")) + .collect::>() + .join("\n"); let code = format!("def big():\n{body}\n"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +339,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodePythonAstV1Chunker")); @@ -304,11 +349,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]); - let base: Vec = CodePythonAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodePythonAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodePythonAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodePythonAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +369,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodePythonAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodePythonAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_rust_ast_v1.rs b/crates/kebab-chunk/src/code_rust_ast_v1.rs index 67a26e5..83dcda3 100644 --- 
a/crates/kebab-chunk/src/code_rust_ast_v1.rs +++ b/crates/kebab-chunk/src/code_rust_ast_v1.rs @@ -39,11 +39,7 @@ impl Chunker for CodeRustAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, @@ -68,9 +64,12 @@ impl Chunker for CodeRustAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeRustAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeRustAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeRustAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } 
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,39 +213,60 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("rust".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("rust".into()), }, provenance: Provenance { events: vec![] }, - 
parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_rust_ast_v1() { - assert_eq!(CodeRustAstV1Chunker.chunker_version(), - ChunkerVersion("code-rust-ast-v1".into())); + assert_eq!( + CodeRustAstV1Chunker.chunker_version(), + ChunkerVersion("code-rust-ast-v1".into()) + ); } #[test] @@ -256,7 +284,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-rust-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +299,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!(" let x{i} = {i};")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!(" let x{i} = {i};")) + .collect::>() + .join("\n"); let code = format!("pub fn big() {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +339,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeRustAstV1Chunker")); @@ -304,11 +349,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]); - let base: Vec = CodeRustAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeRustAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeRustAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeRustAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +369,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeRustAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeRustAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/code_text_paragraph_v1.rs b/crates/kebab-chunk/src/code_text_paragraph_v1.rs index cda5b99..8b885da 100644 --- 
a/crates/kebab-chunk/src/code_text_paragraph_v1.rs +++ b/crates/kebab-chunk/src/code_text_paragraph_v1.rs @@ -9,7 +9,7 @@ use crate::tier2_shared::{build_chunk_no_symbol, policy_hash}; use anyhow::Result; -use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion}; pub const VERSION_LABEL: &str = "code-text-paragraph-v1"; diff --git a/crates/kebab-chunk/src/code_ts_ast_v1.rs b/crates/kebab-chunk/src/code_ts_ast_v1.rs index 97de14e..e76af55 100644 --- a/crates/kebab-chunk/src/code_ts_ast_v1.rs +++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs @@ -39,17 +39,13 @@ impl Chunker for CodeTsAstV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { for b in &doc.blocks { let c = match b { Block::Code(c) => c, - _ => anyhow::bail!( - "CodeTsAstV1Chunker only handles code docs (got non-Code block)" - ), + _ => { + anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)") + } }; if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { anyhow::bail!( @@ -68,9 +64,12 @@ impl Chunker for CodeTsAstV1Chunker { _ => unreachable!("validated above"), }; let (ls, le, symbol, lang) = match &cb.common.source_span { - SourceSpan::Code { line_start, line_end, symbol, lang } => { - (*line_start, *line_end, symbol.clone(), lang.clone()) - } + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => (*line_start, *line_end, symbol.clone(), lang.clone()), _ => unreachable!("validated above"), }; let block_ids: Vec = vec![cb.common.block_id.clone()]; @@ -84,8 +83,13 @@ impl Chunker for CodeTsAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - None, span, cb.code.clone(), + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + None, + span, + cb.code.clone(), )); } else { let parts = split_oversize(&cb.code); @@ -93,9 +97,7 @@ impl Chunker for CodeTsAstV1Chunker { for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { let part_ls = ls + off_start; let part_le = ls + off_end; - let part_sym = symbol - .as_ref() - .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1)); let span = SourceSpan::Code { line_start: part_ls, line_end: part_le, @@ -103,8 +105,13 @@ impl Chunker for CodeTsAstV1Chunker { lang: lang.clone(), }; out.push(make_chunk( - doc, &chunker_version, &block_ids, &base_policy_hash, - Some(part_ls), span, text, + doc, + &chunker_version, + &block_ids, + &base_policy_hash, + Some(part_ls), + span, + text, )); } } @@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { mod tests { use super::*; use kebab_core::{ - Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, - SourceType, TrustLevel, WorkspacePath, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, 
CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use time::OffsetDateTime; @@ -206,46 +213,72 @@ mod tests { }; let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); Block::Code(CodeBlock { - common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, lang: Some("typescript".into()), code: (*code).to_string(), }) }) .collect(); CanonicalDocument { - doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), - lang: Lang("und".into()), blocks, + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "a".into(), + lang: Lang("und".into()), + blocks, metadata: Metadata { - aliases: vec![], tags: vec![], + aliases: vec![], + tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), - source_type: SourceType::Note, trust_level: TrustLevel::Primary, - user_id_alias: None, user: Default::default(), - repo: Some("kebab".into()), git_branch: Some("main".into()), - git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("typescript".into()), }, provenance: Provenance { events: vec![] }, - parser_version: pv, schema_version: 1, doc_version: 1, - last_chunker_version: None, last_embedding_version: None, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, } } fn policy() -> ChunkPolicy { - ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, 
respect_markdown_headings: false, - chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + chunker_version: ChunkerVersion(VERSION_LABEL.into()), + } } #[test] fn chunker_version_is_code_ts_ast_v1() { - assert_eq!(CodeTsAstV1Chunker.chunker_version(), - ChunkerVersion("code-ts-ast-v1".into())); + assert_eq!( + CodeTsAstV1Chunker.chunker_version(), + ChunkerVersion("code-ts-ast-v1".into()) + ); } #[test] fn one_chunk_per_unit_preserves_code_span() { let doc = code_doc(&[ ("parse", 1, 3, "function parse(): void {\n // x\n}"), - ("Foo.double", 5, 7, "function double(): number {\n //\n return 0;\n}"), + ( + "Foo.double", + 5, + 7, + "function double(): number {\n //\n return 0;\n}", + ), ]); let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap(); assert_eq!(chunks.len(), 2); @@ -256,7 +289,12 @@ mod tests { assert_eq!(c.chunker_version.0, "code-ts-ast-v1"); } match &chunks[0].source_spans[0] { - SourceSpan::Code { symbol, line_start, line_end, .. } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + .. + } => { assert_eq!(symbol.as_deref(), Some("parse")); assert_eq!((*line_start, *line_end), (1, 3)); } @@ -266,22 +304,33 @@ mod tests { #[test] fn oversize_unit_splits_into_parts_with_unique_ids() { - let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::>().join("\n"); + let body = (0..500) + .map(|i| format!(" const x{i} = {i};")) + .collect::>() + .join("\n"); let code = format!("function big(): void {{\n{body}\n}}"); let doc = code_doc(&[("big", 1, 502, &code)]); let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap(); - assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + assert!( + chunks.len() >= 2, + "oversize unit must split, got {}", + chunks.len() + ); for c in &chunks { match &c.source_spans[0] { SourceSpan::Code { symbol, .. 
} => { - assert!(symbol.as_deref().unwrap().starts_with("big [part "), - "part-numbered symbol, got {symbol:?}"); + assert!( + symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}" + ); } _ => unreachable!(), } } let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); - let n = ids.len(); ids.sort_unstable(); ids.dedup(); + let n = ids.len(); + ids.sort_unstable(); + ids.dedup(); assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); } @@ -295,7 +344,8 @@ mod tests { heading_path: vec![], source_span: SourceSpan::Line { start: 1, end: 1 }, }, - text: "x".into(), inlines: vec![], + text: "x".into(), + inlines: vec![], })]; let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); assert!(err.to_string().contains("CodeTsAstV1Chunker")); @@ -304,11 +354,19 @@ mod tests { #[test] fn deterministic_chunk_ids_1000() { let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]); - let base: Vec = CodeTsAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let base: Vec = CodeTsAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); for _ in 0..1000 { - let again: Vec = CodeTsAstV1Chunker.chunk(&doc, &policy()) - .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + let again: Vec = CodeTsAstV1Chunker + .chunk(&doc, &policy()) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); assert_eq!(again, base); } } @@ -316,7 +374,9 @@ mod tests { #[test] fn policy_hash_matches_md_heading_v1() { let p = policy(); - assert_eq!(CodeTsAstV1Chunker.policy_hash(&p), - crate::MdHeadingV1Chunker.policy_hash(&p)); + assert_eq!( + CodeTsAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p) + ); } } diff --git a/crates/kebab-chunk/src/dockerfile_file_v1.rs b/crates/kebab-chunk/src/dockerfile_file_v1.rs index 230d86c..3905f13 100644 --- 
a/crates/kebab-chunk/src/dockerfile_file_v1.rs +++ b/crates/kebab-chunk/src/dockerfile_file_v1.rs @@ -7,7 +7,7 @@ use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; use anyhow::Result; -use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion}; pub const VERSION_LABEL: &str = "dockerfile-file-v1"; diff --git a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs index 5e2b384..55e0f66 100644 --- a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs +++ b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs @@ -8,7 +8,7 @@ use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; use anyhow::Result; -use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion}; pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1"; @@ -49,19 +49,14 @@ impl Chunker for K8sManifestResourceV1Chunker { .get("apiVersion") .and_then(|v| v.as_str()) .unwrap_or(""); - let kind = mapping - .get("kind") - .and_then(|v| v.as_str()) - .unwrap_or(""); + let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or(""); // Skip non-k8s documents. 
if api.is_empty() || kind.is_empty() { continue; } - let metadata = mapping - .get("metadata") - .and_then(|v| v.as_mapping()); + let metadata = mapping.get("metadata").and_then(|v| v.as_mapping()); let name = metadata .and_then(|m| m.get("name")) .and_then(|v| v.as_str()) @@ -118,10 +113,7 @@ fn split_yaml_documents(text: &str) -> Vec> { .enumerate() .filter_map(|(i, l)| { let trimmed = l.trim_end(); - if trimmed == "---" - || trimmed.starts_with("--- ") - || trimmed.starts_with("---\t") - { + if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") { Some(i) } else { None diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 1be8bd2..e34de55 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -23,14 +23,14 @@ mod code_js_ast_v1; mod code_kotlin_ast_v1; mod code_python_ast_v1; mod code_rust_ast_v1; +pub mod code_text_paragraph_v1; mod code_ts_ast_v1; +pub mod dockerfile_file_v1; +pub mod k8s_manifest_resource_v1; +pub mod manifest_file_v1; mod md_heading_v1; mod pdf_page_v1; mod tier2_shared; -pub mod k8s_manifest_resource_v1; -pub mod dockerfile_file_v1; -pub mod manifest_file_v1; -pub mod code_text_paragraph_v1; pub use code_c_ast_v1::CodeCAstV1Chunker; pub use code_cpp_ast_v1::CodeCppAstV1Chunker; @@ -40,10 +40,10 @@ pub use code_js_ast_v1::CodeJsAstV1Chunker; pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker; pub use code_python_ast_v1::CodePythonAstV1Chunker; pub use code_rust_ast_v1::CodeRustAstV1Chunker; +pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker; pub use code_ts_ast_v1::CodeTsAstV1Chunker; +pub use dockerfile_file_v1::DockerfileFileV1Chunker; +pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; +pub use manifest_file_v1::ManifestFileV1Chunker; pub use md_heading_v1::MdHeadingV1Chunker; pub use pdf_page_v1::PdfPageV1Chunker; -pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; -pub use dockerfile_file_v1::DockerfileFileV1Chunker; -pub 
use manifest_file_v1::ManifestFileV1Chunker; -pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker; diff --git a/crates/kebab-chunk/src/manifest_file_v1.rs b/crates/kebab-chunk/src/manifest_file_v1.rs index 9753fdc..259c39d 100644 --- a/crates/kebab-chunk/src/manifest_file_v1.rs +++ b/crates/kebab-chunk/src/manifest_file_v1.rs @@ -8,7 +8,7 @@ use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; use anyhow::Result; -use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion}; pub const VERSION_LABEL: &str = "manifest-file-v1"; diff --git a/crates/kebab-chunk/src/md_heading_v1.rs b/crates/kebab-chunk/src/md_heading_v1.rs index 7918167..1bac96c 100644 --- a/crates/kebab-chunk/src/md_heading_v1.rs +++ b/crates/kebab-chunk/src/md_heading_v1.rs @@ -1,8 +1,8 @@ //! `md-heading-v1` — heading-aware Markdown chunker. use kebab_core::{ - Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, - ChunkerVersion, DocumentId, SourceSpan, id_for_chunk, + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, }; /// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label @@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { let policy_hash = self.policy_hash(policy); let chunker_version = self.chunker_version(); let mut out: Vec = Vec::new(); @@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker { // `collect_overlap_seed` keeps seed ≤ target/2, so // a flush here never produces a chunk smaller than // the seed budget. 
- let would_exceed = acc.text_tokens + next_tokens - > policy.target_tokens + let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens && acc.has_non_heading_content(); if would_exceed { - let overlap_seed = collect_overlap_seed( - &acc, - policy.overlap_tokens, - policy.target_tokens, - ); - flush( - &mut acc, - doc, - &chunker_version, - &policy_hash, - &mut out, - ); + let overlap_seed = + collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens); + flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out); // Seed next accumulator with the prior chunk's // tail blocks (paragraph-level overlap). The // heading is *not* re-included here — it lives @@ -292,10 +278,11 @@ fn build_chunk( ) -> Chunk { debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block"); - let block_ids: Vec = - blocks.iter().map(|b| common(b).block_id.clone()).collect(); - let source_spans: Vec = - blocks.iter().map(|b| common(b).source_span.clone()).collect(); + let block_ids: Vec = blocks.iter().map(|b| common(b).block_id.clone()).collect(); + let source_spans: Vec = blocks + .iter() + .map(|b| common(b).source_span.clone()) + .collect(); // heading_path: pick the first non-Heading block's heading_path // (which already includes every parent heading per kb-normalize). 
@@ -339,12 +326,7 @@ fn build_chunk( text.len().div_ceil(BYTES_PER_TOKEN) }; - let chunk_id = id_for_chunk( - &doc.doc_id, - chunker_version, - &block_ids, - policy_hash, - ); + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash); Chunk { chunk_id, @@ -400,14 +382,8 @@ fn render_block_text(b: &Block) -> String { } else { i.alt.clone() }; - let ocr = i - .ocr - .as_ref() - .map_or("", |o| o.joined.as_str()); - let cap = i - .caption - .as_ref() - .map_or("", |c| c.text.as_str()); + let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str()); + let cap = i.caption.as_ref().map_or("", |c| c.text.as_str()); [alt.as_str(), ocr, cap] .iter() .filter(|s| !s.is_empty()) @@ -447,9 +423,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock { mod tests { use super::*; use kebab_core::{ - AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, - Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel, - WorkspacePath, id_for_block, + AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance, + SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block, }; use time::OffsetDateTime; @@ -492,12 +467,7 @@ mod tests { SourceSpan::Line { start, end } } - fn common_for( - kind: &str, - heading_path: &[String], - ordinal: u32, - s: SourceSpan, - ) -> CommonBlock { + fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock { CommonBlock { block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s), heading_path: heading_path.to_vec(), @@ -532,12 +502,7 @@ mod tests { }) } - fn paragraph( - text: &str, - heading_path: &[&str], - ordinal: u32, - line: u32, - ) -> Block { + fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::Paragraph(TextBlock { common: common_for("paragraph", &hp, ordinal, span(line, line)), @@ -546,12 +511,7 @@ mod tests { 
}) } - fn code_block( - code: &str, - heading_path: &[&str], - ordinal: u32, - s: SourceSpan, - ) -> Block { + fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::Code(CodeBlock { common: common_for("code", &hp, ordinal, s), @@ -578,12 +538,7 @@ mod tests { }) } - fn image_ref( - alt: &str, - heading_path: &[&str], - ordinal: u32, - line: u32, - ) -> Block { + fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block { let hp: Vec = heading_path.iter().map(|s| (*s).into()).collect(); Block::ImageRef(ImageRefBlock { common: common_for("imageref", &hp, ordinal, span(line, line)), diff --git a/crates/kebab-chunk/src/pdf_page_v1.rs b/crates/kebab-chunk/src/pdf_page_v1.rs index ef53af3..246e336 100644 --- a/crates/kebab-chunk/src/pdf_page_v1.rs +++ b/crates/kebab-chunk/src/pdf_page_v1.rs @@ -92,11 +92,7 @@ impl Chunker for PdfPageV1Chunker { hex[..POLICY_HASH_HEX_LEN].to_string() } - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result> { + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result> { // Validate up front — every block must be a Paragraph carrying // SourceSpan::Page. A mixed document signals a routing bug in // the caller (e.g. running this chunker on Markdown) and is @@ -109,18 +105,13 @@ impl Chunker for PdfPageV1Chunker { ), }; if !matches!(common.source_span, SourceSpan::Page { .. 
}) { - anyhow::bail!( - "PdfPageV1Chunker only handles PDF docs (got non-Page source_span)" - ); + anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"); } } let base_policy_hash = self.policy_hash(policy); let chunker_version = self.chunker_version(); - let target_bytes = policy - .target_tokens - .saturating_mul(BYTES_PER_TOKEN) - .max(1); + let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1); // Clamp the overlap to half the target. Without this, a policy // with `overlap_tokens >= target_tokens` would make every chunk // fully re-emit the previous chunk's text — mirrors @@ -157,10 +148,8 @@ impl Chunker for PdfPageV1Chunker { // typography); silent `as u32` truncation would only // surface on corrupted input, where an explicit panic // is preferable to an off-by-2^32 span. - let char_start_u32 = u32::try_from(char_start) - .expect("page chars fit in u32"); - let char_end_u32 = - u32::try_from(char_end).expect("page chars fit in u32"); + let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32"); + let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32"); let span = SourceSpan::Page { page: page_num, char_start: Some(char_start_u32), @@ -213,7 +202,11 @@ impl Chunker for PdfPageV1Chunker { /// - `chunk_end` = chunk's end char index (exclusive). /// /// Returns an empty vector when `text` is empty or whitespace-only. -fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, usize, String)> { +fn chunk_page( + text: &str, + target_bytes: usize, + overlap_bytes: usize, +) -> Vec<(usize, usize, usize, String)> { let chars: Vec = text.chars().collect(); let n = chars.len(); if n == 0 { @@ -233,8 +226,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi let c = chars[k]; let nx = chars[k + 1]; let is_paragraph_break = c == '\n' && nx == '\n'; - let is_sentence_end = - matches!(c, '.' | '?' 
| '!') && nx.is_whitespace(); + let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace(); if (is_paragraph_break || is_sentence_end) && k + 2 <= n { bounds.push(k + 2); } @@ -246,9 +238,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi bounds.dedup(); // UTF-8 byte length of the slice between two char indices. - let byte_len = |a: usize, b: usize| -> usize { - chars[a..b].iter().map(|c| c.len_utf8()).sum() - }; + let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() }; let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new(); let mut seg_idx: usize = 0; @@ -403,7 +393,11 @@ mod tests { assert_eq!(c.heading_path, Vec::::new()); assert_eq!(c.source_spans.len(), 1); match c.source_spans[0] { - SourceSpan::Page { page, char_start, char_end } => { + SourceSpan::Page { + page, + char_start, + char_end, + } => { assert_eq!(page, (i as u32) + 1); assert_eq!(char_start, Some(0)); assert!(char_end.unwrap() > 0); @@ -448,11 +442,16 @@ mod tests { // N-1's char_end). for w in chunks.windows(2) { let prev_end = match w[0].source_spans[0] { - SourceSpan::Page { char_end: Some(e), .. } => e, + SourceSpan::Page { + char_end: Some(e), .. + } => e, _ => panic!("missing char_end"), }; let next_start = match w[1].source_spans[0] { - SourceSpan::Page { char_start: Some(s), .. } => s, + SourceSpan::Page { + char_start: Some(s), + .. + } => s, _ => panic!("missing char_start"), }; assert!( @@ -666,11 +665,17 @@ mod tests { // overlap) is the failure mode. for w in chunks.windows(2) { let prev_start = match w[0].source_spans[0] { - SourceSpan::Page { char_start: Some(s), .. } => s, + SourceSpan::Page { + char_start: Some(s), + .. + } => s, _ => panic!("missing char_start"), }; let next_start = match w[1].source_spans[0] { - SourceSpan::Page { char_start: Some(s), .. } => s, + SourceSpan::Page { + char_start: Some(s), + .. 
+ } => s, _ => panic!("missing char_start"), }; assert!( @@ -703,7 +708,7 @@ mod tests { let page_text = format!("{early_seg}. {tail}"); let doc = make_pdf_doc(&[&page_text]); - let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte + let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap(); assert!( diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs index 7d2bf5c..e3dfd14 100644 --- a/crates/kebab-chunk/src/tier2_shared.rs +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -113,7 +113,14 @@ pub(crate) fn build_chunk( symbol: Some(symbol.to_string()), lang: Some(lang.to_string()), }; - build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key) + build_chunk_from_span( + doc, + chunker_version, + base_policy_hash, + text, + span, + split_key, + ) } /// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3). 
diff --git a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs index 62162b0..e403ac9 100644 --- a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeCAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs index 8a2a2cf..d7ce320 100644 --- a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs @@ -15,9 +15,9 @@ use std::path::PathBuf; use kebab_chunk::CodeCppAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use kebab_parse_code::CppAstExtractor; use serde_json::Value; @@ -171,7 +171,9 @@ fn extract_cpp_fixture() -> CanonicalDocument { workspace_root: &root, config: &cfg, }; - CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap() + CppAstExtractor::new() + .extract(&ctx, src.as_bytes()) + .unwrap() } // 
--------------------------------------------------------------------------- @@ -261,43 +263,61 @@ fn code_cpp_ast_extractor_snapshot() { let doc = extract_cpp_fixture(); // Verify the extractor emits all expected named units. - let block_syms: Vec> = doc.blocks.iter().filter_map(|b| match b { - Block::Code(c) => match &c.common.source_span { - SourceSpan::Code { symbol, .. } => Some(symbol.clone()), + let block_syms: Vec> = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. } => Some(symbol.clone()), + _ => None, + }, _ => None, - }, - _ => None, - }).collect(); + }) + .collect(); // Must include namespace-qualified class and its methods assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")), "class unit missing: {block_syms:?}" ); assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")), "ctor unit missing: {block_syms:?}" ); assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")), "dtor unit missing: {block_syms:?}" ); assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")), "chunk_doc unit missing: {block_syms:?}" ); assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")), "operator() unit missing: 
{block_syms:?}" ); // Template function (inside kebab::chunk namespace in the fixture) assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::identity")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::chunk::identity")), "identity template fn unit missing: {block_syms:?}" ); // Free function in outer namespace assert!( - block_syms.iter().any(|s| s.as_deref() == Some("kebab::global_helper")), + block_syms + .iter() + .any(|s| s.as_deref() == Some("kebab::global_helper")), "global_helper unit missing: {block_syms:?}" ); // Global main @@ -312,14 +332,23 @@ fn code_cpp_ast_extractor_snapshot() { fn code_cpp_ast_extractor_chunks_deterministic() { let doc1 = extract_cpp_fixture(); let doc2 = extract_cpp_fixture(); - assert_eq!(doc1.blocks, doc2.blocks, "extractor output non-deterministic"); + assert_eq!( + doc1.blocks, doc2.blocks, + "extractor output non-deterministic" + ); let policy = fixed_policy(); let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap(); let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap(); assert_eq!( - chunks1.iter().map(|c| c.chunk_id.0.clone()).collect::>(), - chunks2.iter().map(|c| c.chunk_id.0.clone()).collect::>(), + chunks1 + .iter() + .map(|c| c.chunk_id.0.clone()) + .collect::>(), + chunks2 + .iter() + .map(|c| c.chunk_id.0.clone()) + .collect::>(), "chunker output non-deterministic" ); } diff --git a/crates/kebab-chunk/tests/code_go_ast_snapshot.rs b/crates/kebab-chunk/tests/code_go_ast_snapshot.rs index 2befe38..a6be7b8 100644 --- a/crates/kebab-chunk/tests/code_go_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_go_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeGoAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, 
CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_java_ast_snapshot.rs b/crates/kebab-chunk/tests/code_java_ast_snapshot.rs index 75473d6..42a1ea9 100644 --- a/crates/kebab-chunk/tests/code_java_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_java_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeJavaAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs index 9cb818d..6171827 100644 --- a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeJsAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git 
a/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs b/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs index a1eafa6..ede55c1 100644 --- a/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeKotlinAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs index 2a164b1..c4d7642 100644 --- a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodePythonAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs b/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs index 9ef4455..af85e66 100644 --- a/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_rust_ast_snapshot.rs @@ -13,9 
+13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeRustAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs index bca0301..0eedcea 100644 --- a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs +++ b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs @@ -13,9 +13,9 @@ use std::path::PathBuf; use kebab_chunk::CodeTsAstV1Chunker; use kebab_core::{ - AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, - Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, - id_for_block, id_for_doc, + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, }; use serde_json::Value; use time::OffsetDateTime; diff --git a/crates/kebab-chunk/tests/dockerfile_file_v1.rs b/crates/kebab-chunk/tests/dockerfile_file_v1.rs index 44dd94a..f0a1a5d 100644 --- a/crates/kebab-chunk/tests/dockerfile_file_v1.rs +++ b/crates/kebab-chunk/tests/dockerfile_file_v1.rs @@ -124,7 +124,11 @@ fn dockerfile_emits_single_chunk() { Some(""), "symbol must be ''" ); - assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'"); + assert_eq!( + lang.as_deref(), + Some("dockerfile"), + "lang must be 'dockerfile'" + ); } other => panic!("expected SourceSpan::Code, got 
{other:?}"), } diff --git a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs index 51f50e3..4682821 100644 --- a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs +++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs @@ -110,13 +110,11 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() { let symbols: Vec<&str> = chunks .iter() - .map(|c| { - match &c.source_spans[0] { - SourceSpan::Code { symbol, .. } => { - symbol.as_deref().expect("symbol must be Some for k8s chunks") - } - other => panic!("expected Code span, got {other:?}"), - } + .map(|c| match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => symbol + .as_deref() + .expect("symbol must be Some for k8s chunks"), + other => panic!("expected Code span, got {other:?}"), }) .collect(); @@ -270,7 +268,11 @@ fn k8s_oversize_splits_into_line_windows_sharing_symbol() { let ranges: Vec<(u32, u32)> = chunks .iter() .map(|c| match &c.source_spans[0] { - SourceSpan::Code { line_start, line_end, .. } => (*line_start, *line_end), + SourceSpan::Code { + line_start, + line_end, + .. 
+ } => (*line_start, *line_end), other => panic!("expected Code span, got {other:?}"), }) .collect(); diff --git a/crates/kebab-chunk/tests/long_section_snapshot.rs b/crates/kebab-chunk/tests/long_section_snapshot.rs index e335a68..ceb48b9 100644 --- a/crates/kebab-chunk/tests/long_section_snapshot.rs +++ b/crates/kebab-chunk/tests/long_section_snapshot.rs @@ -15,7 +15,7 @@ use std::path::PathBuf; use kebab_chunk::MdHeadingV1Chunker; use kebab_core::{ - AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType, + AssetId, AssetStorage, Checksum, ChunkPolicy, Chunker, ChunkerVersion, MediaType, ParserVersion, RawAsset, SourceUri, WorkspacePath, }; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; @@ -65,8 +65,7 @@ fn long_section_chunks_snapshot() { Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1, None => 1, }; - let (blocks, parse_warns) = - parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); + let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); // Pin parser_version so doc_id / block_ids are reproducible. let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into()); @@ -74,9 +73,8 @@ fn long_section_chunks_snapshot() { metadata.aliases.sort(); metadata.tags.sort(); - let doc = - build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns) - .expect("build_canonical_document"); + let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns) + .expect("build_canonical_document"); // Pin policy so policy_hash and chunk_ids are reproducible. 
let policy = ChunkPolicy { @@ -102,8 +100,7 @@ fn long_section_chunks_snapshot() { baseline_path.display() ), }; - let expected: Value = - serde_json::from_str(&baseline_text).expect("baseline parses as json"); + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); if actual != expected { if std::env::var("UPDATE_SNAPSHOTS").is_ok() { @@ -154,14 +151,8 @@ fn long_section_chunks_are_deterministic() { let mut metadata = metadata; metadata.aliases.sort(); metadata.tags.sort(); - let doc = build_canonical_document( - &asset, - metadata, - blocks, - &parser_version, - parse_warns, - ) - .expect("build_canonical_document"); + let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns) + .expect("build_canonical_document"); let ids: Vec = MdHeadingV1Chunker .chunk(&doc, &policy) .unwrap() diff --git a/crates/kebab-chunk/tests/manifest_file_v1.rs b/crates/kebab-chunk/tests/manifest_file_v1.rs index 4df5e17..c2bf1cc 100644 --- a/crates/kebab-chunk/tests/manifest_file_v1.rs +++ b/crates/kebab-chunk/tests/manifest_file_v1.rs @@ -107,9 +107,7 @@ fn cargo_toml_single_chunk_with_toml_lang() { .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); let doc = manifest_doc("toml", &text); - let chunks = ManifestFileV1Chunker - .chunk(&doc, &policy()) - .expect("chunk"); + let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk"); assert_eq!( chunks.len(), @@ -149,9 +147,7 @@ fn package_json_single_chunk_with_json_lang() { .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); let doc = manifest_doc("json", &text); - let chunks = ManifestFileV1Chunker - .chunk(&doc, &policy()) - .expect("chunk"); + let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk"); assert_eq!( chunks.len(), @@ -191,9 +187,7 @@ fn pom_xml_single_chunk_with_xml_lang() { .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); 
let doc = manifest_doc("xml", &text); - let chunks = ManifestFileV1Chunker - .chunk(&doc, &policy()) - .expect("chunk"); + let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk"); assert_eq!( chunks.len(), @@ -233,9 +227,7 @@ fn go_mod_single_chunk_with_go_mod_lang() { .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); let doc = manifest_doc("go-mod", &text); - let chunks = ManifestFileV1Chunker - .chunk(&doc, &policy()) - .expect("chunk"); + let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk"); assert_eq!( chunks.len(), diff --git a/crates/kebab-cli/src/main.rs b/crates/kebab-cli/src/main.rs index b9e3b72..539a277 100644 --- a/crates/kebab-cli/src/main.rs +++ b/crates/kebab-cli/src/main.rs @@ -179,7 +179,12 @@ enum Cmd { /// canonical). Repeatable or comma-separated. /// Examples: `rust`, `python`, `typescript`. /// Unknown values produce empty hits. - #[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')] + #[arg( + long = "code-lang", + value_name = "LANG", + num_args = 1, + value_delimiter = ',' + )] code_lang: Vec, /// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate @@ -464,7 +469,9 @@ fn parse_bool_env(s: &str) -> Result { match s.to_ascii_lowercase().as_str() { "1" | "true" | "yes" | "on" => Ok(true), "0" | "false" | "no" | "off" => Ok(false), - other => Err(format!("expected 1/0/true/false/yes/no/on/off, got {other:?}")), + other => Err(format!( + "expected 1/0/true/false/yes/no/on/off, got {other:?}" + )), } } @@ -551,8 +558,14 @@ fn run(cli: &Cli) -> anyhow::Result<()> { "created {}", kebab_config::Config::xdg_config_path().display() ); - println!("created {}", kebab_config::Config::xdg_data_dir().display()); - println!("created {}", kebab_config::Config::xdg_state_dir().display()); + println!( + "created {}", + kebab_config::Config::xdg_data_dir().display() + ); + println!( + "created {}", + kebab_config::Config::xdg_state_dir().display() + ); 
println!("hint edit the config above, then `kebab ingest`"); } Ok(()) @@ -565,7 +578,9 @@ fn run(cli: &Cli) -> anyhow::Result<()> { } => { let cfg = kebab_config::Config::load(cli.config.as_deref())?; let scope = kebab_core::SourceScope { - root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)), + root: root + .clone() + .unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)), exclude: cfg.workspace.exclude.clone(), ..Default::default() }; @@ -580,9 +595,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { .unwrap_or(false); let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env); let (tx, rx) = std::sync::mpsc::channel::(); - let display_handle = std::thread::spawn(move || { - progress::ProgressDisplay::new(mode).run(rx) - }); + let display_handle = + std::thread::spawn(move || progress::ProgressDisplay::new(mode).run(rx)); // p9-fb-04: register a Ctrl-C handler that flips the same // AtomicBool the facade polls at each step boundary. The @@ -614,7 +628,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { if cli.json { println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?); } else { - let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension); + let skipped_breakdown = + kebab_app::render_skipped_breakdown(&report.skipped_by_extension); let purged_suffix = if report.purged_deleted_files > 0 { format!(" purged {}", report.purged_deleted_files) } else { @@ -640,7 +655,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> { let cfg = kebab_config::Config::load(cli.config.as_deref())?; let docs = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())?; if cli.json { - println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?); + println!( + "{}", + serde_json::to_string(&wire::wire_doc_summaries(&docs))? 
+ ); } else { for d in &docs { println!("{}\t{}", d.doc_id, d.doc_path.0); @@ -667,7 +685,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> { let cfg = kebab_config::Config::load(cli.config.as_deref())?; let chunk_id: kebab_core::ChunkId = id.parse()?; let chunk = kebab_app::inspect_chunk_with_config(cfg, &chunk_id)?; - println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?); + println!( + "{}", + serde_json::to_string(&wire::wire_chunk_inspection(&chunk))? + ); Ok(()) } }, @@ -708,7 +729,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> { }; let result = kebab_app::fetch_with_config(cfg, query, opts)?; if cli.json { - println!("{}", serde_json::to_string(&wire::wire_fetch_result(&result))?); + println!( + "{}", + serde_json::to_string(&wire::wire_fetch_result(&result))? + ); } else { render_fetch_plain(&result); } @@ -752,30 +776,21 @@ fn run(cli: &Cli) -> anyhow::Result<()> { if line.trim().is_empty() { continue; } - let v: serde_json::Value = - serde_json::from_str(&line).map_err(|e| { - anyhow::Error::new(kebab_app::StructuredError( - kebab_app::ErrorV1 { - schema_version: kebab_app::ERROR_V1_ID - .to_string(), - code: "config_invalid".to_string(), - message: format!( - "stdin ndjson line {} parse error: {e}", - lineno + 1 - ), - details: serde_json::Value::Null, - hint: Some( - "each line must be a JSON object with at least `query`" - .to_string(), - ), - }, - )) - })?; + let v: serde_json::Value = serde_json::from_str(&line).map_err(|e| { + anyhow::Error::new(kebab_app::StructuredError(kebab_app::ErrorV1 { + schema_version: kebab_app::ERROR_V1_ID.to_string(), + code: "config_invalid".to_string(), + message: format!("stdin ndjson line {} parse error: {e}", lineno + 1), + details: serde_json::Value::Null, + hint: Some( + "each line must be a JSON object with at least `query`".to_string(), + ), + })) + })?; raw_items.push(v); } - let (items, summary) = - kebab_app::bulk_search_with_config(cfg, raw_items)?; + let (items, summary) = 
kebab_app::bulk_search_with_config(cfg, raw_items)?; if cli.json { let mut stdout = std::io::stdout().lock(); @@ -799,11 +814,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> { if let Some(err) = &item.error { writeln!(stdout, "error: {err}")?; } else if let Some(resp) = &item.response { - writeln!( - stdout, - "{}", - serde_json::to_string_pretty(resp)? - )?; + writeln!(stdout, "{}", serde_json::to_string_pretty(resp)?)?; } writeln!(stdout)?; } @@ -843,8 +854,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> { other => other.to_string(), } } - let media_norm: Vec = - media.iter().map(|s| normalize_media_alias(s)).collect(); + let media_norm: Vec = media.iter().map(|s| normalize_media_alias(s)).collect(); // p9-fb-36: parse --ingested-after as RFC3339; structured error on failure. let ingested_after_parsed: Option = @@ -856,8 +866,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { ) { Ok(ts) => Some(ts), Err(e) => { - return Err(anyhow::Error::new( - kebab_app::StructuredError(kebab_app::ErrorV1 { + return Err(anyhow::Error::new(kebab_app::StructuredError( + kebab_app::ErrorV1 { schema_version: kebab_app::ERROR_V1_ID.to_string(), code: "config_invalid".to_string(), message: format!( @@ -867,8 +877,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> { hint: Some( "expected format like 2026-04-01T00:00:00Z".to_string(), ), - }), - )); + }, + ))); } } } @@ -943,11 +953,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> { }; println!( "{:>2}. 
{:.4} {}{}{}", - h.rank, - h.retrieval.fusion_score, - stale_tag, - h.doc_path.0, - heading, + h.rank, h.retrieval.fusion_score, stale_tag, h.doc_path.0, heading, ); } // p9-fb-34: truncation hint goes to stderr so it @@ -969,15 +975,33 @@ fn run(cli: &Cli) -> anyhow::Result<()> { if let Some(t) = &resp.trace { eprintln!(); eprintln!("Trace:"); - eprintln!(" lexical ({} hits, {}ms):", t.lexical.len(), t.timing.lexical_ms); + eprintln!( + " lexical ({} hits, {}ms):", + t.lexical.len(), + t.timing.lexical_ms + ); for c in t.lexical.iter().take(3) { - eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0); + eprintln!( + " rank={} score={:.4} chunk={}", + c.rank, c.score, c.chunk_id.0 + ); } - eprintln!(" vector ({} hits, {}ms):", t.vector.len(), t.timing.vector_ms); + eprintln!( + " vector ({} hits, {}ms):", + t.vector.len(), + t.timing.vector_ms + ); for c in t.vector.iter().take(3) { - eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0); + eprintln!( + " rank={} score={:.4} chunk={}", + c.rank, c.score, c.chunk_id.0 + ); } - eprintln!(" fusion ({} inputs, {}ms)", t.rrf_inputs.len(), t.timing.fusion_ms); + eprintln!( + " fusion ({} inputs, {}ms)", + t.rrf_inputs.len(), + t.timing.fusion_ms + ); eprintln!(" total: {}ms", t.timing.total_ms); } } @@ -1039,16 +1063,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> { let cfg2 = cfg.clone(); let q = query.clone(); let session2 = session.clone(); - let handle = std::thread::spawn( - move || -> anyhow::Result { - match session2.as_deref() { - Some(sid) => kebab_app::ask_with_session_with_config( - cfg2, sid, &q, opts, - ), - None => kebab_app::ask_with_config(cfg2, &q, opts), - } - }, - ); + let handle = std::thread::spawn(move || -> anyhow::Result { + match session2.as_deref() { + Some(sid) => kebab_app::ask_with_session_with_config(cfg2, sid, &q, opts), + None => kebab_app::ask_with_config(cfg2, &q, opts), + } + }); // Drain receiver, write ndjson to stderr until // completion or 
BrokenPipe. @@ -1324,9 +1344,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> { println!("{}", serde_json::to_string_pretty(&agg)?); } else { println!("run_id: {run_id}"); - println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries); - println!("hit@1: {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0)); - println!("hit@5: {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0)); + println!( + "queries: {} ({} failed)", + agg.total_queries, agg.failed_queries + ); + println!( + "hit@1: {:.4}", + agg.hit_at_k.get(&1).copied().unwrap_or(0.0) + ); + println!( + "hit@5: {:.4}", + agg.hit_at_k.get(&5).copied().unwrap_or(0.0) + ); println!("MRR: {:.4}", agg.mrr); } Ok(()) @@ -1376,8 +1405,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> { } else { println!( "ingest-file: scanned={} new={} updated={} unchanged={} skipped={} errors={}", - report.scanned, report.new, report.updated, - report.unchanged, report.skipped, report.errors + report.scanned, + report.new, + report.updated, + report.unchanged, + report.skipped, + report.errors ); } Ok(()) @@ -1390,20 +1423,20 @@ fn run(cli: &Cli) -> anyhow::Result<()> { .read_to_string(&mut body) .context("kebab ingest-stdin: read stdin")?; let cfg = kebab_config::Config::load(cli.config.as_deref())?; - let report = kebab_app::ingest_stdin_with_config( - cfg, - &body, - title, - source_uri.as_deref(), - )?; + let report = + kebab_app::ingest_stdin_with_config(cfg, &body, title, source_uri.as_deref())?; if cli.json { let v = wire::wire_ingest(&report); println!("{}", serde_json::to_string(&v)?); } else { println!( "ingest-stdin: scanned={} new={} updated={} unchanged={} skipped={} errors={}", - report.scanned, report.new, report.updated, - report.unchanged, report.skipped, report.errors + report.scanned, + report.new, + report.updated, + report.unchanged, + report.skipped, + report.errors ); } Ok(()) @@ -1432,10 +1465,7 @@ fn render_ask_plain_citations( writeln!(w)?; writeln!(w, "근거:")?; for (idx, c) in 
ans.citations.iter().enumerate() { - let marker = c - .marker - .clone() - .unwrap_or_else(|| format!("{}", idx + 1)); + let marker = c.marker.clone().unwrap_or_else(|| format!("{}", idx + 1)); // p9-fb-32: `[stale]` prefix on the URI for citations whose // `stale: true`. Yellow on TTY, plain otherwise — mirrors the // search-plain renderer in `Cmd::Search`. @@ -1496,7 +1526,10 @@ fn print_schema_text(s: &kebab_app::SchemaV1) { println!(" parser_version {}", s.models.parser_version); println!(" chunker_version {}", s.models.chunker_version); println!(" embedding_version {}", s.models.embedding_version); - println!(" prompt_template_version {}", s.models.prompt_template_version); + println!( + " prompt_template_version {}", + s.models.prompt_template_version + ); println!(" index_version {}", s.models.index_version); println!(" corpus_revision {}", s.models.corpus_revision); println!(); @@ -1545,9 +1578,7 @@ fn confirm_destructive( /// Confirm prompt for `--orphans-only`: shows the orphan count + a /// sample of up to 5 paths so the user knows what will be purged before /// committing. No filesystem paths are removed — only store records. 
-fn confirm_orphans_only( - orphan_paths: &[kebab_core::WorkspacePath], -) -> anyhow::Result { +fn confirm_orphans_only(orphan_paths: &[kebab_core::WorkspacePath]) -> anyhow::Result { use std::io::Write; let n = orphan_paths.len(); let mut out = std::io::stderr().lock(); @@ -1560,11 +1591,7 @@ fn confirm_orphans_only( return Ok(true); } - let sample: Vec<&str> = orphan_paths - .iter() - .take(5) - .map(|p| p.0.as_str()) - .collect(); + let sample: Vec<&str> = orphan_paths.iter().take(5).map(|p| p.0.as_str()).collect(); let sample_str = sample.join(", "); let ellipsis = if n > 5 { ", …" } else { "" }; @@ -1593,19 +1620,28 @@ fn render_fetch_plain(r: &kebab_core::FetchResult) { if !r.context_before.is_empty() { println!("\n=== before ==="); for c in &r.context_before { - let heading = c.heading_path.last().map_or("", std::string::String::as_str); + let heading = c + .heading_path + .last() + .map_or("", std::string::String::as_str); println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text); } } if let Some(c) = &r.chunk { println!("\n=== target ==="); - let heading = c.heading_path.last().map_or("", std::string::String::as_str); + let heading = c + .heading_path + .last() + .map_or("", std::string::String::as_str); println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text); } if !r.context_after.is_empty() { println!("\n=== after ==="); for c in &r.context_after { - let heading = c.heading_path.last().map_or("", std::string::String::as_str); + let heading = c + .heading_path + .last() + .map_or("", std::string::String::as_str); println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text); } } @@ -1637,8 +1673,8 @@ mod tests { //! against a synthetic `Answer` instead. 
use super::*; use kebab_core::{ - Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, - PromptTemplateVersion, SearchMode, TokenUsage, TraceId, WorkspacePath, + Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, PromptTemplateVersion, + SearchMode, TokenUsage, TraceId, WorkspacePath, }; use time::OffsetDateTime; @@ -1734,4 +1770,3 @@ mod tests { ); } } - diff --git a/crates/kebab-cli/src/progress.rs b/crates/kebab-cli/src/progress.rs index fc64b30..4e96ceb 100644 --- a/crates/kebab-cli/src/progress.rs +++ b/crates/kebab-cli/src/progress.rs @@ -124,11 +124,9 @@ impl ProgressDisplay { bar.set_length(u64::from(*total)); bar.set_position(0); bar.set_style( - ProgressStyle::with_template( - "ingest [{bar:30}] {pos}/{len} {wide_msg}", - ) - .unwrap() - .progress_chars("=> "), + ProgressStyle::with_template("ingest [{bar:30}] {pos}/{len} {wide_msg}") + .unwrap() + .progress_chars("=> "), ); bar.set_message(""); } @@ -170,11 +168,7 @@ impl ProgressDisplay { let _ = writeln!( err, "ingest: complete (scanned={} new={} updated={} skipped={} errors={})", - counts.scanned, - counts.new, - counts.updated, - counts.skipped, - counts.errors, + counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors, ); } } @@ -193,11 +187,7 @@ impl ProgressDisplay { let _ = writeln!( err, "ingest: aborted (scanned={} new={} updated={} skipped={} errors={})", - counts.scanned, - counts.new, - counts.updated, - counts.skipped, - counts.errors, + counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors, ); } } @@ -210,13 +200,26 @@ impl ProgressDisplay { let _ = writeln!(err, " 📷 OCR page {page}..."); } } - IngestEvent::PdfOcrFinished { page, ms, chars, ocr_engine, skipped, .. } => { + IngestEvent::PdfOcrFinished { + page, + ms, + chars, + ocr_engine, + skipped, + .. 
+ } => { if !quiet { let mut err = std::io::stderr().lock(); if *skipped { - let _ = writeln!(err, " ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)"); + let _ = writeln!( + err, + " ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)" + ); } else { - let _ = writeln!(err, " ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})"); + let _ = writeln!( + err, + " ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})" + ); } } } @@ -250,7 +253,10 @@ mod tests { #[test] fn from_flags_json_takes_priority_over_tty() { - assert_eq!(ProgressMode::from_flags(true, false, false), ProgressMode::Json); + assert_eq!( + ProgressMode::from_flags(true, false, false), + ProgressMode::Json + ); } #[test] diff --git a/crates/kebab-cli/src/wire.rs b/crates/kebab-cli/src/wire.rs index eeb1aed..be00584 100644 --- a/crates/kebab-cli/src/wire.rs +++ b/crates/kebab-cli/src/wire.rs @@ -114,10 +114,7 @@ pub fn wire_answer(a: &Answer) -> Value { /// The timestamp is added at emit time (caller fills `ts`), since the /// pipeline doesn't carry one in the in-process enum — mirrors the /// `wire_ingest_progress` pattern (§2 ingest_progress.v1). -pub fn wire_answer_event( - ev: &kebab_app::StreamEvent, - ts: time::OffsetDateTime, -) -> Value { +pub fn wire_answer_event(ev: &kebab_app::StreamEvent, ts: time::OffsetDateTime) -> Value { let mut v = serde_json::to_value(ev).expect("StreamEvent serializes"); let ts_str = ts .format(&time::format_description::well_known::Rfc3339) @@ -161,9 +158,7 @@ pub fn wire_reset(r: &kebab_app::ResetReport) -> Value { /// wall-clock — the emit site is the only place that knows the moment /// of emission, so the timestamp is stamped here rather than carried /// on the event itself. 
-pub fn wire_ingest_progress( - event: &kebab_app::IngestEvent, -) -> anyhow::Result { +pub fn wire_ingest_progress(event: &kebab_app::IngestEvent) -> anyhow::Result { let mut v = serde_json::to_value(event)?; if let Value::Object(ref mut map) = v { map.insert( @@ -305,15 +300,15 @@ mod tests { let v = wire_search_response(&r); assert_eq!(schema_of(&v), Some("search_response.v1")); assert!(v.get("hits").and_then(|h| h.as_array()).is_some()); - assert_eq!( - v.get("hits").and_then(|h| h.as_array()).unwrap().len(), - 0 - ); + assert_eq!(v.get("hits").and_then(|h| h.as_array()).unwrap().len(), 0); assert_eq!( v.get("next_cursor").and_then(|c| c.as_str()), Some("opaque-cursor-abc") ); - assert_eq!(v.get("truncated").and_then(serde_json::Value::as_bool), Some(true)); + assert_eq!( + v.get("truncated").and_then(serde_json::Value::as_bool), + Some(true) + ); } #[test] @@ -322,12 +317,21 @@ mod tests { let schema = SchemaV1 { schema_version: "schema.v1".to_string(), kebab_version: "0.2.1".to_string(), - wire: WireBlock { schemas: vec!["answer.v1".to_string()] }, + wire: WireBlock { + schemas: vec!["answer.v1".to_string()], + }, capabilities: Capabilities { - json_mode: true, ingest_progress: true, ingest_cancellation: true, - rag_multi_turn: true, search_cache: true, incremental_ingest: true, - streaming_ask: false, http_daemon: false, mcp_server: false, - single_file_ingest: false, bulk_search: true, + json_mode: true, + ingest_progress: true, + ingest_cancellation: true, + rag_multi_turn: true, + search_cache: true, + incremental_ingest: true, + streaming_ask: false, + http_daemon: false, + mcp_server: false, + single_file_ingest: false, + bulk_search: true, }, models: Models { parser_version: "x".to_string(), @@ -340,7 +344,9 @@ mod tests { corpus_revision: 7, }, stats: Stats { - doc_count: 1, chunk_count: 2, asset_count: 1, + doc_count: 1, + chunk_count: 2, + asset_count: 1, last_ingest_at: None, media_breakdown: Default::default(), lang_breakdown: Default::default(), 
@@ -352,7 +358,10 @@ mod tests { }; let v = wire_schema(&schema); assert_eq!(schema_of(&v), Some("schema.v1")); - assert_eq!(v.get("kebab_version").and_then(Value::as_str), Some("0.2.1")); + assert_eq!( + v.get("kebab_version").and_then(Value::as_str), + Some("0.2.1") + ); } #[test] @@ -367,7 +376,10 @@ mod tests { }; let v = wire_error_v1(&err); assert_eq!(schema_of(&v), Some("error.v1")); - assert_eq!(v.get("code").and_then(Value::as_str), Some("config_invalid")); + assert_eq!( + v.get("code").and_then(Value::as_str), + Some("config_invalid") + ); } #[test] @@ -393,8 +405,10 @@ mod tests { #[test] fn search_response_with_trace_serializes_trace_field() { - use kebab_core::{SearchTrace, TraceCandidate, TraceFusionInput, - TraceTiming, ChunkId, DocumentId, WorkspacePath}; + use kebab_core::{ + ChunkId, DocumentId, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming, + WorkspacePath, + }; let r = kebab_app::SearchResponse { hits: vec![], next_cursor: None, @@ -414,7 +428,12 @@ mod tests { vector_rank: None, fusion_score: 0.0, }], - timing: TraceTiming { lexical_ms: 5, vector_ms: 0, fusion_ms: 1, total_ms: 7 }, + timing: TraceTiming { + lexical_ms: 5, + vector_ms: 0, + fusion_ms: 1, + total_ms: 7, + }, }), hint: None, }; diff --git a/crates/kebab-cli/tests/cli_config_not_found.rs b/crates/kebab-cli/tests/cli_config_not_found.rs index a782621..0478fef 100644 --- a/crates/kebab-cli/tests/cli_config_not_found.rs +++ b/crates/kebab-cli/tests/cli_config_not_found.rs @@ -2,15 +2,18 @@ //! must fail with exit≠0 and error.v1 code=config_not_found (not silently fall //! back to XDG defaults). 
-use std::process::Command; use serde_json::Value; +use std::process::Command; fn kebab_bin() -> String { env!("CARGO_BIN_EXE_kebab").to_string() } fn parse_error_v1(stderr: &str) -> Value { - let last = stderr.lines().last().expect("expected error.v1 ndjson on stderr"); + let last = stderr + .lines() + .last() + .expect("expected error.v1 ndjson on stderr"); serde_json::from_str(last) .unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}")) } @@ -25,7 +28,11 @@ fn invalid_config_path_emits_error_v1_with_nonzero_exit() { .output() .expect("spawn kebab"); - assert_ne!(out.status.code(), Some(0), "exit must be nonzero on missing --config"); + assert_ne!( + out.status.code(), + Some(0), + "exit must be nonzero on missing --config" + ); let stderr = String::from_utf8_lossy(&out.stderr); let v = parse_error_v1(&stderr); assert_eq!(v["schema_version"], "error.v1"); @@ -38,7 +45,13 @@ fn invalid_relative_config_path_emits_config_not_found() { // Bug #10 spec §6 R-1: relative path も cwd-relative で cover. let tmp = tempfile::tempdir().unwrap(); let out = Command::new(kebab_bin()) - .args(["search", "rust", "--config", "nonexistent-rel.toml", "--json"]) + .args([ + "search", + "rust", + "--config", + "nonexistent-rel.toml", + "--json", + ]) .current_dir(tmp.path()) .output() .expect("spawn kebab"); diff --git a/crates/kebab-cli/tests/cli_empty_query.rs b/crates/kebab-cli/tests/cli_empty_query.rs index fdbe98c..25eebc4 100644 --- a/crates/kebab-cli/tests/cli_empty_query.rs +++ b/crates/kebab-cli/tests/cli_empty_query.rs @@ -1,15 +1,18 @@ //! Integration tests for Bug #14: empty or whitespace-only query must emit //! error.v1 code=invalid_input and exit nonzero (not silent 0-hit return). 
-use std::process::Command; use serde_json::Value; +use std::process::Command; fn kebab_bin() -> String { env!("CARGO_BIN_EXE_kebab").to_string() } fn parse_error_v1(stderr: &str) -> Value { - let last = stderr.lines().last().expect("expected error.v1 ndjson on stderr"); + let last = stderr + .lines() + .last() + .expect("expected error.v1 ndjson on stderr"); serde_json::from_str(last) .unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}")) } diff --git a/crates/kebab-cli/tests/cli_error_wire.rs b/crates/kebab-cli/tests/cli_error_wire.rs index 825ce9c..0aafd9e 100644 --- a/crates/kebab-cli/tests/cli_error_wire.rs +++ b/crates/kebab-cli/tests/cli_error_wire.rs @@ -36,12 +36,7 @@ fn json_mode_emits_error_v1_on_config_invalid() { std::fs::write(&bad_config, b"this is not { valid toml !!!").unwrap(); let mut cmd = Command::new(kebab_bin()); - cmd.args([ - "--json", - "--config", - bad_config.to_str().unwrap(), - "ingest", - ]); + cmd.args(["--json", "--config", bad_config.to_str().unwrap(), "ingest"]); for (k, v) in xdg_envs(tmp.path()) { cmd.env(k, v); } @@ -55,7 +50,10 @@ fn json_mode_emits_error_v1_on_config_invalid() { assert_eq!(exit_code, 2, "expected exit code 2, got {exit_code}"); let stderr = String::from_utf8(out.stderr).unwrap(); - let first_line = stderr.lines().next().expect("stderr must have at least one line"); + let first_line = stderr + .lines() + .next() + .expect("stderr must have at least one line"); let v: serde_json::Value = serde_json::from_str(first_line).expect("stderr first line must be valid JSON"); diff --git a/crates/kebab-cli/tests/cli_ingest_file.rs b/crates/kebab-cli/tests/cli_ingest_file.rs index afd1a1e..0de55f2 100644 --- a/crates/kebab-cli/tests/cli_ingest_file.rs +++ b/crates/kebab-cli/tests/cli_ingest_file.rs @@ -72,21 +72,34 @@ max_context_tokens = 8000 workspace = workspace.display(), data = data.display(), ), - ).unwrap(); + ) + .unwrap(); let src = dir.path().join("doc.md"); fs::write(&src, "# 
A\n\nbody.").unwrap(); let bin = env!("CARGO_BIN_EXE_kebab"); let out = Command::new(bin) - .args(["--json", "--config", cfg_path.to_str().unwrap(), "ingest-file"]) + .args([ + "--json", + "--config", + cfg_path.to_str().unwrap(), + "ingest-file", + ]) .arg(&src) .output() .unwrap(); - assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr)); + assert!( + out.status.success(), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); let stdout = String::from_utf8_lossy(&out.stdout); let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap(); - assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1")); + assert_eq!( + v.get("schema_version").and_then(|s| s.as_str()), + Some("ingest_report.v1") + ); assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1)); } diff --git a/crates/kebab-cli/tests/cli_ingest_stdin.rs b/crates/kebab-cli/tests/cli_ingest_stdin.rs index 1c83eb5..cca634c 100644 --- a/crates/kebab-cli/tests/cli_ingest_stdin.rs +++ b/crates/kebab-cli/tests/cli_ingest_stdin.rs @@ -73,13 +73,18 @@ max_context_tokens = 8000 workspace = workspace.display(), data = data.display(), ), - ).unwrap(); + ) + .unwrap(); let bin = env!("CARGO_BIN_EXE_kebab"); let mut child = Command::new(bin) .args([ - "--json", "--config", cfg_path.to_str().unwrap(), - "ingest-stdin", "--title", "X", + "--json", + "--config", + cfg_path.to_str().unwrap(), + "ingest-stdin", + "--title", + "X", ]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) @@ -91,10 +96,17 @@ max_context_tokens = 8000 stdin.write_all(b"## Body\n\nbody text.\n").unwrap(); } let out = child.wait_with_output().unwrap(); - assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr)); + assert!( + out.status.success(), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); let stdout = String::from_utf8_lossy(&out.stdout); let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap(); - 
assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1")); + assert_eq!( + v.get("schema_version").and_then(|s| s.as_str()), + Some("ingest_report.v1") + ); assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1)); } diff --git a/crates/kebab-cli/tests/cli_readonly_quiet.rs b/crates/kebab-cli/tests/cli_readonly_quiet.rs index 26ab305..b69d084 100644 --- a/crates/kebab-cli/tests/cli_readonly_quiet.rs +++ b/crates/kebab-cli/tests/cli_readonly_quiet.rs @@ -112,7 +112,13 @@ fn kebab_readonly_env_blocks_ingest() { fn readonly_json_mode_emits_error_v1() { let (tmp, ws) = fixture_workspace(); let out = Command::new(kebab_bin()) - .args(["--readonly", "--json", "ingest", "--root", ws.to_str().unwrap()]) + .args([ + "--readonly", + "--json", + "ingest", + "--root", + ws.to_str().unwrap(), + ]) .envs(xdg_envs(tmp.path())) .output() .unwrap(); @@ -164,12 +170,22 @@ fn quiet_flag_suppresses_progress_stderr() { fn quiet_with_json_stdout_has_report_stderr_is_empty() { let (tmp, ws) = fixture_workspace(); let out = Command::new(kebab_bin()) - .args(["--quiet", "--json", "ingest", "--root", ws.to_str().unwrap()]) + .args([ + "--quiet", + "--json", + "ingest", + "--root", + ws.to_str().unwrap(), + ]) .envs(xdg_envs(tmp.path())) .output() .unwrap(); - assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr)); + assert!( + out.status.success(), + "stderr: {}", + String::from_utf8_lossy(&out.stderr) + ); let stderr = String::from_utf8_lossy(&out.stderr); assert!(stderr.is_empty(), "expected empty stderr, got: {stderr}"); let stdout = String::from_utf8_lossy(&out.stdout); diff --git a/crates/kebab-cli/tests/ingest_progress_cli.rs b/crates/kebab-cli/tests/ingest_progress_cli.rs index 34e3b5a..f00891d 100644 --- a/crates/kebab-cli/tests/ingest_progress_cli.rs +++ b/crates/kebab-cli/tests/ingest_progress_cli.rs @@ -90,12 +90,7 @@ fn ingest_human_non_tty_emits_progress_lines_to_stderr() { // target is `hidden` and progress 
lines go to stderr instead. let (tmp, ws) = fixture_workspace(); let mut cmd = Command::new(kebab_bin()); - cmd.args([ - "ingest", - "--root", - ws.to_str().unwrap(), - "--summary-only", - ]); + cmd.args(["ingest", "--root", ws.to_str().unwrap(), "--summary-only"]); for (k, v) in xdg_envs(tmp.path()) { cmd.env(k, v); } @@ -155,8 +150,14 @@ fn ingest_json_progress_lines_carry_kind_and_ts() { saw_completed = true; // Counts mirror the report. let counts = v.get("counts").unwrap(); - assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(2)); - assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2)); + assert_eq!( + counts.get("scanned").and_then(serde_json::Value::as_u64), + Some(2) + ); + assert_eq!( + counts.get("new").and_then(serde_json::Value::as_u64), + Some(2) + ); } } assert!(saw_scan_started, "missing scan_started event"); diff --git a/crates/kebab-cli/tests/reset_cli.rs b/crates/kebab-cli/tests/reset_cli.rs index d45ff8f..7dc518b 100644 --- a/crates/kebab-cli/tests/reset_cli.rs +++ b/crates/kebab-cli/tests/reset_cli.rs @@ -50,9 +50,18 @@ fn reset_data_only_yes_removes_data_dir_and_keeps_config() { ); assert!(!xdg_data.join("kebab").exists(), "data dir should be gone"); - assert!(!xdg_cache.join("kebab").exists(), "cache dir should be gone"); - assert!(!xdg_state.join("kebab").exists(), "state dir should be gone"); - assert!(xdg_cfg.join("kebab/marker").exists(), "config dir preserved"); + assert!( + !xdg_cache.join("kebab").exists(), + "cache dir should be gone" + ); + assert!( + !xdg_state.join("kebab").exists(), + "state dir should be gone" + ); + assert!( + xdg_cfg.join("kebab/marker").exists(), + "config dir preserved" + ); } #[test] @@ -101,7 +110,11 @@ fn reset_data_only_yes_json_emits_reset_report_v1() { .env("XDG_STATE_HOME", tmp.path().join("state")) .output() .unwrap(); - assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr)); + assert!( + out.status.success(), + "stderr: {}", + 
String::from_utf8_lossy(&out.stderr) + ); let v: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap(); assert_eq!( diff --git a/crates/kebab-cli/tests/wire_ask_multi_hop.rs b/crates/kebab-cli/tests/wire_ask_multi_hop.rs index 49fce55..a2ea2e0 100644 --- a/crates/kebab-cli/tests/wire_ask_multi_hop.rs +++ b/crates/kebab-cli/tests/wire_ask_multi_hop.rs @@ -32,10 +32,9 @@ fn schema_path(name: &str) -> PathBuf { } fn parse_schema(name: &str) -> serde_json::Value { - let text = std::fs::read_to_string(schema_path(name)) - .unwrap_or_else(|e| panic!("read {name}: {e}")); - serde_json::from_str(&text) - .unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}")) + let text = + std::fs::read_to_string(schema_path(name)).unwrap_or_else(|e| panic!("read {name}: {e}")); + serde_json::from_str(&text).unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}")) } #[test] diff --git a/crates/kebab-cli/tests/wire_ask_stream.rs b/crates/kebab-cli/tests/wire_ask_stream.rs index 98c995a..582e858 100644 --- a/crates/kebab-cli/tests/wire_ask_stream.rs +++ b/crates/kebab-cli/tests/wire_ask_stream.rs @@ -41,8 +41,7 @@ fn relax_score_gate(cfg: &Path) { #[ignore = "requires real Ollama on 127.0.0.1:11434"] fn stream_emits_ndjson_events_on_stderr() { let dir = tempfile::tempdir().unwrap(); - let (cfg, workspace, _data) = - common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); + let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); relax_score_gate(&cfg); fs::write( workspace.join("a.md"), @@ -93,12 +92,8 @@ fn stream_emits_ndjson_events_on_stderr() { // stdout: last line is answer.v1 (backwards compat with the // non-streaming path — same wire shape, just emitted after the // ndjson event stream rather than instead of it). 
- let final_line = stdout - .lines() - .last() - .expect("stdout has at least one line"); - let answer: Value = - serde_json::from_str(final_line).expect("stdout final line = answer.v1"); + let final_line = stdout.lines().last().expect("stdout has at least one line"); + let answer: Value = serde_json::from_str(final_line).expect("stdout final line = answer.v1"); assert_eq!(answer["schema_version"], "answer.v1"); } @@ -109,8 +104,7 @@ fn non_stream_path_unchanged() { // emits a single `answer.v1` line on stdout — fb-33 must not // perturb the existing wire surface. let dir = tempfile::tempdir().unwrap(); - let (cfg, workspace, _data) = - common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); + let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); relax_score_gate(&cfg); fs::write( workspace.join("a.md"), @@ -140,8 +134,7 @@ fn stream_cancels_when_stderr_closes() { use std::process::{Command, Stdio}; let dir = tempfile::tempdir().unwrap(); - let (cfg, workspace, _data) = - common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); + let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); relax_score_gate(&cfg); fs::write( workspace.join("a.md"), @@ -198,15 +191,10 @@ fn stream_cancels_when_stderr_closes() { #[ignore = "requires real Ollama on 127.0.0.1:11434"] fn stream_score_gate_refusal_emits_only_retrieval_done() { let dir = tempfile::tempdir().unwrap(); - let (cfg, workspace, _data) = - common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); + let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b"); // Intentionally NO relax_score_gate — keep the default 0.30 // so the thin-doc + unrelated-query combo trips refusal. 
- fs::write( - workspace.join("a.md"), - "# Title\n\nrust is a language.\n", - ) - .unwrap(); + fs::write(workspace.join("a.md"), "# Title\n\nrust is a language.\n").unwrap(); common::ingest(&cfg, &workspace); let (stdout, stderr) = @@ -230,12 +218,8 @@ fn stream_score_gate_refusal_emits_only_retrieval_done() { ); // Stdout still has answer.v1 with grounded=false. - let final_line = stdout - .lines() - .last() - .expect("stdout has at least one line"); - let answer: Value = - serde_json::from_str(final_line).expect("answer.v1"); + let final_line = stdout.lines().last().expect("stdout has at least one line"); + let answer: Value = serde_json::from_str(final_line).expect("answer.v1"); assert_eq!(answer["schema_version"], "answer.v1"); assert_eq!(answer["grounded"], false); } diff --git a/crates/kebab-cli/tests/wire_bulk_search.rs b/crates/kebab-cli/tests/wire_bulk_search.rs index 2b48f17..457efba 100644 --- a/crates/kebab-cli/tests/wire_bulk_search.rs +++ b/crates/kebab-cli/tests/wire_bulk_search.rs @@ -21,7 +21,11 @@ fn cargo_bin() -> &'static str { env!("CARGO_BIN_EXE_kebab") } -fn run_bulk_with_stdin(cfg: &std::path::Path, stdin_body: &str, json: bool) -> std::process::Output { +fn run_bulk_with_stdin( + cfg: &std::path::Path, + stdin_body: &str, + json: bool, +) -> std::process::Output { let mut cmd = Command::new(cargo_bin()); cmd.arg("--config").arg(cfg).arg("search").arg("--bulk"); if json { @@ -94,7 +98,10 @@ fn empty_stdin_returns_empty_results_with_zero_summary() { let out = run_bulk_with_stdin(&cfg, "", true); assert!(out.status.success()); let stdout = String::from_utf8_lossy(&out.stdout); - assert!(stdout.trim().is_empty(), "expected empty stdout, got: {stdout}"); + assert!( + stdout.trim().is_empty(), + "expected empty stdout, got: {stdout}" + ); let stderr = String::from_utf8_lossy(&out.stderr); assert!(stderr.contains("bulk_summary: total=0 succeeded=0 failed=0")); } diff --git a/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs 
b/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs index 6242024..8e18ce6 100644 --- a/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs +++ b/crates/kebab-cli/tests/wire_citation_5_variants_unchanged.rs @@ -19,7 +19,10 @@ fn line_variant_serialization_unchanged() { assert_eq!(v["end"], 2); assert_eq!(v["section"], "§14"); // Must not bleed Code-variant keys. - assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!( + v.get("line_start").is_none(), + "line_start must be absent: {v}" + ); assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); assert!(v.get("code").is_none(), "code must be absent: {v}"); } @@ -48,7 +51,10 @@ fn page_variant_serialization_unchanged() { let v = serde_json::to_value(&c).unwrap(); assert_eq!(v["kind"], "page"); assert_eq!(v["page"], 13); - assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!( + v.get("line_start").is_none(), + "line_start must be absent: {v}" + ); assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); } @@ -67,7 +73,10 @@ fn region_variant_serialization_unchanged() { assert_eq!(v["y"], 20); assert_eq!(v["w"], 100); assert_eq!(v["h"], 200); - assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!( + v.get("line_start").is_none(), + "line_start must be absent: {v}" + ); } #[test] @@ -79,7 +88,10 @@ fn caption_variant_serialization_unchanged() { let v = serde_json::to_value(&c).unwrap(); assert_eq!(v["kind"], "caption"); assert_eq!(v["model"], "qwen2.5-vl:7b"); - assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!( + v.get("line_start").is_none(), + "line_start must be absent: {v}" + ); } #[test] @@ -95,6 +107,9 @@ fn time_variant_serialization_unchanged() { assert_eq!(v["start_ms"], 1000); assert_eq!(v["end_ms"], 5000); assert_eq!(v["speaker"], "Alice"); - assert!(v.get("line_start").is_none(), "line_start must be absent: {v}"); + assert!( + 
v.get("line_start").is_none(), + "line_start must be absent: {v}" + ); assert!(v.get("symbol").is_none(), "symbol must be absent: {v}"); } diff --git a/crates/kebab-cli/tests/wire_fetch.rs b/crates/kebab-cli/tests/wire_fetch.rs index 550720b..afd8ccc 100644 --- a/crates/kebab-cli/tests/wire_fetch.rs +++ b/crates/kebab-cli/tests/wire_fetch.rs @@ -24,10 +24,8 @@ fn fetch_chunk_json_emits_fetch_result_v1() { common::ingest(&cfg, &workspace); // Find chunk_id via search. - let (search_stdout, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "--k", "1", "apples"], - ); + let (search_stdout, _) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "apples"]); let search: Value = serde_json::from_str(search_stdout.trim()) .unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}")); let chunk_id = search["hits"][0]["chunk_id"] @@ -35,10 +33,7 @@ fn fetch_chunk_json_emits_fetch_result_v1() { .expect("chunk_id on first hit") .to_string(); - let (stdout, _) = common::run_fetch_with_args( - &cfg, - &["--json", "chunk", &chunk_id], - ); + let (stdout, _) = common::run_fetch_with_args(&cfg, &["--json", "chunk", &chunk_id]); let v: Value = serde_json::from_str(stdout.trim()) .unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}")); assert_eq!(v["schema_version"], "fetch_result.v1"); @@ -59,10 +54,8 @@ fn fetch_doc_json_with_max_tokens_truncates() { common::ingest(&cfg, &workspace); // Find doc_id via search. 
- let (search_stdout, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "--k", "1", "Lorem"], - ); + let (search_stdout, _) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "Lorem"]); let search: Value = serde_json::from_str(search_stdout.trim()) .unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}")); let doc_id = search["hits"][0]["doc_id"] @@ -70,10 +63,8 @@ fn fetch_doc_json_with_max_tokens_truncates() { .expect("doc_id on first hit") .to_string(); - let (stdout, _) = common::run_fetch_with_args( - &cfg, - &["--json", "doc", &doc_id, "--max-tokens", "20"], - ); + let (stdout, _) = + common::run_fetch_with_args(&cfg, &["--json", "doc", &doc_id, "--max-tokens", "20"]); let v: Value = serde_json::from_str(stdout.trim()) .unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}")); assert_eq!(v["kind"], "doc"); diff --git a/crates/kebab-cli/tests/wire_search_filters.rs b/crates/kebab-cli/tests/wire_search_filters.rs index 71ba48c..0a1b66b 100644 --- a/crates/kebab-cli/tests/wire_search_filters.rs +++ b/crates/kebab-cli/tests/wire_search_filters.rs @@ -32,12 +32,9 @@ fn search_with_doc_id_filter_returns_only_target_doc() { common::ingest(&cfg, &workspace); // First, search without a doc-id filter to find what doc_ids exist. 
- let (stdout, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "rust"], - ); - let resp: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let (stdout, _) = common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]); + let resp: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); let hits = resp["hits"].as_array().expect("hits array"); assert!( hits.len() >= 2, @@ -147,15 +144,19 @@ fn search_with_media_filter_md_alias_normalizes_to_markdown() { let (cfg, workspace, _data) = common::write_config(dir.path(), 30); // Only a markdown file — the `md` alias should match it. - fs::write(workspace.join("notes.md"), "# Notes\n\nrust async programming\n").unwrap(); + fs::write( + workspace.join("notes.md"), + "# Notes\n\nrust async programming\n", + ) + .unwrap(); common::ingest(&cfg, &workspace); let (stdout, _) = common::run_search_with_args( &cfg, &["--json", "--mode", "lexical", "--media", "md", "rust"], ); - let resp: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let resp: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); let hits = resp["hits"].as_array().expect("hits array"); assert!( @@ -189,10 +190,8 @@ fn search_with_tag_filter_matches_frontmatter_tags() { common::ingest(&cfg, &workspace); // Without filter — both docs must produce hits. 
- let (unfiltered, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "rust"], - ); + let (unfiltered, _) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]); let uresp: Value = serde_json::from_str(unfiltered.trim()) .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}")); let uhits = uresp["hits"].as_array().expect("unfiltered hits array"); @@ -254,10 +253,8 @@ fn search_with_two_tag_filters_returns_or_within_tags() { common::ingest(&cfg, &workspace); // Without filter: all three docs produce hits. - let (unfiltered, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "rust"], - ); + let (unfiltered, _) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]); let uresp: Value = serde_json::from_str(unfiltered.trim()) .unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}")); let uhits = uresp["hits"].as_array().expect("unfiltered hits array"); @@ -270,10 +267,7 @@ fn search_with_two_tag_filters_returns_or_within_tags() { let (filtered, _) = common::run_search_with_args( &cfg, &[ - "--json", "--mode", "lexical", - "--tag", "rust", - "--tag", "async", - "rust", + "--json", "--mode", "lexical", "--tag", "rust", "--tag", "async", "rust", ], ); let fresp: Value = serde_json::from_str(filtered.trim()) @@ -301,6 +295,12 @@ fn search_with_two_tag_filters_returns_or_within_tags() { .collect(); let has_a = paths.iter().any(|p| p.ends_with("a.md")); let has_b = paths.iter().any(|p| p.ends_with("b.md")); - assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}"); - assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}"); + assert!( + has_a, + "--tag rust must include a.md (rust-tagged): paths={paths:?}" + ); + assert!( + has_b, + "--tag async must include b.md (async-tagged): paths={paths:?}" + ); } diff --git a/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs 
b/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs index c3d7d24..53a23b8 100644 --- a/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs +++ b/crates/kebab-cli/tests/wire_search_hit_no_code_fields.rs @@ -5,7 +5,7 @@ //! inject spurious keys into the existing markdown corpus wire shape. use kebab_core::{ - Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind, + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, ScoreKind, SearchHit, WorkspacePath, }; diff --git a/crates/kebab-cli/tests/wire_search_response.rs b/crates/kebab-cli/tests/wire_search_response.rs index 740081a..26b24ec 100644 --- a/crates/kebab-cli/tests/wire_search_response.rs +++ b/crates/kebab-cli/tests/wire_search_response.rs @@ -23,12 +23,10 @@ fn search_json_emits_search_response_v1_wrapper() { fs::write(workspace.join("a.md"), "# T\n\napples are red.\n").unwrap(); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "apples"], - ); - let v: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "apples"]); + let v: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); assert_eq!(v["schema_version"], "search_response.v1"); assert!(v["hits"].is_array(), "hits must be array, got {v}"); assert!( @@ -67,8 +65,8 @@ fn search_json_truncates_with_max_tokens() { &cfg, &["--json", "--mode", "lexical", "--max-tokens", "30", "rust"], ); - let v: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let v: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); assert_eq!( v["truncated"], true, "30-token cap must trip truncation: {v}" @@ -88,10 +86,8 @@ fn 
search_json_cursor_paginates() { } common::ingest(&cfg, &workspace); - let (page1, _) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "--k", "2", "rust"], - ); + let (page1, _) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "2", "rust"]); let v1: Value = serde_json::from_str(page1.trim()) .unwrap_or_else(|e| panic!("page1 not JSON: {page1:?}: {e}")); let cursor = v1["next_cursor"] @@ -101,14 +97,7 @@ fn search_json_cursor_paginates() { let (page2, _) = common::run_search_with_args( &cfg, &[ - "--json", - "--mode", - "lexical", - "--k", - "2", - "--cursor", - cursor, - "rust", + "--json", "--mode", "lexical", "--k", "2", "--cursor", cursor, "rust", ], ); let v2: Value = serde_json::from_str(page2.trim()) @@ -118,23 +107,13 @@ fn search_json_cursor_paginates() { .as_array() .expect("page1 hits array") .iter() - .map(|h| { - h["chunk_id"] - .as_str() - .expect("chunk_id string") - .to_string() - }) + .map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string()) .collect(); let p2_ids: Vec = v2["hits"] .as_array() .expect("page2 hits array") .iter() - .map(|h| { - h["chunk_id"] - .as_str() - .expect("chunk_id string") - .to_string() - }) + .map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string()) .collect(); assert!( !p2_ids.is_empty(), @@ -161,10 +140,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() { common::ingest(&cfg, &workspace); // Get a valid cursor first. 
- let (page1_stdout, _) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--json", "--k", "1", "apples"], - ); + let (page1_stdout, _) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "--k", "1", "apples"]); let v1: Value = serde_json::from_str(page1_stdout.trim()).expect("json"); let cursor = v1["next_cursor"] .as_str() @@ -181,16 +158,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() { let cfg_str = cfg.to_str().expect("utf8"); let out = std::process::Command::new(exe) .args([ - "--config", - cfg_str, - "--json", - "search", - "--mode", - "lexical", - "--json", - "--cursor", - &cursor, - "apples", + "--config", cfg_str, "--json", "search", "--mode", "lexical", "--json", "--cursor", + &cursor, "apples", ]) .output() .expect("kebab search --cursor"); @@ -234,10 +203,8 @@ fn search_plain_emits_truncated_hint_to_stderr() { } common::ingest(&cfg, &workspace); - let (_stdout, stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--max-tokens", "30", "rust"], - ); + let (_stdout, stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--max-tokens", "30", "rust"]); assert!( stderr.contains("[truncated;"), "stderr must carry truncated hint: {stderr:?}" @@ -254,10 +221,7 @@ fn search_plain_emits_short_query_hint_to_stderr() { let (cfg, workspace, _data) = common::write_config(dir.path(), 30); common::ingest(&cfg, &workspace); - let (_stdout, stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "ab"], - ); + let (_stdout, stderr) = common::run_search_with_args(&cfg, &["--mode", "lexical", "ab"]); assert!( stderr.contains("[hint]"), "stderr must carry short-query hint: {stderr:?}" @@ -278,18 +242,18 @@ fn search_json_emits_hint_field_for_short_query() { let (cfg, workspace, _data) = common::write_config(dir.path(), 30); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "ab"], - ); 
- let v: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "ab"]); + let v: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); assert!( v["hits"].as_array().unwrap().is_empty(), "empty hits expected for short query in empty KB: {v}" ); assert_eq!( - v["hint"].as_str().expect("hint field set on short empty result"), + v["hint"] + .as_str() + .expect("hint field set on short empty result"), "3자 이상 키워드 권장 (trigram tokenizer 제약)", "hint must carry the standard advisory: {v}" ); @@ -305,12 +269,10 @@ fn search_json_omits_hint_field_when_query_is_long_enough() { let (cfg, workspace, _data) = common::write_config(dir.path(), 30); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--json", "--mode", "lexical", "abc"], - ); - let v: Value = serde_json::from_str(stdout.trim()) - .unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "abc"]); + let v: Value = + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}")); assert!( v.get("hint").is_none(), "hint must be absent for ≥3-char queries: {v}" diff --git a/crates/kebab-cli/tests/wire_search_score_kind.rs b/crates/kebab-cli/tests/wire_search_score_kind.rs index f90c177..c1cb0ec 100644 --- a/crates/kebab-cli/tests/wire_search_score_kind.rs +++ b/crates/kebab-cli/tests/wire_search_score_kind.rs @@ -16,10 +16,8 @@ fn lexical_mode_hits_carry_bm25_score_kind() { doc_with_term(&workspace); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--json", "rust"], - ); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]); let v: 
Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); let hits = v["hits"].as_array().expect("hits array"); assert!(!hits.is_empty(), "expected at least 1 hit"); @@ -40,10 +38,8 @@ fn old_wire_reader_compat_score_kind_optional_field() { doc_with_term(&workspace); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--json", "rust"], - ); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]); let v: Value = serde_json::from_str(stdout.trim()).unwrap(); let hit = &v["hits"][0]; assert!(hit.get("score_kind").is_some(), "score_kind always emitted"); diff --git a/crates/kebab-cli/tests/wire_search_stale.rs b/crates/kebab-cli/tests/wire_search_stale.rs index 483c4a8..8400702 100644 --- a/crates/kebab-cli/tests/wire_search_stale.rs +++ b/crates/kebab-cli/tests/wire_search_stale.rs @@ -59,15 +59,14 @@ fn search_json_includes_indexed_at_and_stale() { .get("hits") .and_then(|h| h.as_array()) .unwrap_or_else(|| panic!("expected hits array, got {stdout}")); - let first = arr.first().unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}")); + let first = arr + .first() + .unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}")); assert!( first.get("indexed_at").is_some(), "missing indexed_at in {first}" ); - assert!( - first.get("stale").is_some(), - "missing stale in {first}" - ); + assert!(first.get("stale").is_some(), "missing stale in {first}"); assert_eq!( first["stale"], false, "freshly ingested doc must not be stale at default 30d threshold" diff --git a/crates/kebab-cli/tests/wire_search_trace.rs b/crates/kebab-cli/tests/wire_search_trace.rs index 4b8daff..0162cbb 100644 --- a/crates/kebab-cli/tests/wire_search_trace.rs +++ b/crates/kebab-cli/tests/wire_search_trace.rs @@ -12,10 +12,8 @@ fn search_trace_json_includes_trace_block() { fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); 
common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--trace", "--json", "rust"], - ); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--trace", "--json", "rust"]); let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); assert_eq!(v["schema_version"], "search_response.v1"); assert!(v["trace"].is_object(), "trace block present"); @@ -33,12 +31,13 @@ fn search_without_trace_omits_trace_field() { fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--json", "rust"], - ); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]); let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); - assert!(v.get("trace").is_none(), "trace field absent without --trace"); + assert!( + v.get("trace").is_none(), + "trace field absent without --trace" + ); } #[test] @@ -48,10 +47,8 @@ fn search_trace_lexical_mode_vector_list_empty() { fs::write(workspace.join("doc1.md"), "# Title\n\nrust async hello\n").unwrap(); common::ingest(&cfg, &workspace); - let (stdout, _stderr) = common::run_search_with_args( - &cfg, - &["--mode", "lexical", "--trace", "--json", "rust"], - ); + let (stdout, _stderr) = + common::run_search_with_args(&cfg, &["--mode", "lexical", "--trace", "--json", "rust"]); let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); assert_eq!(v["trace"]["vector"].as_array().unwrap().len(), 0); assert_eq!(v["trace"]["timing"]["vector_ms"], 0); diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs index b0b1b19..7fa693d 100644 --- a/crates/kebab-config/src/lib.rs +++ b/crates/kebab-config/src/lib.rs @@ -420,12 +420,16 @@ pub struct PdfCfg { impl PdfCfg { pub fn defaults() -> Self { - Self { ocr: 
PdfOcrCfg::defaults() } + Self { + ocr: PdfOcrCfg::defaults(), + } } } impl Default for PdfCfg { - fn default() -> Self { Self::defaults() } + fn default() -> Self { + Self::defaults() + } } /// v0.20.x ingest log surface: structured ndjson log written per ingest run. @@ -444,7 +448,9 @@ pub struct LoggingCfg { pub ingest_log_dir: PathBuf, } -fn default_ingest_log_enabled() -> bool { true } +fn default_ingest_log_enabled() -> bool { + true +} fn default_ingest_log_dir() -> PathBuf { PathBuf::from("{state_dir}/logs") } @@ -531,10 +537,18 @@ impl PdfOcrCfg { /// metro-korea.pdf page 8/9/13) 의 OCR 을 강제 timeout 시켜 본문 indexed 손실. /// **conservative starting point 180s 로 재조정** + dogfood evidence 기반 sweet spot /// 점진적 축소 정책. user 가 `[pdf.ocr] request_timeout_secs = N` 으로 직접 tune. -fn default_pdf_ocr_request_timeout_secs() -> u64 { 180 } -fn default_pdf_ocr_valid_ratio() -> f32 { 0.5 } -fn default_pdf_ocr_min_char_count() -> u32 { 20 } -fn default_pdf_ocr_lang_hint() -> Option { Some("kor".to_string()) } +fn default_pdf_ocr_request_timeout_secs() -> u64 { + 180 +} +fn default_pdf_ocr_valid_ratio() -> f32 { + 0.5 +} +fn default_pdf_ocr_min_char_count() -> u32 { + 20 +} +fn default_pdf_ocr_lang_hint() -> Option { + Some("kor".to_string()) +} /// p9-fb-14: TUI-only configuration. Currently a single `theme` /// selector (`"dark"` / `"light"`); future fields (custom role @@ -675,8 +689,7 @@ impl Config { explain_default: false, max_context_tokens: 8000, multi_hop_max_depth: default_multi_hop_max_depth(), - multi_hop_max_sub_queries_per_iter: - default_multi_hop_max_sub_queries_per_iter(), + multi_hop_max_sub_queries_per_iter: default_multi_hop_max_sub_queries_per_iter(), multi_hop_max_pool_chunks: default_multi_hop_max_pool_chunks(), nli_threshold: default_nli_threshold(), }, @@ -1015,11 +1028,7 @@ impl Config { "KEBAB_IMAGE_OCR_ENDPOINT" => { // Empty env value is treated the same as "fall back // to models.llm.endpoint" — i.e. set None. 
- self.image.ocr.endpoint = if v.is_empty() { - None - } else { - Some(v.clone()) - }; + self.image.ocr.endpoint = if v.is_empty() { None } else { Some(v.clone()) }; } "KEBAB_IMAGE_OCR_LANGUAGES" => { // Comma-separated list, e.g. "eng,kor". @@ -1319,7 +1328,10 @@ theme = "dark" #[test] fn env_overrides_chunking_target_tokens() { let mut env = HashMap::new(); - env.insert("KEBAB_CHUNKING_TARGET_TOKENS".to_string(), "777".to_string()); + env.insert( + "KEBAB_CHUNKING_TARGET_TOKENS".to_string(), + "777".to_string(), + ); let c = Config::defaults().apply_env(&env); assert_eq!(c.chunking.target_tokens, 777); } @@ -1331,7 +1343,10 @@ theme = "dark" "KEBAB_MODELS_LLM_ENDPOINT".to_string(), "http://10.0.0.1:11434".to_string(), ); - env.insert("KEBAB_MODELS_LLM_TEMPERATURE".to_string(), "0.7".to_string()); + env.insert( + "KEBAB_MODELS_LLM_TEMPERATURE".to_string(), + "0.7".to_string(), + ); let c = Config::defaults().apply_env(&env); assert_eq!(c.models.llm.endpoint, "http://10.0.0.1:11434"); assert!((c.models.llm.temperature - 0.7).abs() < 1e-6); @@ -1361,8 +1376,7 @@ theme = "dark" /// shared with the OCR-side invariant via [`LEGACY_PRE_TIMEOUT_TOML`]. #[test] fn legacy_config_without_request_timeout_secs_uses_default() { - let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML) - .expect("parse legacy config"); + let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config"); assert_eq!(c.models.llm.request_timeout_secs, 300); } @@ -1391,10 +1405,7 @@ theme = "dark" /// existing configs that omit the new field keep behaving identically. #[test] fn default_ocr_request_timeout_secs_is_300() { - assert_eq!( - Config::defaults().image.ocr.request_timeout_secs, - 300 - ); + assert_eq!(Config::defaults().image.ocr.request_timeout_secs, 300); } #[test] @@ -1414,8 +1425,7 @@ theme = "dark" /// with the LLM-side invariant via [`LEGACY_PRE_TIMEOUT_TOML`]. 
#[test] fn legacy_config_without_ocr_request_timeout_secs_uses_default() { - let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML) - .expect("parse legacy config"); + let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config"); assert_eq!(c.image.ocr.request_timeout_secs, 300); } @@ -1428,10 +1438,7 @@ theme = "dark" #[test] fn default_multi_hop_max_sub_queries_per_iter_is_5() { - assert_eq!( - Config::defaults().rag.multi_hop_max_sub_queries_per_iter, - 5 - ); + assert_eq!(Config::defaults().rag.multi_hop_max_sub_queries_per_iter, 5); } #[test] @@ -1445,10 +1452,7 @@ theme = "dark" #[test] fn env_overrides_multi_hop_knobs() { let mut env = HashMap::new(); - env.insert( - "KEBAB_RAG_MULTI_HOP_MAX_DEPTH".to_string(), - "5".to_string(), - ); + env.insert("KEBAB_RAG_MULTI_HOP_MAX_DEPTH".to_string(), "5".to_string()); env.insert( "KEBAB_RAG_MULTI_HOP_MAX_SUB_QUERIES_PER_ITER".to_string(), "7".to_string(), @@ -1470,8 +1474,7 @@ theme = "dark" /// (that fixture also predates the multi_hop_* fields). #[test] fn legacy_config_without_multi_hop_knobs_uses_defaults() { - let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML) - .expect("parse legacy config"); + let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config"); assert_eq!(c.rag.multi_hop_max_depth, 3); assert_eq!(c.rag.multi_hop_max_sub_queries_per_iter, 5); // v0.18 dogfood (post-PR-7): pool default 30 → 15. @@ -1504,8 +1507,7 @@ theme = "dark" /// all PR-9c-1 fields). 
#[test] fn legacy_config_without_nli_uses_defaults() { - let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML) - .expect("parse legacy config"); + let c: Config = toml::from_str(LEGACY_PRE_TIMEOUT_TOML).expect("parse legacy config"); assert_eq!(c.rag.nli_threshold, 0.0); assert_eq!( c.models.nli.model, @@ -1705,7 +1707,11 @@ max_context_tokens = 8000 "[workspace]\ninclude = [\"**/*.md\", \"**/*.txt\"]", ); let parsed: Result = toml::from_str(&toml_text); - assert!(parsed.is_ok(), "legacy include must not break load: {:?}", parsed.err()); + assert!( + parsed.is_ok(), + "legacy include must not break load: {:?}", + parsed.err() + ); let cfg = parsed.unwrap(); assert_eq!(cfg.workspace.root, "/tmp/kebab-legacy"); } @@ -1715,7 +1721,10 @@ max_context_tokens = 8000 #[test] fn workspace_cfg_has_only_root_and_exclude_fields() { let ws = Config::defaults().workspace; - let WorkspaceCfg { root: _, exclude: _ } = &ws; + let WorkspaceCfg { + root: _, + exclude: _, + } = &ws; } #[test] @@ -1727,9 +1736,10 @@ max_context_tokens = 8000 #[test] fn env_override_stale_threshold() { let c = Config::defaults(); - let env: HashMap = [ - ("KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(), "7".to_string()), - ] + let env: HashMap = [( + "KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(), + "7".to_string(), + )] .into_iter() .collect(); let c = c.apply_env(&env); @@ -1744,9 +1754,10 @@ max_context_tokens = 8000 // `fb27_tests::file_negative_stale_threshold_returns_config_invalid`) // is the spec-required hard error surface. 
let c = Config::defaults(); - let env: HashMap = [ - ("KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(), "-5".to_string()), - ] + let env: HashMap = [( + "KEBAB_SEARCH_STALE_THRESHOLD_DAYS".to_string(), + "-5".to_string(), + )] .into_iter() .collect(); let c = c.apply_env(&env); @@ -1765,7 +1776,10 @@ max_context_tokens = 8000 std::env::set_var("XDG_CONFIG_HOME", "/tmp/kebabtest-xdg-config"); } let p = Config::xdg_config_path(); - assert_eq!(p, PathBuf::from("/tmp/kebabtest-xdg-config/kebab/config.toml")); + assert_eq!( + p, + PathBuf::from("/tmp/kebabtest-xdg-config/kebab/config.toml") + ); // SAFETY: scope-local restore. unsafe { match prev { @@ -1810,10 +1824,7 @@ max_context_tokens = 8000 let base = Config::defaults(); let mut toml_text = toml::to_string(&base).unwrap(); // Inject max_file_bytes override into the [ingest.code] table. - toml_text = toml_text.replace( - "max_file_bytes = 262144", - "max_file_bytes = 524288", - ); + toml_text = toml_text.replace("max_file_bytes = 262144", "max_file_bytes = 524288"); let cfg: Config = toml::from_str(&toml_text).unwrap(); assert_eq!(cfg.ingest.code.max_file_bytes, 524_288); } @@ -1828,7 +1839,8 @@ mod fb27_tests { fn config_invalid_carries_path_and_cause() { let nonexistent = PathBuf::from("/this/path/should/not/exist/kebab.toml"); let err = Config::from_file(&nonexistent).unwrap_err(); - let signal = err.downcast_ref::() + let signal = err + .downcast_ref::() .expect("from_file error should downcast to ConfigInvalid"); assert_eq!(signal.path, nonexistent); assert!(!signal.cause.is_empty(), "cause should be non-empty"); @@ -1840,7 +1852,8 @@ mod fb27_tests { let p = dir.path().join("bad.toml"); std::fs::write(&p, "this is not [valid toml").unwrap(); let err = Config::from_file(&p).unwrap_err(); - let signal = err.downcast_ref::() + let signal = err + .downcast_ref::() .expect("malformed TOML should downcast to ConfigInvalid"); assert_eq!(signal.path, p); assert!(!signal.cause.is_empty(), "cause should be non-empty"); 
@@ -1864,13 +1877,11 @@ mod fb27_tests { toml_text.contains("stale_threshold_days = 30"), "default value drifted; update test fixture" ); - toml_text = toml_text.replace( - "stale_threshold_days = 30", - "stale_threshold_days = -5", - ); + toml_text = toml_text.replace("stale_threshold_days = 30", "stale_threshold_days = -5"); std::fs::write(&p, &toml_text).unwrap(); let err = Config::from_file(&p).unwrap_err(); - let signal = err.downcast_ref::() + let signal = err + .downcast_ref::() .expect("negative stale_threshold_days should downcast to ConfigInvalid"); assert_eq!(signal.path, p); assert!( diff --git a/crates/kebab-config/src/paths.rs b/crates/kebab-config/src/paths.rs index edf638a..98e3262 100644 --- a/crates/kebab-config/src/paths.rs +++ b/crates/kebab-config/src/paths.rs @@ -157,7 +157,9 @@ mod tests { #[test] fn xdg_data_home_set_replaces_var() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _guard = XdgGuard::capture(); // SAFETY: lock held for the duration of this test. unsafe { std::env::set_var("XDG_DATA_HOME", "/custom/path") }; @@ -168,7 +170,9 @@ mod tests { #[test] fn xdg_data_home_unset_uses_default() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _guard = XdgGuard::capture(); // SAFETY: lock held for the duration of this test. unsafe { std::env::remove_var("XDG_DATA_HOME") }; @@ -181,7 +185,9 @@ mod tests { #[test] fn xdg_with_no_default_resolves_to_empty_when_unset() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _guard = XdgGuard::capture(); // SAFETY: lock held for the duration of this test. 
unsafe { std::env::remove_var("XDG_DATA_HOME") }; @@ -193,7 +199,9 @@ mod tests { #[test] fn leading_tilde_expands_to_home() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let home = std::env::var("HOME").expect("HOME must be set in tests"); let p = expand_path("~/runs", ""); assert_eq!(p, PathBuf::from(home).join("runs")); @@ -229,7 +237,9 @@ mod tests { #[test] fn tilde_path_ignores_base_dir() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let home = std::env::var("HOME").expect("HOME must be set in tests"); let base = Path::new("/tmp/ignored-cfg"); let p = expand_path_with_base("~/x", "", base); @@ -238,7 +248,9 @@ mod tests { #[test] fn xdg_var_path_ignores_base_dir() { - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _guard = XdgGuard::capture(); // SAFETY: lock held for the duration of this test. unsafe { std::env::set_var("XDG_DATA_HOME", "/xdg/data") }; @@ -255,7 +267,9 @@ mod tests { // Order matters: substitute `{data_dir}` (which itself contains // an unexpanded `${XDG_DATA_HOME}` and `~`), then the other two // resolve the result. - let _lock = ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); let _guard = XdgGuard::capture(); // SAFETY: lock held for the duration of this test. 
unsafe { std::env::set_var("XDG_DATA_HOME", "/xdg/data") }; diff --git a/crates/kebab-config/tests/pdf_ocr.rs b/crates/kebab-config/tests/pdf_ocr.rs index 77ea14b..b1e8cc4 100644 --- a/crates/kebab-config/tests/pdf_ocr.rs +++ b/crates/kebab-config/tests/pdf_ocr.rs @@ -2,13 +2,15 @@ // // Integration tests for [pdf.ocr] config section (v0.20.0 sub-item 1). -use std::collections::HashMap; use kebab_config::{Config, PdfCfg}; +use std::collections::HashMap; // Test 1: toml roundtrip — spec §4.5 line 1034-1047 example block. // Config requires many required fields; test the [pdf] section via PdfCfg wrapper. #[derive(serde::Deserialize)] -struct PdfWrapper { pdf: PdfCfg } +struct PdfWrapper { + pdf: PdfCfg, +} #[test] fn pdf_ocr_toml_roundtrip() { @@ -50,7 +52,10 @@ fn pdf_ocr_defaults_off_with_qwen_3b() { assert_eq!(cfg.pdf.ocr.engine, "ollama-vision"); assert_eq!(cfg.pdf.ocr.model, "qwen2.5vl:3b"); assert!(cfg.pdf.ocr.endpoint.is_none()); - assert_eq!(cfg.pdf.ocr.languages, vec!["eng".to_string(), "kor".to_string()]); + assert_eq!( + cfg.pdf.ocr.languages, + vec!["eng".to_string(), "kor".to_string()] + ); assert_eq!(cfg.pdf.ocr.max_pixels, 2048); assert_eq!(cfg.pdf.ocr.request_timeout_secs, 180); // Bug #11: 600 → 60 → 180 (HOTFIXES 2026-05-28) assert!((cfg.pdf.ocr.valid_ratio_threshold - 0.5).abs() < 1e-6); @@ -63,9 +68,15 @@ fn pdf_ocr_defaults_off_with_qwen_3b() { fn pdf_ocr_env_overrides() { let mut env: HashMap = HashMap::new(); env.insert("KEBAB_PDF_OCR_ENABLED".to_string(), "true".to_string()); - env.insert("KEBAB_PDF_OCR_MODEL".to_string(), "qwen2.5vl:7b".to_string()); + env.insert( + "KEBAB_PDF_OCR_MODEL".to_string(), + "qwen2.5vl:7b".to_string(), + ); env.insert("KEBAB_PDF_OCR_ALWAYS_ON".to_string(), "true".to_string()); - env.insert("KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD".to_string(), "0.75".to_string()); + env.insert( + "KEBAB_PDF_OCR_VALID_RATIO_THRESHOLD".to_string(), + "0.75".to_string(), + ); let cfg = Config::defaults().apply_env(&env); diff --git 
a/crates/kebab-core/src/citation.rs b/crates/kebab-core/src/citation.rs index a73f74f..5107bea 100644 --- a/crates/kebab-core/src/citation.rs +++ b/crates/kebab-core/src/citation.rs @@ -63,7 +63,9 @@ impl Citation { /// fragment; they live in the structured wire object. pub fn to_uri(&self) -> String { match self { - Citation::Line { path, start, end, .. } => { + Citation::Line { + path, start, end, .. + } => { if start == end { format!("{}#L{}", path.0, start) } else { diff --git a/crates/kebab-core/src/document.rs b/crates/kebab-core/src/document.rs index 643875b..25b2856 100644 --- a/crates/kebab-core/src/document.rs +++ b/crates/kebab-core/src/document.rs @@ -235,7 +235,9 @@ mod tests { href: "h".into(), }, Inline::Strong { - children: vec![Inline::Text { text: "bold".into() }], + children: vec![Inline::Text { + text: "bold".into(), + }], }, Inline::Emph { children: vec![Inline::Text { text: "em".into() }], diff --git a/crates/kebab-core/src/ids.rs b/crates/kebab-core/src/ids.rs index 46f60ce..c43b661 100644 --- a/crates/kebab-core/src/ids.rs +++ b/crates/kebab-core/src/ids.rs @@ -14,8 +14,7 @@ use crate::asset::WorkspacePath; use crate::document::SourceSpan; use crate::errors::CoreError; use crate::versions::{ - ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, - ParserVersion, + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion, }; macro_rules! newtype_id { @@ -54,9 +53,7 @@ fn validate_hex32(s: &str) -> Result<(), CoreError> { ))); } if !s.bytes().all(|b| b.is_ascii_hexdigit()) { - return Err(CoreError::InvalidId(format!( - "non-hex character in {s:?}" - ))); + return Err(CoreError::InvalidId(format!("non-hex character in {s:?}"))); } Ok(()) } diff --git a/crates/kebab-core/src/lib.rs b/crates/kebab-core/src/lib.rs index d8838c0..8cb57d6 100644 --- a/crates/kebab-core/src/lib.rs +++ b/crates/kebab-core/src/lib.rs @@ -7,67 +7,63 @@ //! See `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` for //! 
the canonical type bodies — this crate is the byte-for-byte mirror. -pub mod ids; -pub mod versions; -pub mod media; +pub mod answer; pub mod asset; -pub mod document; pub mod chunk; pub mod citation; -pub mod metadata; -pub mod search; -pub mod answer; +pub mod document; +pub mod errors; +pub mod fetch; +pub mod ids; pub mod ingest; pub mod jobs; -pub mod vector; -pub mod errors; -pub mod traits; +pub mod media; +pub mod metadata; pub mod normalize; -pub mod fetch; +pub mod search; +pub mod traits; +pub mod vector; +pub mod versions; // Re-export the most commonly used items at the crate root, mirroring the // public surface listed in the task spec. -pub use ids::{ - AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId, - id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding, - id_for_index, id_from, +pub use answer::{ + Answer, AnswerCitation, AnswerRetrievalSummary, HopKind, HopRecord, ModelRef, RefusalReason, + TokenUsage, TraceId, Turn, VerificationSummary, }; -pub use versions::{ - ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, - ParserVersion, PromptTemplateVersion, SchemaVersion, -}; -pub use media::{AudioType, Checksum, ImageType, Lang, MediaType}; pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath}; -pub use document::{ - AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock, - HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion, - OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment, -}; pub use chunk::Chunk; pub use citation::Citation; -pub use metadata::{ - Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType, - TrustLevel, +pub use document::{ + AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, + Inline, ListBlock, ModelCaption, OcrRegion, OcrText, SourceSpan, TableBlock, TextBlock, + Transcript, TranscriptSegment, }; -pub use search::{ - BulkSearchItem, BulkSearchResponse, 
BulkSearchSummary, DocFilter, DocSummary, IndexBytes, MEDIA_KINDS, - RetrievalDetail, ScoreKind, SearchFilters, SearchHit, SearchMode, SearchOpts, SearchQuery, SearchTrace, - TraceCandidate, TraceFusionInput, TraceTiming, -}; -pub use answer::{ - Answer, AnswerCitation, AnswerRetrievalSummary, HopKind, HopRecord, ModelRef, - RefusalReason, TokenUsage, TraceId, Turn, VerificationSummary, +pub use errors::CoreError; +pub use fetch::{FetchKind, FetchOpts, FetchQuery, FetchResult}; +pub use ids::{ + AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId, id_for_asset, id_for_block, + id_for_chunk, id_for_doc, id_for_embedding, id_for_index, id_from, }; pub use ingest::{IngestItem, IngestItemKind, IngestReport, SkipExamples}; pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; -pub use vector::{VectorHit, VectorRecord}; -pub use errors::CoreError; -pub use traits::{ - ChatSessionRepo, ChatSessionRow, ChatTurnRow, ChunkPolicy, Chunker, DocumentStore, - Embedder, EmbeddingInput, EmbeddingKind, ExtractConfig, ExtractContext, Extractor, - FinishReason, GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector, - SourceScope, TokenChunk, VectorStore, -}; +pub use media::{AudioType, Checksum, ImageType, Lang, MediaType}; +pub use metadata::{Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType, TrustLevel}; pub use normalize::{nfc, to_posix}; -pub use fetch::{FetchKind, FetchOpts, FetchQuery, FetchResult}; +pub use search::{ + BulkSearchItem, BulkSearchResponse, BulkSearchSummary, DocFilter, DocSummary, IndexBytes, + MEDIA_KINDS, RetrievalDetail, ScoreKind, SearchFilters, SearchHit, SearchMode, SearchOpts, + SearchQuery, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming, +}; +pub use traits::{ + ChatSessionRepo, ChatSessionRow, ChatTurnRow, ChunkPolicy, Chunker, DocumentStore, Embedder, + EmbeddingInput, EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason, + GenerateRequest, JobRepo, LanguageModel, Retriever, 
SourceConnector, SourceScope, TokenChunk, + VectorStore, +}; +pub use vector::{VectorHit, VectorRecord}; +pub use versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion, + PromptTemplateVersion, SchemaVersion, +}; diff --git a/crates/kebab-core/src/search.rs b/crates/kebab-core/src/search.rs index 8c6e6e0..9297372 100644 --- a/crates/kebab-core/src/search.rs +++ b/crates/kebab-core/src/search.rs @@ -317,7 +317,10 @@ mod tests { #[test] fn search_filters_serialize_with_serde_default_compat() { - let old: SearchFilters = serde_json::from_str(r#"{"tags_any":[],"lang":null,"path_glob":null,"trust_min":null}"#).unwrap(); + let old: SearchFilters = serde_json::from_str( + r#"{"tags_any":[],"lang":null,"path_glob":null,"trust_min":null}"#, + ) + .unwrap(); assert!(old.media.is_empty()); assert!(old.ingested_after.is_none()); assert!(old.doc_id.is_none()); @@ -349,10 +352,7 @@ mod tests { }; let v = serde_json::to_value(&t).unwrap(); assert_eq!(v["timing"]["lexical_ms"], 12); - assert_eq!( - v["lexical"][0]["score"].as_f64().unwrap() as f32, - 0.42_f32 - ); + assert_eq!(v["lexical"][0]["score"].as_f64().unwrap() as f32, 0.42_f32); let back: SearchTrace = serde_json::from_value(v).unwrap(); assert_eq!(back, t); } @@ -490,7 +490,10 @@ mod tests { }; let v = serde_json::to_value(&hit).unwrap(); assert!(v.get("repo").is_none(), "repo should be omitted when None"); - assert!(v.get("code_lang").is_none(), "code_lang should be omitted when None"); + assert!( + v.get("code_lang").is_none(), + "code_lang should be omitted when None" + ); } #[test] diff --git a/crates/kebab-core/src/traits.rs b/crates/kebab-core/src/traits.rs index 83c33f1..e22a338 100644 --- a/crates/kebab-core/src/traits.rs +++ b/crates/kebab-core/src/traits.rs @@ -5,6 +5,7 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use crate::answer::{ModelRef, TokenUsage}; use crate::asset::{RawAsset, WorkspacePath}; use 
crate::chunk::Chunk; use crate::document::{Block, CanonicalDocument}; @@ -16,7 +17,6 @@ use crate::vector::{VectorHit, VectorRecord}; use crate::versions::{ ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion, }; -use crate::answer::{ModelRef, TokenUsage}; // ── Helper input types (§7.1) ───────────────────────────────────────────── @@ -115,21 +115,13 @@ pub trait SourceConnector { pub trait Extractor: Send + Sync { fn supports(&self, media_type: &MediaType) -> bool; fn parser_version(&self) -> ParserVersion; - fn extract( - &self, - ctx: &ExtractContext<'_>, - bytes: &[u8], - ) -> anyhow::Result; + fn extract(&self, ctx: &ExtractContext<'_>, bytes: &[u8]) -> anyhow::Result; } pub trait Chunker: Send + Sync { fn chunker_version(&self) -> ChunkerVersion; fn policy_hash(&self, policy: &ChunkPolicy) -> String; - fn chunk( - &self, - doc: &CanonicalDocument, - policy: &ChunkPolicy, - ) -> anyhow::Result>; + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result>; } pub trait Embedder: Send + Sync { @@ -178,10 +170,8 @@ pub trait DocumentStore { /// `assets.workspace_path` is "last-registered path" — it /// flip-flops on every ingest. Prefer `get_asset` (by asset_id) /// when you have a `CanonicalDocument.source_asset_id`. - fn get_asset_by_workspace_path( - &self, - path: &WorkspacePath, - ) -> anyhow::Result>; + fn get_asset_by_workspace_path(&self, path: &WorkspacePath) + -> anyhow::Result>; /// Look up a document row by its workspace path. 
Used by the /// document-centric skip path in `try_skip_unchanged` to avoid the @@ -238,12 +228,7 @@ pub trait VectorStore { pub trait JobRepo { fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result; fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>; - fn finish( - &self, - id: &JobId, - status: JobStatus, - error: Option<&str>, - ) -> anyhow::Result<()>; + fn finish(&self, id: &JobId, status: JobStatus, error: Option<&str>) -> anyhow::Result<()>; fn list(&self, filter: &JobFilter) -> anyhow::Result>; } diff --git a/crates/kebab-embed-local/src/lib.rs b/crates/kebab-embed-local/src/lib.rs index 91253d0..c4749f7 100644 --- a/crates/kebab-embed-local/src/lib.rs +++ b/crates/kebab-embed-local/src/lib.rs @@ -78,8 +78,8 @@ impl FastembedEmbedder { // 3. Verify dim match BEFORE loading the model — if the config // is wrong we want to fail without paying the ONNX // initialization cost. - let model_info = TextEmbedding::get_model_info(&model_name) - .context("fastembed: get_model_info")?; + let model_info = + TextEmbedding::get_model_info(&model_name).context("fastembed: get_model_info")?; check_dim(model_info.dim, config.models.embedding.dimensions)?; tracing::info!( @@ -103,8 +103,7 @@ impl FastembedEmbedder { cache_dir = %cache_dir.display(), "loading embedding model (first run downloads model weights — ~470MB for e5-small, ~1.3GB for e5-large)" ); - let inner = TextEmbedding::try_new(opts) - .context("fastembed: TextEmbedding::try_new")?; + let inner = TextEmbedding::try_new(opts).context("fastembed: TextEmbedding::try_new")?; let dimensions = model_info.dim; tracing::info!( target: "kebab-embed-local", @@ -320,8 +319,10 @@ mod tests { fn check_dim_rejects_384_vs_1024() { let err = check_dim(384, 1024).expect_err("dim mismatch must error"); let msg = format!("{err}"); - assert!(msg.contains("384") && msg.contains("1024"), - "error must mention both dims, got: {msg}"); + assert!( + msg.contains("384") && msg.contains("1024"), + 
"error must mention both dims, got: {msg}" + ); } // expand_path tests live in `kb-config::paths`. The adapter imports diff --git a/crates/kebab-embed-local/tests/embed_model.rs b/crates/kebab-embed-local/tests/embed_model.rs index 2212184..11708ae 100644 --- a/crates/kebab-embed-local/tests/embed_model.rs +++ b/crates/kebab-embed-local/tests/embed_model.rs @@ -110,11 +110,7 @@ fn document_and_query_yield_different_vectors() { assert_eq!(out[1].len(), 1024); // Both vectors are L2-normalized → cosine similarity == dot product. - let cos: f32 = out[0] - .iter() - .zip(out[1].iter()) - .map(|(a, b)| a * b) - .sum(); + let cos: f32 = out[0].iter().zip(out[1].iter()).map(|(a, b)| a * b).sum(); // Same text, different prefix → vectors must NOT be identical. assert!( cos < 0.9999, @@ -232,8 +228,8 @@ const SNAPSHOT_HASH_BASELINE: u64 = 0; #[ignore = "loads ONNX model; CI-only"] fn snapshot_aggregate_hash_is_stable() { let emb = shared_embedder(); - let fixture_path = - std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/embed/known-sentences.json"); + let fixture_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/embed/known-sentences.json"); let raw = std::fs::read_to_string(&fixture_path).expect("read fixture"); let json: serde_json::Value = serde_json::from_str(&raw).expect("parse fixture json"); let sentences: Vec = json["sentences"] diff --git a/crates/kebab-embed/src/lib.rs b/crates/kebab-embed/src/lib.rs index 077a872..000a556 100644 --- a/crates/kebab-embed/src/lib.rs +++ b/crates/kebab-embed/src/lib.rs @@ -19,9 +19,7 @@ // Per spec §7.2 — these are the only public-surface types this crate offers. // Adding new types is forbidden by the task contract. 
-pub use kebab_core::{ - Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion, -}; +pub use kebab_core::{Embedder, EmbeddingInput, EmbeddingKind, EmbeddingModelId, EmbeddingVersion}; // ── Test helper ─────────────────────────────────────────────────────────── diff --git a/crates/kebab-embed/src/mock.rs b/crates/kebab-embed/src/mock.rs index a458562..fe1131b 100644 --- a/crates/kebab-embed/src/mock.rs +++ b/crates/kebab-embed/src/mock.rs @@ -52,11 +52,7 @@ impl MockEmbedder { /// Construct with `seed = 0`. Use [`Self::with_seed`] to pick a different /// seed (e.g., to verify two embedders with the same identity but /// different seeds yield different vectors). - pub fn new( - model_id: EmbeddingModelId, - version: EmbeddingVersion, - dimensions: usize, - ) -> Self { + pub fn new(model_id: EmbeddingModelId, version: EmbeddingVersion, dimensions: usize) -> Self { Self { model_id, version, diff --git a/crates/kebab-eval/src/compare.rs b/crates/kebab-eval/src/compare.rs index 606d00d..6a5986f 100644 --- a/crates/kebab-eval/src/compare.rs +++ b/crates/kebab-eval/src/compare.rs @@ -19,9 +19,7 @@ use kebab_core::{ChunkId, DocumentId}; use kebab_store_sqlite::SqliteStore; use crate::loader::load_golden_set; -use crate::metrics::{ - AggregateMetrics, compute_aggregate_with_config, resolve_golden_path, -}; +use crate::metrics::{AggregateMetrics, compute_aggregate_with_config, resolve_golden_path}; use crate::types::{GoldenQuery, QueryResult}; /// Strict-mode behavior pivot used by [`CompareOpts::strict_chunker_version`]. @@ -151,7 +149,11 @@ pub fn compare_runs_with_config( /// not a wire schema. Stable enough for snapshot tests. 
pub fn render_report_md(report: &CompareReport) -> String { let mut out = String::new(); - let _ = writeln!(out, "# Eval compare: `{}` vs `{}`", report.run_a, report.run_b); + let _ = writeln!( + out, + "# Eval compare: `{}` vs `{}`", + report.run_a, report.run_b + ); let _ = writeln!(out); let _ = writeln!(out, "## Aggregate deltas"); let _ = writeln!(out); @@ -171,7 +173,13 @@ pub fn render_report_md(report: &CompareReport) -> String { ), ); } - let _ = writeln!(out, "| MRR | {} | {} | {} |", fmt(a.mrr), fmt(b.mrr), fmt_delta(a.mrr, b.mrr)); + let _ = writeln!( + out, + "| MRR | {} | {} | {} |", + fmt(a.mrr), + fmt(b.mrr), + fmt_delta(a.mrr, b.mrr) + ); for k in crate::metrics::TOP_K_VARIANTS { let _ = writeln!( out, @@ -236,8 +244,16 @@ pub fn render_report_md(report: &CompareReport) -> String { ); let _ = writeln!(out); - let wins: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Win).collect(); - let losses: Vec<_> = report.per_query.iter().filter(|c| c.kind == ComparisonKind::Loss).collect(); + let wins: Vec<_> = report + .per_query + .iter() + .filter(|c| c.kind == ComparisonKind::Win) + .collect(); + let losses: Vec<_> = report + .per_query + .iter() + .filter(|c| c.kind == ComparisonKind::Loss) + .collect(); let regressions: Vec<_> = report .per_query .iter() @@ -370,8 +386,12 @@ fn build_per_query( let (a_rank, b_rank) = match gq { Some(g) => ( - a.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)), - b.and_then(|q| first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback)), + a.and_then(|q| { + first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback) + }), + b.and_then(|q| { + first_hit_rank(q, &g.expected_chunk_ids, &g.expected_doc_ids, fallback) + }), ), None => (None, None), }; @@ -401,8 +421,9 @@ fn classify( // an expected chunk to find. Without that, downgrade to Loss // so refusal-flow queries (no expected_*) don't appear as // regressions. 
- let has_expected = gq - .is_some_and(|g| !g.expected_chunk_ids.is_empty() || !g.expected_doc_ids.is_empty()); + let has_expected = gq.is_some_and(|g| { + !g.expected_chunk_ids.is_empty() || !g.expected_doc_ids.is_empty() + }); if has_expected { (ComparisonKind::Regression, Some("hit→miss".into())) } else { @@ -512,7 +533,10 @@ mod tests { total_queries: 0, failed_queries: 0, }; - let b = AggregateMetrics { mrr: 0.75, ..a.clone() }; + let b = AggregateMetrics { + mrr: 0.75, + ..a.clone() + }; let d = build_deltas(&a, &b, "exact"); assert!(d["citation_coverage"].is_null()); assert!(d["refusal_correctness"].is_null()); diff --git a/crates/kebab-eval/src/loader.rs b/crates/kebab-eval/src/loader.rs index 9112ec8..d1b2640 100644 --- a/crates/kebab-eval/src/loader.rs +++ b/crates/kebab-eval/src/loader.rs @@ -73,7 +73,10 @@ fn check_unique_ids(queries: &[GoldenQuery]) -> Result<()> { /// Read every doc_id / chunk_id referenced by `queries` and confirm /// SQLite has rows for them. Builds a sorted, deduplicated error /// message listing every missing ID. -pub(crate) fn validate_against_db(queries: &[GoldenQuery], cfg: &kebab_config::Config) -> Result<()> { +pub(crate) fn validate_against_db( + queries: &[GoldenQuery], + cfg: &kebab_config::Config, +) -> Result<()> { // Short-circuit when there is nothing to validate — saves opening // SQLite for golden sets that omit expected_*_ids entirely. 
let needs_check = queries diff --git a/crates/kebab-eval/src/metrics.rs b/crates/kebab-eval/src/metrics.rs index 2a6ad41..cdf5f0c 100644 --- a/crates/kebab-eval/src/metrics.rs +++ b/crates/kebab-eval/src/metrics.rs @@ -191,12 +191,18 @@ pub(crate) fn aggregate_from_rows( let total_queries = u32::try_from(rows.len()).unwrap_or(u32::MAX); let mut failed_queries: u32 = 0; - let mut hit_at_k: BTreeMap = - TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect(); - let mut recall_at_k_doc: BTreeMap = - TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect(); - let mut precision_at_k_chunk: BTreeMap = - TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect(); + let mut hit_at_k: BTreeMap = TOP_K_VARIANTS + .iter() + .map(|k| (*k, (0_u32, 0_u32))) + .collect(); + let mut recall_at_k_doc: BTreeMap = TOP_K_VARIANTS + .iter() + .map(|k| (*k, (0.0_f64, 0_u32))) + .collect(); + let mut precision_at_k_chunk: BTreeMap = TOP_K_VARIANTS + .iter() + .map(|k| (*k, (0.0_f64, 0_u32))) + .collect(); let mut mrr_sum: f64 = 0.0; let mut mrr_denom: u32 = 0; @@ -295,7 +301,10 @@ pub(crate) fn aggregate_from_rows( .filter(|h| h.rank <= *k) .map(|h| &h.doc_id) .collect(); - let covered = expected_docs.iter().filter(|d| topk_docs.contains(*d)).count(); + let covered = expected_docs + .iter() + .filter(|d| topk_docs.contains(*d)) + .count(); let frac = covered as f64 / expected_docs.len() as f64; entry.0 += frac; } @@ -419,14 +428,16 @@ fn ratio_or_zero(num: u32, denom: u32) -> f32 { #[cfg(test)] mod tests { use super::*; + use kebab_core::answer::{ + Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, TokenUsage, TraceId, + }; + use kebab_core::asset::WorkspacePath; + use kebab_core::media::Lang; + use kebab_core::versions::PromptTemplateVersion; use kebab_core::{ ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, SearchHit, SearchMode, }; - use kebab_core::asset::WorkspacePath; - use kebab_core::media::Lang; - use 
kebab_core::answer::{Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, TokenUsage, TraceId}; - use kebab_core::versions::PromptTemplateVersion; use time::OffsetDateTime; fn gq(id: &str, expected_chunks: &[&str], expected_docs: &[&str]) -> GoldenQuery { @@ -434,8 +445,14 @@ mod tests { id: id.into(), query: format!("q-{id}"), lang: Lang(String::new()), - expected_doc_ids: expected_docs.iter().map(|s| DocumentId((*s).into())).collect(), - expected_chunk_ids: expected_chunks.iter().map(|s| ChunkId((*s).into())).collect(), + expected_doc_ids: expected_docs + .iter() + .map(|s| DocumentId((*s).into())) + .collect(), + expected_chunk_ids: expected_chunks + .iter() + .map(|s| ChunkId((*s).into())) + .collect(), must_contain: vec![], forbidden: vec![], difficulty: None, @@ -478,7 +495,12 @@ mod tests { } } - fn qr(id: &str, hits: Vec, error: Option, answer: Option) -> QueryResult { + fn qr( + id: &str, + hits: Vec, + error: Option, + answer: Option, + ) -> QueryResult { QueryResult { query_id: id.into(), query: format!("q-{id}"), @@ -490,9 +512,12 @@ mod tests { } } - fn record(id: &str, hits: Vec, error: Option, answer: Option) - -> kebab_store_sqlite::EvalQueryResultRecord - { + fn record( + id: &str, + hits: Vec, + error: Option, + answer: Option, + ) -> kebab_store_sqlite::EvalQueryResultRecord { kebab_store_sqlite::EvalQueryResultRecord { query_id: id.into(), result_json: serde_json::to_string(&qr(id, hits, error, answer)).unwrap(), @@ -502,21 +527,28 @@ mod tests { fn answer(text: &str, grounded: bool, citation_paths: &[&str]) -> Answer { Answer { answer: text.into(), - citations: citation_paths.iter().map(|p| AnswerCitation { - marker: None, - citation: Citation::Line { - path: WorkspacePath::new((*p).into()).unwrap(), - start: 1, - end: 1, - section: None, - }, - // fb-32: synthetic eval citations don't exercise staleness. 
- indexed_at: OffsetDateTime::UNIX_EPOCH, - stale: false, - }).collect(), + citations: citation_paths + .iter() + .map(|p| AnswerCitation { + marker: None, + citation: Citation::Line { + path: WorkspacePath::new((*p).into()).unwrap(), + start: 1, + end: 1, + section: None, + }, + // fb-32: synthetic eval citations don't exercise staleness. + indexed_at: OffsetDateTime::UNIX_EPOCH, + stale: false, + }) + .collect(), grounded, refusal_reason: None, - model: ModelRef { id: "m".into(), provider: "p".into(), dimensions: None }, + model: ModelRef { + id: "m".into(), + provider: "p".into(), + dimensions: None, + }, embedding: None, prompt_template_version: PromptTemplateVersion("p@1".into()), retrieval: AnswerRetrievalSummary { @@ -528,7 +560,11 @@ mod tests { chunks_returned: 1, chunks_used: 1, }, - usage: TokenUsage { prompt_tokens: 1, completion_tokens: 1, latency_ms: 1 }, + usage: TokenUsage { + prompt_tokens: 1, + completion_tokens: 1, + latency_ms: 1, + }, created_at: OffsetDateTime::UNIX_EPOCH, conversation_id: None, turn_index: None, @@ -547,7 +583,17 @@ mod tests { ]; let rows = vec![ record("q1", vec![hit(1, "c1", "d1")], None, None), - record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None), + record( + "q2", + vec![ + hit(1, "x", "y"), + hit(2, "x", "y"), + hit(3, "x", "y"), + hit(4, "c2", "d2"), + ], + None, + None, + ), record("q3", vec![hit(1, "x", "y")], None, None), ]; let agg = aggregate_from_rows(&queries, &rows).unwrap(); @@ -568,7 +614,17 @@ mod tests { ]; let rows = vec![ record("q1", vec![hit(1, "c1", "d1")], None, None), - record("q2", vec![hit(1, "x", "y"), hit(2, "x", "y"), hit(3, "x", "y"), hit(4, "c2", "d2")], None, None), + record( + "q2", + vec![ + hit(1, "x", "y"), + hit(2, "x", "y"), + hit(3, "x", "y"), + hit(4, "c2", "d2"), + ], + None, + None, + ), record("q3", vec![hit(1, "x", "y")], None, None), ]; let agg = aggregate_from_rows(&queries, &rows).unwrap(); @@ -579,7 +635,12 @@ mod tests { 
fn recall_at_k_doc_partial() { // q1 expects {d1, d2}; top-3 returns {d1}. recall@3 = 0.5 let queries = vec![gq("q1", &[], &["d1", "d2"])]; - let rows = vec![record("q1", vec![hit(1, "c1", "d1"), hit(2, "c2", "d3")], None, None)]; + let rows = vec![record( + "q1", + vec![hit(1, "c1", "d1"), hit(2, "c2", "d3")], + None, + None, + )]; let agg = aggregate_from_rows(&queries, &rows).unwrap(); assert_eq!(agg.recall_at_k_doc[&3], 0.5); assert_eq!(agg.recall_at_k_doc[&10], 0.5); @@ -624,7 +685,11 @@ mod tests { let queries = vec![gq("q1", &[], &[])]; let rows = vec![record("q1", vec![], None, None)]; let agg = aggregate_from_rows(&queries, &rows).unwrap(); - assert!(agg.refusal_correctness.is_nan(), "got {}", agg.refusal_correctness); + assert!( + agg.refusal_correctness.is_nan(), + "got {}", + agg.refusal_correctness + ); } #[test] @@ -662,8 +727,16 @@ mod tests { let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)]; let agg = aggregate_from_rows(&queries, &rows).unwrap(); let json: serde_json::Value = serde_json::to_value(&agg).unwrap(); - assert!(json["citation_coverage"].is_null(), "expected null, got {:?}", json["citation_coverage"]); - assert!(json["refusal_correctness"].is_null(), "expected null, got {:?}", json["refusal_correctness"]); + assert!( + json["citation_coverage"].is_null(), + "expected null, got {:?}", + json["citation_coverage"] + ); + assert!( + json["refusal_correctness"].is_null(), + "expected null, got {:?}", + json["refusal_correctness"] + ); } #[test] @@ -791,10 +864,7 @@ mod tests { // q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2 // q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4 // Avg P@5 = 0.3. 
- let queries = vec![ - gq("q1", &["c1"], &["d1"]), - gq("q2", &["c1", "c2"], &["d2"]), - ]; + let queries = vec![gq("q1", &["c1"], &["d1"]), gq("q2", &["c1", "c2"], &["d2"])]; let rows = vec![ record( "q1", diff --git a/crates/kebab-eval/src/runner.rs b/crates/kebab-eval/src/runner.rs index 7912a09..45a9652 100644 --- a/crates/kebab-eval/src/runner.rs +++ b/crates/kebab-eval/src/runner.rs @@ -251,10 +251,7 @@ fn write_per_query_jsonl( // workspace-default does); resolve it before threading it into the // `{data_dir}` substitution of `runs_dir`. let resolved_data_dir = expand_path(&cfg.storage.data_dir, ""); - let runs_dir = expand_path( - &cfg.storage.runs_dir, - &resolved_data_dir.to_string_lossy(), - ); + let runs_dir = expand_path(&cfg.storage.runs_dir, &resolved_data_dir.to_string_lossy()); let run_dir = runs_dir.join(run_id); std::fs::create_dir_all(&run_dir) .with_context(|| format!("create run dir {}", run_dir.display()))?; diff --git a/crates/kebab-eval/tests/loader.rs b/crates/kebab-eval/tests/loader.rs index 704f173..5f137dc 100644 --- a/crates/kebab-eval/tests/loader.rs +++ b/crates/kebab-eval/tests/loader.rs @@ -61,7 +61,10 @@ fn loads_multi_hop_golden_fixture() { let single = qs.iter().filter(|q| q.id.starts_with("mh-s-")).count(); assert_eq!(cross_doc, 5, "expected 5 mh-c-* (cross-doc) questions"); assert_eq!(intra_doc, 5, "expected 5 mh-i-* (intra-doc) questions"); - assert_eq!(single, 5, "expected 5 mh-s-* (single-fact negative) questions"); + assert_eq!( + single, 5, + "expected 5 mh-s-* (single-fact negative) questions" + ); // Every question carries at least one `must_contain` so the // rule-based answer-correctness metric (P5-2) has a signal even diff --git a/crates/kebab-eval/tests/metrics_and_compare.rs b/crates/kebab-eval/tests/metrics_and_compare.rs index 9110e60..c1a8327 100644 --- a/crates/kebab-eval/tests/metrics_and_compare.rs +++ b/crates/kebab-eval/tests/metrics_and_compare.rs @@ -11,9 +11,8 @@ use std::path::PathBuf; use 
kebab_config::Config; use kebab_core::{ - ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, Lang, - RetrievalDetail, SearchHit, SearchMode, - asset::WorkspacePath, + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, Lang, RetrievalDetail, SearchHit, + SearchMode, asset::WorkspacePath, }; use kebab_eval::{ AggregateMetrics, CompareOpts, CompareReport, ComparisonKind, GoldenQuery, QueryResult, @@ -267,8 +266,11 @@ fn compare_runs_classifies_win_loss_draw_regression() { drop(store); let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); - let by_id: std::collections::HashMap<&str, &kebab_eval::QueryComparison> = - report.per_query.iter().map(|c| (c.query_id.as_str(), c)).collect(); + let by_id: std::collections::HashMap<&str, &kebab_eval::QueryComparison> = report + .per_query + .iter() + .map(|c| (c.query_id.as_str(), c)) + .collect(); assert_eq!(by_id["q-001"].kind, ComparisonKind::Loss); assert_eq!(by_id["q-002"].kind, ComparisonKind::Win); assert_eq!(by_id["q-003"].kind, ComparisonKind::Win); @@ -283,8 +285,20 @@ fn compare_strict_mode_refuses_chunker_version_mismatch() { let store = SqliteStore::open(&cfg).unwrap(); store.run_migrations().unwrap(); let now = OffsetDateTime::UNIX_EPOCH; - write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); - write_run(&store, "run_b", "test@2", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); + write_run( + &store, + "run_a", + "test@1", + now, + vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])], + ); + write_run( + &store, + "run_b", + "test@2", + now, + vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])], + ); drop(store); let opts = CompareOpts { @@ -305,7 +319,13 @@ fn compare_graceful_falls_back_to_doc_id() { let now = OffsetDateTime::UNIX_EPOCH; // Run A uses test@1 chunker; run B uses test@2 — chunk_ids no longer // align, but doc_ids do. 
- write_run(&store, "run_a", "test@1", now, vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])]); + write_run( + &store, + "run_a", + "test@1", + now, + vec![qr("q-001", vec![hit(1, "chunk-1", "doc-1")])], + ); write_run( &store, "run_b", @@ -319,7 +339,11 @@ fn compare_graceful_falls_back_to_doc_id() { let report = compare_runs_with_config(&cfg, "run_a", "run_b", &CompareOpts::default()).unwrap(); assert_eq!(report.deltas["chunker_version_match"], "fallback_doc"); - let q1 = report.per_query.iter().find(|c| c.query_id == "q-001").unwrap(); + let q1 = report + .per_query + .iter() + .find(|c| c.query_id == "q-001") + .unwrap(); // Both runs hit doc-1 at rank 1 → Draw. assert_eq!(q1.kind, ComparisonKind::Draw); assert_eq!(q1.a_hit_rank, Some(1)); @@ -374,13 +398,19 @@ fn compare_report_snapshot_matches_fixture() { .join("eval") .join("compare-1.json"); if std::env::var("UPDATE_SNAPSHOTS").is_ok() { - fs::write(&fixture, format!("{}\n", serde_json::to_string_pretty(&actual).unwrap())) - .unwrap(); + fs::write( + &fixture, + format!("{}\n", serde_json::to_string_pretty(&actual).unwrap()), + ) + .unwrap(); } let expected_text = fs::read_to_string(&fixture) .unwrap_or_else(|e| panic!("missing fixture {}: {e}", fixture.display())); let expected: serde_json::Value = serde_json::from_str(&expected_text).unwrap(); - assert_eq!(actual, expected, "compare report drift — re-run with UPDATE_SNAPSHOTS=1 if intended"); + assert_eq!( + actual, expected, + "compare report drift — re-run with UPDATE_SNAPSHOTS=1 if intended" + ); } /// Project a `CompareReport` to the stable-across-runs subset. diff --git a/crates/kebab-eval/tests/runner.rs b/crates/kebab-eval/tests/runner.rs index 9f26c3e..637b8ce6 100644 --- a/crates/kebab-eval/tests/runner.rs +++ b/crates/kebab-eval/tests/runner.rs @@ -147,7 +147,9 @@ fn lexical_opts() -> EvalRunOpts { /// guard must outlive the call so concurrent tests don't reset the /// var mid-run. 
fn run_with_golden R, R>(yaml: &Path, f: F) -> R { - let _g = GOLDEN_ENV_LOCK.lock().unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = GOLDEN_ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); // SAFETY: `KEBAB_EVAL_GOLDEN` is a benign env var; the GOLDEN_ENV_LOCK // serializes mutations so concurrent tests don't race. unsafe { diff --git a/crates/kebab-llm-local/src/ollama.rs b/crates/kebab-llm-local/src/ollama.rs index 3e797cf..7f45ff5 100644 --- a/crates/kebab-llm-local/src/ollama.rs +++ b/crates/kebab-llm-local/src/ollama.rs @@ -41,9 +41,7 @@ use std::io::{BufRead, BufReader}; use std::time::Duration; -use kebab_core::{ - FinishReason, GenerateRequest, LanguageModel, ModelRef, TokenChunk, TokenUsage, -}; +use kebab_core::{FinishReason, GenerateRequest, LanguageModel, ModelRef, TokenChunk, TokenUsage}; use serde::{Deserialize, Serialize}; use crate::error::LlmError; @@ -346,9 +344,9 @@ impl Iterator for OllamaStream { // misrouted reverse proxy returning 200). Per §10 // error taxonomy this is `Stream`, not // `Malformed`. - return Some(Err(anyhow::Error::from(LlmError::Stream( - truncate_body(&preview, 512), - )))); + return Some(Err(anyhow::Error::from(LlmError::Stream(truncate_body( + &preview, 512, + ))))); } // Mid-stream corruption — earlier lines parsed, this // one didn't. That's `Malformed`. @@ -364,9 +362,9 @@ impl Iterator for OllamaStream { // Server-side error envelope on a 200 stream. if let Some(err) = line.error { self.done = true; - return Some(Err(anyhow::Error::from(LlmError::Stream( - truncate_body(&err, 512), - )))); + return Some(Err(anyhow::Error::from(LlmError::Stream(truncate_body( + &err, 512, + ))))); } if line.done { @@ -451,11 +449,7 @@ fn map_send_error(err: reqwest::Error, endpoint: &str) -> LlmError { /// Map a non-2xx HTTP response to an [`LlmError`]. Pattern-matches on the /// 404 + "model" / "not found" body envelope to surface the actionable /// `ollama pull ` hint. 
-fn map_status_error( - status: reqwest::StatusCode, - body: &str, - model_id: &str, -) -> LlmError { +fn map_status_error(status: reqwest::StatusCode, body: &str, model_id: &str) -> LlmError { if status == reqwest::StatusCode::NOT_FOUND { let lower = body.to_ascii_lowercase(); // Heuristic: Ollama's "model not pulled" envelope is roughly @@ -473,10 +467,7 @@ fn map_status_error( return LlmError::ModelNotPulled(model_id.to_string()); } } - LlmError::Stream(truncate_body( - &format!("status={status} body={body}"), - 512, - )) + LlmError::Stream(truncate_body(&format!("status={status} body={body}"), 512)) } /// Truncate a body / error string to `n` characters, appending an @@ -491,7 +482,10 @@ fn truncate_body(s: &str, n: usize) -> String { return s.to_string(); } let mut out: String = s.chars().take(n).collect(); - out.push_str(&format!("... (truncated, original {} chars)", s.chars().count())); + out.push_str(&format!( + "... (truncated, original {} chars)", + s.chars().count() + )); out } @@ -512,11 +506,7 @@ mod tests { #[test] fn map_status_error_404_with_model_not_found_returns_not_pulled() { let body = r#"{"error":"model 'qwen2.5:7b-instruct' not found, try pulling it first"}"#; - let err = map_status_error( - reqwest::StatusCode::NOT_FOUND, - body, - "qwen2.5:7b-instruct", - ); + let err = map_status_error(reqwest::StatusCode::NOT_FOUND, body, "qwen2.5:7b-instruct"); match err { LlmError::ModelNotPulled(m) => assert_eq!(m, "qwen2.5:7b-instruct"), other => panic!("expected ModelNotPulled, got {other:?}"), @@ -540,11 +530,7 @@ mod tests { // The English "not found" substring is absent, but the model id // is echoed — heuristic should still route to ModelNotPulled. 
let body = r#"{"error":"모델 'qwen2.5:7b-instruct' 을(를) 찾을 수 없습니다"}"#; - let err = map_status_error( - reqwest::StatusCode::NOT_FOUND, - body, - "qwen2.5:7b-instruct", - ); + let err = map_status_error(reqwest::StatusCode::NOT_FOUND, body, "qwen2.5:7b-instruct"); assert!( matches!(err, LlmError::ModelNotPulled(ref m) if m == "qwen2.5:7b-instruct"), "expected ModelNotPulled for localized 404 body, got {err:?}", diff --git a/crates/kebab-llm-local/tests/streaming.rs b/crates/kebab-llm-local/tests/streaming.rs index 337b434..d5429a6 100644 --- a/crates/kebab-llm-local/tests/streaming.rs +++ b/crates/kebab-llm-local/tests/streaming.rs @@ -41,10 +41,7 @@ fn sample_request() -> GenerateRequest { /// Helper: drive `generate_stream` to completion on a blocking thread so /// the sync `OllamaLanguageModel` stays off the async runtime. -async fn collect_chunks( - cfg: Config, - req: GenerateRequest, -) -> anyhow::Result> { +async fn collect_chunks(cfg: Config, req: GenerateRequest) -> anyhow::Result> { tokio::task::spawn_blocking(move || -> anyhow::Result> { let llm = OllamaLanguageModel::new(&cfg)?; let stream = llm.generate_stream(req)?; @@ -58,10 +55,7 @@ async fn collect_chunks( /// `generate_stream` itself (rather than a stream-mid error). Used by the /// "unreachable endpoint" / "model not pulled" tests where the error /// surfaces on `.send()` before any chunks flow. 
-async fn run_expecting_request_error( - cfg: Config, - req: GenerateRequest, -) -> anyhow::Error { +async fn run_expecting_request_error(cfg: Config, req: GenerateRequest) -> anyhow::Error { tokio::task::spawn_blocking(move || -> anyhow::Result<()> { let llm = OllamaLanguageModel::new(&cfg)?; let _stream = llm.generate_stream(req)?; @@ -78,9 +72,12 @@ async fn run_expecting_request_error( async fn streamed_response_produces_tokens_then_done() { let server = MockServer::start().await; let body = concat!( - r#"{"response":"hi","done":false}"#, "\n", - r#"{"response":" there","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":3,"eval_count":2,"total_duration":1500000}"#, "\n", + r#"{"response":"hi","done":false}"#, + "\n", + r#"{"response":" there","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":3,"eval_count":2,"total_duration":1500000}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -96,7 +93,10 @@ async fn streamed_response_produces_tokens_then_done() { assert!(matches!(&chunks[0], TokenChunk::Token(t) if t == "hi")); assert!(matches!(&chunks[1], TokenChunk::Token(t) if t == " there")); match &chunks[2] { - TokenChunk::Done { finish_reason, usage } => { + TokenChunk::Done { + finish_reason, + usage, + } => { assert!(matches!(finish_reason, FinishReason::Stop)); assert_eq!(usage.prompt_tokens, 3); assert_eq!(usage.completion_tokens, 2); @@ -155,10 +155,13 @@ async fn multibyte_chars_within_a_line_round_trip() { let server = MockServer::start().await; let body = concat!( // "한국어" (Korean) — each char is 3 bytes in UTF-8. - r#"{"response":"한국어","done":false}"#, "\n", + r#"{"response":"한국어","done":false}"#, + "\n", // Followed by an emoji ZWJ sequence (4 bytes per scalar). 
- r#"{"response":"🦀","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":4,"total_duration":0}"#, "\n", + r#"{"response":"🦀","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":4,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -257,8 +260,10 @@ async fn other_4xx_maps_to_stream_error() { async fn done_reason_length_maps_to_finish_reason_length() { let server = MockServer::start().await; let body = concat!( - r#"{"response":"a","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"length","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, "\n", + r#"{"response":"a","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"length","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -281,8 +286,10 @@ async fn done_reason_length_maps_to_finish_reason_length() { async fn done_reason_abort_maps_to_finish_reason_aborted() { let server = MockServer::start().await; let body = concat!( - r#"{"response":"a","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"abort","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, "\n", + r#"{"response":"a","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"abort","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -312,9 +319,11 @@ async fn missing_eval_counts_default_to_zero() { // here — the comment documents the intent. let server = MockServer::start().await; let body = concat!( - r#"{"response":"hi","done":false}"#, "\n", + r#"{"response":"hi","done":false}"#, + "\n", // No prompt_eval_count / eval_count / total_duration. 
- r#"{"response":"","done":true,"done_reason":"stop"}"#, "\n", + r#"{"response":"","done":true,"done_reason":"stop"}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -339,9 +348,11 @@ async fn missing_eval_counts_default_to_zero() { async fn missing_done_reason_defaults_to_stop() { let server = MockServer::start().await; let body = concat!( - r#"{"response":"hi","done":false}"#, "\n", + r#"{"response":"hi","done":false}"#, + "\n", // Final frame omits done_reason entirely. - r#"{"response":"","done":true,"prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, "\n", + r#"{"response":"","done":true,"prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -406,8 +417,10 @@ async fn endpoint_with_trailing_slash_does_not_double_slash() { // fail the assertion. let server = MockServer::start().await; let body = concat!( - r#"{"response":"ok","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, "\n", + r#"{"response":"ok","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) @@ -451,8 +464,10 @@ async fn determinism_seed_zero_temp_zero_two_runs_identical() { // (#[ignore]) where reproducibility is modulo model-internal nondet. 
let server = MockServer::start().await; let body = concat!( - r#"{"response":"deterministic","done":false}"#, "\n", - r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, "\n", + r#"{"response":"deterministic","done":false}"#, + "\n", + r#"{"response":"","done":true,"done_reason":"stop","prompt_eval_count":1,"eval_count":1,"total_duration":0}"#, + "\n", ); Mock::given(method("POST")) .and(path("/api/generate")) diff --git a/crates/kebab-llm/src/mock.rs b/crates/kebab-llm/src/mock.rs index c63faef..3d8c4ed 100644 --- a/crates/kebab-llm/src/mock.rs +++ b/crates/kebab-llm/src/mock.rs @@ -36,9 +36,7 @@ //! - No tokenizer. `usage.prompt_tokens` / `completion_tokens` are whatever //! the constructor was given — the mock does not count. -use kebab_core::{ - FinishReason, GenerateRequest, LanguageModel, ModelRef, TokenChunk, TokenUsage, -}; +use kebab_core::{FinishReason, GenerateRequest, LanguageModel, ModelRef, TokenChunk, TokenUsage}; /// Deterministic test double. See module docs for the streaming recipe. 
pub struct MockLanguageModel { diff --git a/crates/kebab-mcp/src/error.rs b/crates/kebab-mcp/src/error.rs index b12e5ac..c7faafc 100644 --- a/crates/kebab-mcp/src/error.rs +++ b/crates/kebab-mcp/src/error.rs @@ -10,8 +10,7 @@ use kebab_app::classify; pub fn to_tool_error(err: &anyhow::Error) -> CallToolResult { let v1 = classify(err, false); let body = serde_json::to_string(&v1).unwrap_or_else(|_| { - r#"{"schema_version":"error.v1","code":"generic","message":"serialize failed"}"# - .to_string() + r#"{"schema_version":"error.v1","code":"generic","message":"serialize failed"}"#.to_string() }); CallToolResult::error(vec![Content::text(body)]) } diff --git a/crates/kebab-mcp/src/lib.rs b/crates/kebab-mcp/src/lib.rs index 4d11326..8bdb660 100644 --- a/crates/kebab-mcp/src/lib.rs +++ b/crates/kebab-mcp/src/lib.rs @@ -142,17 +142,13 @@ impl ServerHandler for KebabHandler { } "search" => { let args = request.arguments.unwrap_or_default(); - self.spawn_tool(args, |state, input| { - tools::search::handle(&state, input) - }) - .await + self.spawn_tool(args, |state, input| tools::search::handle(&state, input)) + .await } "ask" => { let args = request.arguments.unwrap_or_default(); - self.spawn_tool(args, |state, input| { - tools::ask::handle(&state, input) - }) - .await + self.spawn_tool(args, |state, input| tools::ask::handle(&state, input)) + .await } "ingest_file" => { let args = request.arguments.unwrap_or_default(); @@ -170,10 +166,8 @@ impl ServerHandler for KebabHandler { } "fetch" => { let args = request.arguments.unwrap_or_default(); - self.spawn_tool(args, |state, input| { - tools::fetch::handle(&state, input) - }) - .await + self.spawn_tool(args, |state, input| tools::fetch::handle(&state, input)) + .await } "bulk_search" => { let args = request.arguments.unwrap_or_default(); diff --git a/crates/kebab-mcp/src/tools/ask.rs b/crates/kebab-mcp/src/tools/ask.rs index 143bbdf..3815d29 100644 --- a/crates/kebab-mcp/src/tools/ask.rs +++ b/crates/kebab-mcp/src/tools/ask.rs 
@@ -51,9 +51,7 @@ pub fn handle(state: &KebabAppState, input: AskInput) -> CallToolResult { }; let cfg_clone = (*state.config).clone(); let result = match input.session_id { - Some(sid) => { - kebab_app::ask_with_session_with_config(cfg_clone, &sid, &input.query, opts) - } + Some(sid) => kebab_app::ask_with_session_with_config(cfg_clone, &sid, &input.query, opts), None => kebab_app::ask_with_config(cfg_clone, &input.query, opts), }; match result { diff --git a/crates/kebab-mcp/src/tools/fetch.rs b/crates/kebab-mcp/src/tools/fetch.rs index 3f0ea5b..93f2687 100644 --- a/crates/kebab-mcp/src/tools/fetch.rs +++ b/crates/kebab-mcp/src/tools/fetch.rs @@ -49,9 +49,7 @@ pub fn handle(state: &KebabAppState, input: FetchInput) -> CallToolResult { _ => return invalid_input("kind=span requires doc_id, line_start, line_end"), }, other => { - return invalid_input(&format!( - "unknown kind '{other}'; expected chunk|doc|span" - )); + return invalid_input(&format!("unknown kind '{other}'; expected chunk|doc|span")); } }; diff --git a/crates/kebab-mcp/src/tools/ingest_file.rs b/crates/kebab-mcp/src/tools/ingest_file.rs index 0bad2a6..cf47ad0 100644 --- a/crates/kebab-mcp/src/tools/ingest_file.rs +++ b/crates/kebab-mcp/src/tools/ingest_file.rs @@ -24,8 +24,9 @@ pub fn handle(state: &KebabAppState, input: IngestFileInput) -> CallToolResult { Ok(report) => match serde_json::to_value(&report) { Ok(mut v) => { if let serde_json::Value::Object(ref mut map) = v { - map.entry("schema_version".to_string()) - .or_insert_with(|| serde_json::Value::String("ingest_report.v1".to_string())); + map.entry("schema_version".to_string()).or_insert_with(|| { + serde_json::Value::String("ingest_report.v1".to_string()) + }); } match serde_json::to_string(&v) { Ok(json) => to_tool_success(json), diff --git a/crates/kebab-mcp/src/tools/ingest_stdin.rs b/crates/kebab-mcp/src/tools/ingest_stdin.rs index 5957711..fe46ce1 100644 --- a/crates/kebab-mcp/src/tools/ingest_stdin.rs +++ 
b/crates/kebab-mcp/src/tools/ingest_stdin.rs @@ -29,8 +29,9 @@ pub fn handle(state: &KebabAppState, input: IngestStdinInput) -> CallToolResult Ok(report) => match serde_json::to_value(&report) { Ok(mut v) => { if let serde_json::Value::Object(ref mut map) = v { - map.entry("schema_version".to_string()) - .or_insert_with(|| serde_json::Value::String("ingest_report.v1".to_string())); + map.entry("schema_version".to_string()).or_insert_with(|| { + serde_json::Value::String("ingest_report.v1".to_string()) + }); } match serde_json::to_string(&v) { Ok(json) => to_tool_success(json), diff --git a/crates/kebab-mcp/src/tools/mod.rs b/crates/kebab-mcp/src/tools/mod.rs index f06f91b..b798687 100644 --- a/crates/kebab-mcp/src/tools/mod.rs +++ b/crates/kebab-mcp/src/tools/mod.rs @@ -1,10 +1,10 @@ //! Tool implementations — one module per tool. -pub mod schema; -pub mod doctor; -pub mod search; pub mod ask; +pub mod bulk_search; +pub mod doctor; +pub mod fetch; pub mod ingest_file; pub mod ingest_stdin; -pub mod fetch; -pub mod bulk_search; +pub mod schema; +pub mod search; diff --git a/crates/kebab-mcp/src/tools/schema.rs b/crates/kebab-mcp/src/tools/schema.rs index 2bf5e34..a66d628 100644 --- a/crates/kebab-mcp/src/tools/schema.rs +++ b/crates/kebab-mcp/src/tools/schema.rs @@ -2,8 +2,8 @@ //! Input: {} (no args). Output: schema.v1 JSON. 
use rmcp::model::CallToolResult; -use serde::{Deserialize, Serialize}; use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use crate::error::{to_tool_error, to_tool_success}; use crate::state::KebabAppState; diff --git a/crates/kebab-mcp/src/tools/search.rs b/crates/kebab-mcp/src/tools/search.rs index 2586294..00e67be 100644 --- a/crates/kebab-mcp/src/tools/search.rs +++ b/crates/kebab-mcp/src/tools/search.rs @@ -79,15 +79,10 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult { let ingested_after = match input.ingested_after.as_deref() { Some(s) => { - match time::OffsetDateTime::parse( - s, - &time::format_description::well_known::Rfc3339, - ) { + match time::OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339) { Ok(ts) => Some(ts), Err(e) => { - return invalid_input(&format!( - "ingested_after: invalid RFC3339 '{s}': {e}" - )); + return invalid_input(&format!("ingested_after: invalid RFC3339 '{s}': {e}")); } } } @@ -152,8 +147,7 @@ pub fn handle(state: &KebabAppState, input: SearchInput) -> CallToolResult { "truncated": resp.truncated, }); if let Some(trace) = &resp.trace { - let trace_v = - serde_json::to_value(trace).unwrap_or(serde_json::Value::Null); + let trace_v = serde_json::to_value(trace).unwrap_or(serde_json::Value::Null); if let serde_json::Value::Object(ref mut map) = envelope { map.insert("trace".to_string(), trace_v); } diff --git a/crates/kebab-mcp/tests/error_mapping.rs b/crates/kebab-mcp/tests/error_mapping.rs index 739d986..7763972 100644 --- a/crates/kebab-mcp/tests/error_mapping.rs +++ b/crates/kebab-mcp/tests/error_mapping.rs @@ -23,7 +23,11 @@ async fn schema_tool_emits_error_v1_when_db_missing() { handler.state(), kebab_mcp::tools::schema::SchemaInput::default(), ); - assert_eq!(result.is_error, Some(true), "expected isError=true on missing DB"); + assert_eq!( + result.is_error, + Some(true), + "expected isError=true on missing DB" + ); let content = result.content.first().unwrap(); 
let text = match &content.raw { @@ -31,6 +35,9 @@ async fn schema_tool_emits_error_v1_when_db_missing() { other => panic!("expected text content, got {other:?}"), }; let v: serde_json::Value = serde_json::from_str(text).unwrap(); - assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("error.v1")); + assert_eq!( + v.get("schema_version").and_then(|s| s.as_str()), + Some("error.v1") + ); assert_eq!(v.get("code").and_then(|s| s.as_str()), Some("not_indexed")); } diff --git a/crates/kebab-mcp/tests/tools_call_ask.rs b/crates/kebab-mcp/tests/tools_call_ask.rs index 641e4bd..5a5ea0c 100644 --- a/crates/kebab-mcp/tests/tools_call_ask.rs +++ b/crates/kebab-mcp/tests/tools_call_ask.rs @@ -9,10 +9,7 @@ use rmcp::model::RawContent; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); - cfg.storage.model_dir = data_dir - .join("models") - .to_string_lossy() - .into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); diff --git a/crates/kebab-mcp/tests/tools_call_bulk_search.rs b/crates/kebab-mcp/tests/tools_call_bulk_search.rs index a326935..348a439 100644 --- a/crates/kebab-mcp/tests/tools_call_bulk_search.rs +++ b/crates/kebab-mcp/tests/tools_call_bulk_search.rs @@ -31,7 +31,11 @@ fn setup() -> (tempfile::TempDir, KebabHandler) { "# Alpha\n\nThis document mentions kebab and bread.", ) .unwrap(); - let scope = SourceScope { root: workspace_root.clone(), include: vec![], exclude: vec![] }; + let scope = SourceScope { + root: workspace_root.clone(), + include: vec![], + exclude: vec![], + }; let _ = kebab_app::ingest_with_config(config.clone(), scope, false).unwrap(); let state = KebabAppState::new(config, None); let handler = 
KebabHandler::new(state); @@ -39,7 +43,10 @@ fn setup() -> (tempfile::TempDir, KebabHandler) { } fn extract_json(result: &rmcp::model::CallToolResult) -> serde_json::Value { - assert!(!result.is_error.unwrap_or(false), "expected isError=false, got {result:?}"); + assert!( + !result.is_error.unwrap_or(false), + "expected isError=false, got {result:?}" + ); let content = result.content.first().expect("at least one content item"); let text = match &content.raw { RawContent::Text(t) => &t.text, @@ -89,7 +96,7 @@ async fn bulk_search_invalid_item_field_continues_with_per_item_error() { let input = kebab_mcp::tools::bulk_search::BulkSearchInput { queries: vec![ json!({"query": "kebab", "mode": "lexical"}), - json!({"query": "bread", "mode": "bogus"}), // invalid mode + json!({"query": "bread", "mode": "bogus"}), // invalid mode ], }; let result = kebab_mcp::tools::bulk_search::handle(handler.state(), input); @@ -117,5 +124,8 @@ async fn bulk_search_over_cap_returns_tool_error() { RawContent::Text(t) => &t.text, other => panic!("expected Text content, got {other:?}"), }; - assert!(text.contains("max 100"), "expected 'max 100' in error: {text}"); + assert!( + text.contains("max 100"), + "expected 'max 100' in error: {text}" + ); } diff --git a/crates/kebab-mcp/tests/tools_call_fetch.rs b/crates/kebab-mcp/tests/tools_call_fetch.rs index 2810218..fff3b72 100644 --- a/crates/kebab-mcp/tests/tools_call_fetch.rs +++ b/crates/kebab-mcp/tests/tools_call_fetch.rs @@ -15,10 +15,7 @@ use rmcp::model::RawContent; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); - cfg.storage.model_dir = data_dir - .join("models") - .to_string_lossy() - .into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); cfg.workspace.exclude.clear(); 
cfg.models.embedding.provider = "none".to_string(); diff --git a/crates/kebab-mcp/tests/tools_call_ingest_file.rs b/crates/kebab-mcp/tests/tools_call_ingest_file.rs index 43a9bd1..5afe75f 100644 --- a/crates/kebab-mcp/tests/tools_call_ingest_file.rs +++ b/crates/kebab-mcp/tests/tools_call_ingest_file.rs @@ -112,6 +112,14 @@ async fn ingest_file_tool_idempotent_on_second_call() { other => panic!("expected text, got {other:?}"), }; let v2: serde_json::Value = serde_json::from_str(text2).unwrap(); - assert_eq!(v2.get("new").and_then(serde_json::Value::as_u64), Some(0), "{v2:?}"); - assert_eq!(v2.get("unchanged").and_then(serde_json::Value::as_u64), Some(1), "{v2:?}"); + assert_eq!( + v2.get("new").and_then(serde_json::Value::as_u64), + Some(0), + "{v2:?}" + ); + assert_eq!( + v2.get("unchanged").and_then(serde_json::Value::as_u64), + Some(1), + "{v2:?}" + ); } diff --git a/crates/kebab-mcp/tests/tools_call_schema.rs b/crates/kebab-mcp/tests/tools_call_schema.rs index eccdf0f..135bbc3 100644 --- a/crates/kebab-mcp/tests/tools_call_schema.rs +++ b/crates/kebab-mcp/tests/tools_call_schema.rs @@ -10,10 +10,7 @@ use rmcp::model::RawContent; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); - cfg.storage.model_dir = data_dir - .join("models") - .to_string_lossy() - .into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); @@ -52,7 +49,10 @@ async fn schema_tool_returns_schema_v1_json() { "expected isError=false on healthy schema, got {result:?}" ); - let content = result.content.first().expect("expected at least one content item"); + let content = result + .content + .first() + .expect("expected at least one content item"); // Content = Annotated; 
deref to get the inner RawContent. let text = match &content.raw { @@ -67,7 +67,9 @@ async fn schema_tool_returns_schema_v1_json() { "unexpected schema_version in: {v}" ); assert_eq!( - v.get("capabilities").and_then(|c| c.get("mcp_server")).and_then(serde_json::Value::as_bool), + v.get("capabilities") + .and_then(|c| c.get("mcp_server")) + .and_then(serde_json::Value::as_bool), Some(true), "mcp_server capability flag should be true after fb-30", ); diff --git a/crates/kebab-mcp/tests/tools_call_search.rs b/crates/kebab-mcp/tests/tools_call_search.rs index 5152fce..8ca7b55 100644 --- a/crates/kebab-mcp/tests/tools_call_search.rs +++ b/crates/kebab-mcp/tests/tools_call_search.rs @@ -10,10 +10,7 @@ use rmcp::model::RawContent; fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config { let mut cfg = Config::defaults(); cfg.storage.data_dir = data_dir.to_string_lossy().into_owned(); - cfg.storage.model_dir = data_dir - .join("models") - .to_string_lossy() - .into_owned(); + cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned(); cfg.workspace.root = workspace_root.to_string_lossy().into_owned(); cfg.workspace.exclude.clear(); cfg.models.embedding.provider = "none".to_string(); @@ -99,15 +96,15 @@ async fn search_tool_returns_search_response_v1() { "expected at least one hit for 'kebab' in 'a.md'" ); assert_eq!( - hits[0] - .get("schema_version") - .and_then(|s| s.as_str()), + hits[0].get("schema_version").and_then(|s| s.as_str()), Some("search_hit.v1"), "first hit should carry schema_version=search_hit.v1" ); // truncated must be present (bool); next_cursor may be null on last page. 
assert!( - v.get("truncated").and_then(serde_json::Value::as_bool).is_some(), + v.get("truncated") + .and_then(serde_json::Value::as_bool) + .is_some(), "envelope should carry truncated:bool" ); assert!( diff --git a/crates/kebab-mcp/tests/tools_call_search_trace.rs b/crates/kebab-mcp/tests/tools_call_search_trace.rs index 1cb07cd..f1e5835 100644 --- a/crates/kebab-mcp/tests/tools_call_search_trace.rs +++ b/crates/kebab-mcp/tests/tools_call_search_trace.rs @@ -79,7 +79,10 @@ async fn search_with_trace_true_returns_trace_field() { let result = kebab_mcp::tools::search::handle(handler.state(), make_input(Some(true))); let v = extract_json(&result); assert_eq!(v["schema_version"], "search_response.v1"); - assert!(v["trace"].is_object(), "trace field present when trace:true"); + assert!( + v["trace"].is_object(), + "trace field present when trace:true" + ); assert!(v["trace"]["timing"]["total_ms"].is_number()); assert!(v["trace"]["lexical"].is_array()); assert!(v["trace"]["vector"].is_array()); diff --git a/crates/kebab-mcp/tests/tools_list.rs b/crates/kebab-mcp/tests/tools_list.rs index 5746bdf..9d4bcb1 100644 --- a/crates/kebab-mcp/tests/tools_list.rs +++ b/crates/kebab-mcp/tests/tools_list.rs @@ -7,7 +7,12 @@ use kebab_mcp::build_tools_vec; #[test] fn tools_list_returns_eight_tools() { let tools = build_tools_vec(); - assert_eq!(tools.len(), 8, "expected exactly 8 tools, got {}", tools.len()); + assert_eq!( + tools.len(), + 8, + "expected exactly 8 tools, got {}", + tools.len() + ); let names: Vec<&str> = tools.iter().map(|t| t.name.as_ref()).collect(); assert!(names.contains(&"schema"), "missing 'schema' tool"); @@ -15,7 +20,10 @@ fn tools_list_returns_eight_tools() { assert!(names.contains(&"search"), "missing 'search' tool"); assert!(names.contains(&"ask"), "missing 'ask' tool"); assert!(names.contains(&"ingest_file"), "missing 'ingest_file' tool"); - assert!(names.contains(&"ingest_stdin"), "missing 'ingest_stdin' tool"); + assert!( + 
names.contains(&"ingest_stdin"), + "missing 'ingest_stdin' tool" + ); assert!(names.contains(&"fetch"), "missing 'fetch' tool"); assert!(names.contains(&"bulk_search"), "missing 'bulk_search' tool"); } diff --git a/crates/kebab-nli/src/onnx.rs b/crates/kebab-nli/src/onnx.rs index 981b69e..d38e8bd 100644 --- a/crates/kebab-nli/src/onnx.rs +++ b/crates/kebab-nli/src/onnx.rs @@ -20,9 +20,7 @@ use std::sync::OnceLock; use anyhow::{Context, Result, anyhow}; use kebab_config::expand_path; use ort::session::Session; -use tokenizers::{ - Tokenizer, TruncationDirection, TruncationParams, TruncationStrategy, -}; +use tokenizers::{Tokenizer, TruncationDirection, TruncationParams, TruncationStrategy}; use crate::{NliScores, NliVerifier}; @@ -218,8 +216,12 @@ impl OnnxNliVerifier { fn load_tokenizer(&self) -> Result { let tokenizer_path = self.fetch(HF_TOKENIZER_FILE)?; - let mut tokenizer = Tokenizer::from_file(&tokenizer_path) - .map_err(|e| anyhow!("kebab-nli: Tokenizer::from_file({}) failed: {e}", tokenizer_path.display()))?; + let mut tokenizer = Tokenizer::from_file(&tokenizer_path).map_err(|e| { + anyhow!( + "kebab-nli: Tokenizer::from_file({}) failed: {e}", + tokenizer_path.display() + ) + })?; tokenizer .with_truncation(Some(TruncationParams { max_length: MAX_TOKENS, @@ -354,7 +356,9 @@ mod tests { fn score_empty_hypothesis_returns_err() { let (_tmp, cfg) = tempdir_config(); let v = OnnxNliVerifier::new(&cfg).unwrap(); - let err = v.score("anything", "").expect_err("empty hypothesis must error"); + let err = v + .score("anything", "") + .expect_err("empty hypothesis must error"); assert!( err.to_string().contains("empty hypothesis"), "unexpected error message: {err}" diff --git a/crates/kebab-nli/tests/inference.rs b/crates/kebab-nli/tests/inference.rs index a702a72..75e65de 100644 --- a/crates/kebab-nli/tests/inference.rs +++ b/crates/kebab-nli/tests/inference.rs @@ -152,7 +152,10 @@ fn score_long_en_hypothesis_returns_err_without_pipeline_truncation() { let premise = 
"short premise"; let hypothesis = "lorem ipsum ".repeat(500); // ~6 000 chars / >>512 tokens let result = v.score(premise, &hypothesis); - assert!(result.is_err(), "long hypothesis should err under OnlyFirst"); + assert!( + result.is_err(), + "long hypothesis should err under OnlyFirst" + ); let msg = result.err().unwrap().to_string(); assert!( msg.contains("Truncation error") || msg.contains("too short to respect"), diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs index 0ff2114..5a834a2 100644 --- a/crates/kebab-parse-code/src/c.rs +++ b/crates/kebab-parse-code/src/c.rs @@ -300,12 +300,7 @@ fn build_blocks( if units.is_empty() { // Completely empty file or whitespace/comments only. let total = lines.len() as u32; - units.push(( - "".to_string(), - 1, - total.max(1), - false, - )); + units.push(("".to_string(), 1, total.max(1), false)); } // If there is only glue (no real unit) the single pushed "" // label should be "" — rename it now. @@ -383,10 +378,7 @@ fn recover_typedef_alias(node: tree_sitter::Node, source: &str) -> Option( - decl: tree_sitter::Node, - source: &'a str, -) -> Option<&'a str> { +fn extract_typedef_alias_name<'a>(decl: tree_sitter::Node, source: &'a str) -> Option<&'a str> { if decl.kind() == "type_identifier" { return Some(&source[decl.start_byte()..decl.end_byte()]); } @@ -490,7 +482,10 @@ mod tests { let src = "int *find(int *arr, int n) { return arr; }\n"; let doc = tests_support::extract_c(src, "x/find.c"); let s = syms(&doc); - assert!(s.iter().any(|x| x == "find"), "ptr-return fn missing: {s:?}"); + assert!( + s.iter().any(|x| x == "find"), + "ptr-return fn missing: {s:?}" + ); } #[test] @@ -695,7 +690,10 @@ void print_result(int v) { let s = syms(&doc); // Two real functions + one glue block assert!(s.iter().any(|x| x == "compute"), "compute missing: {s:?}"); - assert!(s.iter().any(|x| x == "print_result"), "print_result missing: {s:?}"); + assert!( + s.iter().any(|x| x == "print_result"), + "print_result 
missing: {s:?}" + ); assert!( s.iter().any(|x| x == ""), " glue missing: {s:?}" @@ -711,10 +709,7 @@ void noop(void) {} "; let a = tests_support::extract_c(src, "x/det.c"); for _ in 0..20 { - assert_eq!( - tests_support::extract_c(src, "x/det.c").blocks, - a.blocks - ); + assert_eq!(tests_support::extract_c(src, "x/det.c").blocks, a.blocks); } } } diff --git a/crates/kebab-parse-code/src/cpp.rs b/crates/kebab-parse-code/src/cpp.rs index 3661df1..7a8f838 100644 --- a/crates/kebab-parse-code/src/cpp.rs +++ b/crates/kebab-parse-code/src/cpp.rs @@ -98,9 +98,8 @@ impl Extractor for CppAstExtractor { let parser_version = self.parser_version(); let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); - let source = String::from_utf8(bytes.to_vec()).map_err(|e| { - anyhow::anyhow!("kebab-parse-code: C++ source is not valid UTF-8: {e}") - })?; + let source = String::from_utf8(bytes.to_vec()) + .map_err(|e| anyhow::anyhow!("kebab-parse-code: C++ source is not valid UTF-8: {e}"))?; let blocks = build_blocks_top(&source, &doc_id)?; let unit_count = blocks.len() as u32; @@ -309,9 +308,7 @@ fn build_blocks( flush_glue(glue, units); let name_node = child.child_by_field_name("name"); - let body = child - .child_by_field_name("body") - .unwrap_or(child); + let body = child.child_by_field_name("body").unwrap_or(child); match name_node { None => { @@ -335,7 +332,8 @@ fn build_blocks( let mut new_prefix = prefix.to_vec(); let mut nc = nn.walk(); for seg in nn.named_children(&mut nc) { - new_prefix.push(source[seg.start_byte()..seg.end_byte()].to_string()); + new_prefix + .push(source[seg.start_byte()..seg.end_byte()].to_string()); } build_blocks(body, source, &new_prefix, units, glue); flush_glue(glue, units); @@ -528,11 +526,7 @@ fn unwrap_to_fn_declarator<'a>( } /// Given the innermost name node of a function_declarator, produce the symbol. 
-fn extract_name_node( - inner: tree_sitter::Node, - source: &str, - prefix: &[String], -) -> Option { +fn extract_name_node(inner: tree_sitter::Node, source: &str, prefix: &[String]) -> Option { match inner.kind() { "identifier" | "field_identifier" => { let name = &source[inner.start_byte()..inner.end_byte()]; @@ -652,7 +646,9 @@ pub(crate) mod tests_support { workspace_root: &root, config: &cfg, }; - CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap() + CppAstExtractor::new() + .extract(&ctx, src.as_bytes()) + .unwrap() } } @@ -710,10 +706,19 @@ namespace ns { let doc = tests_support::extract_cpp(src, "x/foo.cpp"); let s = syms(&doc); assert!(s.iter().any(|x| x == "ns::Foo"), "ns::Foo missing: {s:?}"); - assert!(s.iter().any(|x| x == "ns::Foo::method"), "method missing: {s:?}"); + assert!( + s.iter().any(|x| x == "ns::Foo::method"), + "method missing: {s:?}" + ); assert!(s.iter().any(|x| x == "ns::Foo::Foo"), "ctor missing: {s:?}"); - assert!(s.iter().any(|x| x == "ns::Foo::~Foo"), "dtor missing: {s:?}"); - assert!(s.iter().any(|x| x == "ns::Foo::operator+"), "op+ missing: {s:?}"); + assert!( + s.iter().any(|x| x == "ns::Foo::~Foo"), + "dtor missing: {s:?}" + ); + assert!( + s.iter().any(|x| x == "ns::Foo::operator+"), + "op+ missing: {s:?}" + ); } #[test] @@ -794,7 +799,10 @@ concept Printable = requires(T t) { t.print(); }; let doc = tests_support::extract_cpp(src, "x/foo.cpp"); let s = syms(&doc); assert!(s.iter().any(|x| x == "Color"), "Color missing: {s:?}"); - assert!(s.iter().any(|x| x == "Printable"), "Printable missing: {s:?}"); + assert!( + s.iter().any(|x| x == "Printable"), + "Printable missing: {s:?}" + ); } #[test] @@ -839,7 +847,10 @@ class Foo { let src = "#include \nusing namespace std;\n"; let doc = tests_support::extract_cpp(src, "x/glue.cpp"); let s = syms(&doc); - assert!(s.iter().any(|x| x == ""), "expected : got {s:?}"); + assert!( + s.iter().any(|x| x == ""), + "expected : got {s:?}" + ); } #[test] @@ -877,7 +888,10 @@ void 
free_fn() {} "; let a = tests_support::extract_cpp(src, "x/foo.cpp"); for _ in 0..20 { - assert_eq!(tests_support::extract_cpp(src, "x/foo.cpp").blocks, a.blocks); + assert_eq!( + tests_support::extract_cpp(src, "x/foo.cpp").blocks, + a.blocks + ); } } } diff --git a/crates/kebab-parse-code/src/go.rs b/crates/kebab-parse-code/src/go.rs index 76a9d87..f6ac8cf 100644 --- a/crates/kebab-parse-code/src/go.rs +++ b/crates/kebab-parse-code/src/go.rs @@ -363,7 +363,11 @@ fn flush_glue( // imports (1A's `only_mod_decls` analog). The post-pass demotes any // `` to `` if the file produced any real unit. let only_imports = glue.iter().all(|(is_import, _, _)| *is_import == 1); - let label = if only_imports { "" } else { "" }; + let label = if only_imports { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, &[], label), s, e, false)); glue.clear(); } @@ -429,8 +433,7 @@ mod tests { "got {syms:?}" ); assert!( - syms.iter() - .any(|s| s == "chunk.(MdHeadingV1Chunker).Name2"), + syms.iter().any(|s| s == "chunk.(MdHeadingV1Chunker).Name2"), "got {syms:?}" ); assert!(syms.iter().any(|s| s == "chunk.Stringer"), "got {syms:?}"); diff --git a/crates/kebab-parse-code/src/java.rs b/crates/kebab-parse-code/src/java.rs index 9cf6604..be28326 100644 --- a/crates/kebab-parse-code/src/java.rs +++ b/crates/kebab-parse-code/src/java.rs @@ -83,8 +83,9 @@ impl Extractor for JavaAstExtractor { let parser_version = self.parser_version(); let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); - let source = String::from_utf8(bytes.to_vec()) - .map_err(|e| anyhow::anyhow!("kebab-parse-code: Java source is not valid UTF-8: {e}"))?; + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: Java source is not valid UTF-8: {e}") + })?; let blocks = build_blocks(&source, &doc_id)?; let unit_count = blocks.len() as u32; @@ -302,9 +303,7 @@ fn walk_top( let s = unit_start(&child); let e = child.end_position().row as u32 + 1; 
match child.kind() { - "class_declaration" - | "interface_declaration" - | "record_declaration" => { + "class_declaration" | "interface_declaration" | "record_declaration" => { if let Some(name) = node_name_text(&child, src) { glue.retain(|(_, gs, _)| *gs < s); flush_glue(glue, units, mod_prefix, mod_path); @@ -426,7 +425,11 @@ fn flush_glue( // imports (1A's `only_mod_decls` analog). The post-pass demotes any // `` to `` if the file produced any real unit. let only_imports = glue.iter().all(|(is_import, _, _)| *is_import == 1); - let label = if only_imports { "" } else { "" }; + let label = if only_imports { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); glue.clear(); } @@ -482,7 +485,8 @@ mod tests { syms.sort(); // package extracted from source = com.kebab.chunk assert!( - syms.iter().any(|s| s == "com.kebab.chunk.MdHeadingV1Chunker"), + syms.iter() + .any(|s| s == "com.kebab.chunk.MdHeadingV1Chunker"), "got {syms:?}" ); // constructor — Java convention is class-name-as-method-name diff --git a/crates/kebab-parse-code/src/javascript.rs b/crates/kebab-parse-code/src/javascript.rs index 88301e7..321cdc8 100644 --- a/crates/kebab-parse-code/src/javascript.rs +++ b/crates/kebab-parse-code/src/javascript.rs @@ -293,7 +293,8 @@ fn build_blocks( let inner_kind = inner.kind(); match inner_kind { "function_declaration" | "class_declaration" => { - let name_opt = name_text(&inner, src).map(std::string::ToString::to_string); + let name_opt = + name_text(&inner, src).map(std::string::ToString::to_string); if let Some(name) = name_opt { glue.retain(|(_, gs, _)| *gs < outer_s); flush_glue(glue, units, mod_prefix, mod_path); @@ -332,9 +333,9 @@ fn build_blocks( | "function_declaration" | "class" | "class_declaration" => { - let name_opt = name_text(&value, src).map(std::string::ToString::to_string); - let leaf = - name_opt.as_deref().unwrap_or("default").to_string(); + let name_opt = + name_text(&value, 
src).map(std::string::ToString::to_string); + let leaf = name_opt.as_deref().unwrap_or("default").to_string(); glue.retain(|(_, gs, _)| *gs < outer_s); flush_glue(glue, units, mod_prefix, mod_path); let sym = join_symbol(mod_prefix, mod_path, &leaf); @@ -383,7 +384,11 @@ fn build_blocks( let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); let only_module = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); - let label = if only_module { "" } else { "" }; + let label = if only_module { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); glue.clear(); } @@ -442,9 +447,10 @@ mod tests { use kebab_core::{Block, MediaType, SourceSpan}; fn extract_fixture(workspace_path: &str) -> kebab_core::CanonicalDocument { - let bytes = std::fs::read( - concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.js"), - ) + let bytes = std::fs::read(concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/fixtures/sample.js" + )) .unwrap(); let asset = crate::rust::tests_support::fixed_code_asset(workspace_path, "javascript"); let cfg = kebab_core::ExtractConfig::default(); diff --git a/crates/kebab-parse-code/src/kotlin.rs b/crates/kebab-parse-code/src/kotlin.rs index 5db947c..1e95632 100644 --- a/crates/kebab-parse-code/src/kotlin.rs +++ b/crates/kebab-parse-code/src/kotlin.rs @@ -503,7 +503,11 @@ fn flush_glue( // imports. The post-pass demotes any `` to `` if // the file produced any real unit. 
let only_imports = glue.iter().all(|(is_import, _, _)| *is_import == 1); - let label = if only_imports { "" } else { "" }; + let label = if only_imports { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); glue.clear(); } diff --git a/crates/kebab-parse-code/src/lang.rs b/crates/kebab-parse-code/src/lang.rs index 4590e95..e57f83e 100644 --- a/crates/kebab-parse-code/src/lang.rs +++ b/crates/kebab-parse-code/src/lang.rs @@ -57,16 +57,25 @@ mod tests { #[test] fn module_path_for_python_strips_src_roots_and_extensions() { - assert_eq!(module_path_for_python("kebab_eval/metrics.py"), "kebab_eval.metrics"); - assert_eq!(module_path_for_python("kebab_eval/__init__.py"), "kebab_eval"); - assert_eq!(module_path_for_python("src/foo/bar.py"), "foo.bar"); - assert_eq!(module_path_for_python("crates/x/src/foo/bar.py"), "foo.bar"); - assert_eq!(module_path_for_python("a/b/c.pyi"), "a.b.c"); - assert_eq!(module_path_for_python("standalone.py"), "standalone"); - assert_eq!(module_path_for_python("src/__init__.py"), ""); + assert_eq!( + module_path_for_python("kebab_eval/metrics.py"), + "kebab_eval.metrics" + ); + assert_eq!( + module_path_for_python("kebab_eval/__init__.py"), + "kebab_eval" + ); + assert_eq!(module_path_for_python("src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("crates/x/src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("a/b/c.pyi"), "a.b.c"); + assert_eq!(module_path_for_python("standalone.py"), "standalone"); + assert_eq!(module_path_for_python("src/__init__.py"), ""); // `tests/` is NOT a stripped source-root — it is preserved as // part of the module path so test symbols stay namespaced. 
- assert_eq!(module_path_for_python("tests/test_foo.py"), "tests.test_foo"); + assert_eq!( + module_path_for_python("tests/test_foo.py"), + "tests.test_foo" + ); } #[test] @@ -75,8 +84,11 @@ mod tests { let p = format!("src/search/retriever/Retriever.{ext}"); assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever"); } - assert_eq!(module_path_for_tsjs("foo.ts"), "foo"); - assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c"); - assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo"); + assert_eq!(module_path_for_tsjs("foo.ts"), "foo"); + assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c"); + assert_eq!( + module_path_for_tsjs("packages/x/src/Foo.ts"), + "packages/x/src/Foo" + ); } } diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index 3126801..b487450 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -19,12 +19,12 @@ pub mod rust; pub(crate) mod scaffold; pub mod typescript; -pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor}; -pub use cpp::{PARSER_VERSION as CPP_PARSER_VERSION, CppAstExtractor}; -pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor}; -pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor}; -pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; -pub use kotlin::{PARSER_VERSION as KOTLIN_PARSER_VERSION, KotlinAstExtractor}; +pub use c::{CAstExtractor, PARSER_VERSION as C_PARSER_VERSION}; +pub use cpp::{CppAstExtractor, PARSER_VERSION as CPP_PARSER_VERSION}; +pub use go::{GoAstExtractor, PARSER_VERSION as GO_PARSER_VERSION}; +pub use java::{JavaAstExtractor, PARSER_VERSION as JAVA_PARSER_VERSION}; +pub use javascript::{JavascriptAstExtractor, PARSER_VERSION as JS_PARSER_VERSION}; +pub use kotlin::{KotlinAstExtractor, PARSER_VERSION as KOTLIN_PARSER_VERSION}; pub use lang::{module_path_for_python, module_path_for_tsjs}; pub use python::{PARSER_VERSION as 
PYTHON_PARSER_VERSION, PythonAstExtractor}; pub use repo::{RepoMeta, detect_repo}; diff --git a/crates/kebab-parse-code/src/python.rs b/crates/kebab-parse-code/src/python.rs index 4959901..1c7956a 100644 --- a/crates/kebab-parse-code/src/python.rs +++ b/crates/kebab-parse-code/src/python.rs @@ -319,12 +319,23 @@ fn build_blocks( // demotes any `` to `` if the file produced // any real unit. let only_imports = glue.iter().all(|(is_import, _, _)| *is_import == 1); - let label = if only_imports { "" } else { "" }; + let label = if only_imports { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); glue.clear(); } - walk(tree.root_node(), source, mod_prefix, &[], &mut units, &mut glue); + walk( + tree.root_node(), + source, + mod_prefix, + &[], + &mut units, + &mut glue, + ); // `` is correct only when the file produced no real unit. // Otherwise the import-only group becomes `` (same @@ -373,17 +384,18 @@ mod tests { use kebab_core::{Block, MediaType, SourceSpan}; fn extract_fixture() -> kebab_core::CanonicalDocument { - let bytes = std::fs::read( - concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.py"), - ) + let bytes = std::fs::read(concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/fixtures/sample.py" + )) .unwrap(); - let asset = crate::rust::tests_support::fixed_code_asset( - "kebab_eval/metrics.py", "python", - ); + let asset = crate::rust::tests_support::fixed_code_asset("kebab_eval/metrics.py", "python"); let cfg = kebab_core::ExtractConfig::default(); let root = std::path::PathBuf::from("/tmp"); let ctx = kebab_core::ExtractContext { - asset: &asset, workspace_root: &root, config: &cfg, + asset: &asset, + workspace_root: &root, + config: &cfg, }; PythonAstExtractor::new().extract(&ctx, &bytes).unwrap() } @@ -399,16 +411,20 @@ mod tests { #[test] fn python_units_carry_module_prefixed_symbols() { let doc = extract_fixture(); - let mut syms: Vec = doc.blocks.iter().map(|b| match b { - Block::Code(c) => match 
&c.common.source_span { - SourceSpan::Code { symbol, lang, .. } => { - assert_eq!(lang.as_deref(), Some("python")); - symbol.clone().unwrap() - } - _ => panic!("expected SourceSpan::Code"), - }, - other => panic!("expected Block::Code, got {other:?}"), - }).collect(); + let mut syms: Vec = doc + .blocks + .iter() + .map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!(lang.as_deref(), Some("python")); + symbol.clone().unwrap() + } + _ => panic!("expected SourceSpan::Code"), + }, + other => panic!("expected Block::Code, got {other:?}"), + }) + .collect(); syms.sort(); assert!(syms.iter().any(|s| s == "kebab_eval.metrics.free")); assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo")); @@ -416,8 +432,14 @@ mod tests { assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo.name")); assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer")); assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner")); - assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner.helper")); - assert!(syms.iter().any(|s| s == "kebab_eval.metrics.with_decorator")); + assert!( + syms.iter() + .any(|s| s == "kebab_eval.metrics.Outer.Inner.helper") + ); + assert!( + syms.iter() + .any(|s| s == "kebab_eval.metrics.with_decorator") + ); assert!(syms.iter().any(|s| s == "kebab_eval.metrics.")); // The `@no_type_check` decorator on `free` is folded into its // unit's line range (decorated_definition unwrap). 
@@ -426,12 +448,17 @@ mod tests { SourceSpan::Code{symbol,..} if symbol.as_deref()==Some("kebab_eval.metrics.free")) => Some(c.code.clone()), _ => None, }).unwrap(); - assert!(free_src.contains("@no_type_check"), "decorator folded in: {free_src}"); + assert!( + free_src.contains("@no_type_check"), + "decorator folded in: {free_src}" + ); } #[test] fn deterministic_across_runs() { let a = extract_fixture(); - for _ in 0..50 { assert_eq!(extract_fixture().blocks, a.blocks); } + for _ in 0..50 { + assert_eq!(extract_fixture().blocks, a.blocks); + } } } diff --git a/crates/kebab-parse-code/src/repo.rs b/crates/kebab-parse-code/src/repo.rs index 6798fbe..0f4110b 100644 --- a/crates/kebab-parse-code/src/repo.rs +++ b/crates/kebab-parse-code/src/repo.rs @@ -32,10 +32,18 @@ pub fn detect_repo(path: &Path) -> Option { if dotgit.is_dir() { let name = cur.file_name()?.to_string_lossy().into_owned(); let (branch, commit) = read_head(cur); - return Some(RepoMeta { name, branch, commit }); + return Some(RepoMeta { + name, + branch, + commit, + }); } else if dotgit.is_file() { let name = cur.file_name()?.to_string_lossy().into_owned(); - return Some(RepoMeta { name, branch: None, commit: None }); + return Some(RepoMeta { + name, + branch: None, + commit: None, + }); } cur = cur.parent()?; } @@ -50,10 +58,7 @@ fn read_head(repo_dir: &Path) -> (Option, Option) { .flatten() .map(|n| n.shorten().to_string()) .or_else(|| Some("detached".to_string())); - let commit = repo - .head_id() - .ok() - .map(|id| id.to_string()); + let commit = repo.head_id().ok().map(|id| id.to_string()); (branch, commit) } Err(_) => (None, None), diff --git a/crates/kebab-parse-code/src/rust.rs b/crates/kebab-parse-code/src/rust.rs index 29b36c7..d1302a0 100644 --- a/crates/kebab-parse-code/src/rust.rs +++ b/crates/kebab-parse-code/src/rust.rs @@ -224,8 +224,8 @@ fn build_blocks( let s = unit_start(&child); let e = child.end_position().row as u32 + 1; match child.kind() { - "function_item" | "struct_item" | 
"enum_item" | "union_item" - | "trait_item" | "type_item" => { + "function_item" | "struct_item" | "enum_item" | "union_item" | "trait_item" + | "type_item" => { if let Some(name) = node_name(&child, src) { // Gap 2: a leading attribute/comment that this unit // re-absorbs (via `unit_start`'s upward extension to @@ -296,8 +296,12 @@ fn build_blocks( glue.push((1, s, e)); } } - "use_declaration" | "extern_crate_declaration" | "const_item" - | "static_item" | "attribute_item" | "macro_invocation" => { + "use_declaration" + | "extern_crate_declaration" + | "const_item" + | "static_item" + | "attribute_item" + | "macro_invocation" => { glue.push((0, s, e)); } _ => {} @@ -320,7 +324,11 @@ fn build_blocks( // requires the *whole file* to have produced zero real units; that // demotion to `` happens in the post-pass below. let only_mod_decls = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); - let label = if only_mod_decls { "" } else { "" }; + let label = if only_mod_decls { + "" + } else { + "" + }; // Module-path-prefix the label so glue from `mod inner` carries // module context (`inner::`) and doesn't collide with // file-top-level glue. 
`prefix` is empty at file top level, so the @@ -379,14 +387,19 @@ mod tests { use kebab_core::{Block, MediaType, SourceSpan}; fn extract_fixture() -> kebab_core::CanonicalDocument { - let bytes = std::fs::read( - concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.rs"), - ) + let bytes = std::fs::read(concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/fixtures/sample.rs" + )) .unwrap(); let asset = tests_support::fixed_code_asset("crates/x/src/sample.rs", "rust"); let cfg = kebab_core::ExtractConfig::default(); let root = std::path::PathBuf::from("/tmp"); - let ctx = kebab_core::ExtractContext { asset: &asset, workspace_root: &root, config: &cfg }; + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; RustAstExtractor::new().extract(&ctx, &bytes).unwrap() } @@ -406,7 +419,12 @@ mod tests { .iter() .map(|b| match b { Block::Code(c) => match &c.common.source_span { - SourceSpan::Code { symbol, line_start, line_end, lang } => { + SourceSpan::Code { + symbol, + line_start, + line_end, + lang, + } => { assert_eq!(lang.as_deref(), Some("rust")); (symbol.clone().unwrap(), *line_start, *line_end) } @@ -428,7 +446,10 @@ mod tests { Block::Code(c) if matches!(&c.common.source_span, SourceSpan::Code{symbol,..} if symbol.as_deref()==Some("parse")) => Some(c.code.clone()), _ => None, }).unwrap(); - assert!(parse_src.contains("/// Doc comment on a free fn."), "doc comment folded in: {parse_src}"); + assert!( + parse_src.contains("/// Doc comment on a free fn."), + "doc comment folded in: {parse_src}" + ); } /// Run the extractor on an in-memory Rust source string (no fixture @@ -437,7 +458,11 @@ mod tests { let asset = tests_support::fixed_code_asset("crates/x/src/inline.rs", "rust"); let cfg = kebab_core::ExtractConfig::default(); let root = std::path::PathBuf::from("/tmp"); - let ctx = kebab_core::ExtractContext { asset: &asset, workspace_root: &root, config: &cfg }; + let ctx = kebab_core::ExtractContext { + asset: 
&asset, + workspace_root: &root, + config: &cfg, + }; let doc = RustAstExtractor::new() .extract(&ctx, source.as_bytes()) .unwrap(); @@ -445,9 +470,7 @@ mod tests { .iter() .map(|b| match b { Block::Code(c) => match &c.common.source_span { - SourceSpan::Code { symbol, .. } => { - (symbol.clone().unwrap(), c.code.clone()) - } + SourceSpan::Code { symbol, .. } => (symbol.clone().unwrap(), c.code.clone()), _ => panic!("code block must carry SourceSpan::Code"), }, other => panic!("expected Block::Code, got {other:?}"), diff --git a/crates/kebab-parse-code/src/typescript.rs b/crates/kebab-parse-code/src/typescript.rs index 706eb1d..de07866 100644 --- a/crates/kebab-parse-code/src/typescript.rs +++ b/crates/kebab-parse-code/src/typescript.rs @@ -299,9 +299,7 @@ fn build_blocks( } } } - "interface_declaration" - | "type_alias_declaration" - | "enum_declaration" => { + "interface_declaration" | "type_alias_declaration" | "enum_declaration" => { if let Some(name) = name_text(&child, src) { glue.retain(|(_, gs, _)| *gs < s); flush_glue(glue, units, mod_prefix, mod_path); @@ -326,22 +324,18 @@ fn build_blocks( | "interface_declaration" | "type_alias_declaration" | "enum_declaration" => { - let name_opt = name_text(&inner, src).map(std::string::ToString::to_string); + let name_opt = + name_text(&inner, src).map(std::string::ToString::to_string); if let Some(name) = name_opt { glue.retain(|(_, gs, _)| *gs < outer_s); flush_glue(glue, units, mod_prefix, mod_path); - let sym = - join_symbol(mod_prefix, mod_path, &name); + let sym = join_symbol(mod_prefix, mod_path, &name); units.push((sym, outer_s, outer_e, true)); if inner_kind == "class_declaration" { - if let Some(body) = - inner.child_by_field_name("body") - { + if let Some(body) = inner.child_by_field_name("body") { let mut np = mod_path.to_vec(); np.push(name); - walk_class_body( - body, src, mod_prefix, &np, units, - ); + walk_class_body(body, src, mod_prefix, &np, units); } } } else { @@ -354,8 +348,7 @@ fn build_blocks( 
// `default` defensively. glue.retain(|(_, gs, _)| *gs < outer_s); flush_glue(glue, units, mod_prefix, mod_path); - let sym = - join_symbol(mod_prefix, mod_path, "default"); + let sym = join_symbol(mod_prefix, mod_path, "default"); units.push((sym, outer_s, outer_e, true)); } } @@ -377,27 +370,17 @@ fn build_blocks( | "class_declaration" => { let name_opt = name_text(&value, src).map(std::string::ToString::to_string); - let leaf = name_opt - .as_deref() - .unwrap_or("default") - .to_string(); + let leaf = name_opt.as_deref().unwrap_or("default").to_string(); glue.retain(|(_, gs, _)| *gs < outer_s); flush_glue(glue, units, mod_prefix, mod_path); let sym = join_symbol(mod_prefix, mod_path, &leaf); units.push((sym, outer_s, outer_e, true)); // Recurse into class body if we have one. - if matches!( - value.kind(), - "class" | "class_declaration" - ) { - if let Some(body) = - value.child_by_field_name("body") - { + if matches!(value.kind(), "class" | "class_declaration") { + if let Some(body) = value.child_by_field_name("body") { let mut np = mod_path.to_vec(); np.push(leaf); - walk_class_body( - body, src, mod_prefix, &np, units, - ); + walk_class_body(body, src, mod_prefix, &np, units); } } } @@ -442,7 +425,11 @@ fn build_blocks( let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); let only_module = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); - let label = if only_module { "" } else { "" }; + let label = if only_module { + "" + } else { + "" + }; units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); glue.clear(); } @@ -514,9 +501,7 @@ mod tests { workspace_root: &root, config: &cfg, }; - TypescriptAstExtractor::new() - .extract(&ctx, &bytes) - .unwrap() + TypescriptAstExtractor::new().extract(&ctx, &bytes).unwrap() } fn symbols(doc: &kebab_core::CanonicalDocument) -> Vec { @@ -565,10 +550,7 @@ mod tests { fn tsx_uses_tsx_grammar_and_emits_units() { let doc = extract_fixture("sample.tsx", 
"src/sample.tsx"); let syms = symbols(&doc); - assert!( - syms.iter().any(|s| s == "src/sample.Hello"), - "got {syms:?}" - ); + assert!(syms.iter().any(|s| s == "src/sample.Hello"), "got {syms:?}"); assert!( syms.iter().any(|s| s == "src/sample."), "arrow fn + import should roll into top-level glue" @@ -579,7 +561,10 @@ mod tests { fn deterministic_across_runs() { let a = extract_fixture("sample.ts", "src/sample.ts"); for _ in 0..30 { - assert_eq!(extract_fixture("sample.ts", "src/sample.ts").blocks, a.blocks); + assert_eq!( + extract_fixture("sample.ts", "src/sample.ts").blocks, + a.blocks + ); } } diff --git a/crates/kebab-parse-image/examples/gen_smoke_png.rs b/crates/kebab-parse-image/examples/gen_smoke_png.rs index 1e14d42..f5af8d2 100644 --- a/crates/kebab-parse-image/examples/gen_smoke_png.rs +++ b/crates/kebab-parse-image/examples/gen_smoke_png.rs @@ -10,8 +10,7 @@ fn main() { let out = std::env::args() .nth(1) .expect("usage: gen_smoke_png "); - let img: ImageBuffer, _> = - ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); + let img: ImageBuffer, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0])); let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, image::ImageFormat::Png) .expect("encode PNG"); diff --git a/crates/kebab-parse-image/src/image_prep.rs b/crates/kebab-parse-image/src/image_prep.rs index e26703d..bbf280a 100644 --- a/crates/kebab-parse-image/src/image_prep.rs +++ b/crates/kebab-parse-image/src/image_prep.rs @@ -26,10 +26,7 @@ use image::{ImageFormat, ImageReader}; /// once: a cheap header sniff peeks at the format / dimensions before /// committing to a decode, so non-PNG passthrough and downscale share /// the same `decode → optionally resize → re-encode` tail. 
-pub(crate) fn downscale_to_png( - bytes: &[u8], - max_long_edge: u32, -) -> Result<(Vec, u32, u32)> { +pub(crate) fn downscale_to_png(bytes: &[u8], max_long_edge: u32) -> Result<(Vec, u32, u32)> { let reader = ImageReader::new(Cursor::new(bytes)) .with_guessed_format() .context("reading image header")?; @@ -73,8 +70,7 @@ pub(crate) fn downscale_to_png( } else { new_h = new_h.min(max_long_edge); } - let resized = - img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); + let resized = img.resize_exact(new_w, new_h, image::imageops::FilterType::Triangle); (new_w, new_h, resized) }; @@ -97,8 +93,7 @@ mod tests { /// compresses aggressively so even 4001×3001 stays under a few /// kilobytes. fn solid_png(w: u32, h: u32) -> Vec { - let img: ImageBuffer, _> = - ImageBuffer::from_pixel(w, h, Rgb([0, 0, 255])); + let img: ImageBuffer, _> = ImageBuffer::from_pixel(w, h, Rgb([0, 0, 255])); let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, ImageFormat::Png) .expect("encoding solid PNG must not fail"); @@ -106,8 +101,7 @@ mod tests { } fn solid_jpeg(w: u32, h: u32) -> Vec { - let img: ImageBuffer, _> = - ImageBuffer::from_pixel(w, h, Rgb([255, 255, 255])); + let img: ImageBuffer, _> = ImageBuffer::from_pixel(w, h, Rgb([255, 255, 255])); let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, ImageFormat::Jpeg) .expect("encoding solid JPEG must not fail"); @@ -119,10 +113,12 @@ mod tests { #[test] fn png_within_cap_passes_through_zero_decode() { let bytes = solid_png(100, 50); - let (out, w, h) = - downscale_to_png(&bytes, 1024).expect("PNG passthrough must succeed"); + let (out, w, h) = downscale_to_png(&bytes, 1024).expect("PNG passthrough must succeed"); assert_eq!((w, h), (100, 50)); - assert_eq!(out, bytes, "PNG passthrough must return source bytes verbatim"); + assert_eq!( + out, bytes, + "PNG passthrough must return source bytes verbatim" + ); } /// JPEG within budget gets re-encoded as PNG (the wire format) @@ -130,8 +126,7 @@ mod tests 
{ #[test] fn jpeg_within_cap_reencodes_as_png() { let bytes = solid_jpeg(100, 50); - let (out, w, h) = - downscale_to_png(&bytes, 1024).expect("JPEG re-encode must succeed"); + let (out, w, h) = downscale_to_png(&bytes, 1024).expect("JPEG re-encode must succeed"); assert_eq!((w, h), (100, 50)); // Byte stream must now start with the PNG magic. assert_eq!( @@ -147,8 +142,7 @@ mod tests { #[test] fn long_edge_clamped_strictly_to_max_for_irrational_scale() { let bytes = solid_png(4001, 3001); - let (_out, w, h) = - downscale_to_png(&bytes, 1601).expect("downscale must succeed"); + let (_out, w, h) = downscale_to_png(&bytes, 1601).expect("downscale must succeed"); let long = w.max(h); assert!(long <= 1601, "long edge must be ≤ max, got {long}"); } @@ -157,8 +151,7 @@ mod tests { #[test] fn aspect_ratio_preserved_within_rounding() { let bytes = solid_png(4000, 3000); - let (_out, w, h) = - downscale_to_png(&bytes, 1024).expect("downscale must succeed"); + let (_out, w, h) = downscale_to_png(&bytes, 1024).expect("downscale must succeed"); let ratio = w as f32 / h as f32; assert!( (ratio - 4.0 / 3.0).abs() < 0.02, diff --git a/crates/kebab-parse-image/src/lib.rs b/crates/kebab-parse-image/src/lib.rs index a8d1be5..ff5dac2 100644 --- a/crates/kebab-parse-image/src/lib.rs +++ b/crates/kebab-parse-image/src/lib.rs @@ -25,10 +25,10 @@ //! ModelCaption stubs), §9.1 (image extraction policy / OCR vs caption //! provenance), §9 (versioning). 
+pub mod caption; mod dims; mod exif_extract; mod image_prep; -pub mod caption; pub mod ocr; pub use caption::{apply_caption, caption_image}; @@ -37,8 +37,8 @@ pub use ocr::{OcrEngine, OllamaVisionOcr, apply_ocr}; use anyhow::{Context, Result}; use kebab_core::{ Block, CanonicalDocument, CommonBlock, Extractor, ImageRefBlock, Lang, MediaType, Metadata, - ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, - TrustLevel, id_for_block, id_for_doc, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, }; use serde_json::{Map, Value}; use time::OffsetDateTime; @@ -102,7 +102,11 @@ impl Extractor for ImageExtractor { let exif_map = exif_extract::extract_whitelisted(bytes); let (span, dims_value, dim_warning) = match &dim_outcome { - dims::DimOutcome::Ok { width, height, format } => { + dims::DimOutcome::Ok { + width, + height, + format, + } => { let mut dims = Map::new(); dims.insert("w".into(), Value::Number((*width).into())); dims.insert("h".into(), Value::Number((*height).into())); diff --git a/crates/kebab-parse-image/src/ocr.rs b/crates/kebab-parse-image/src/ocr.rs index e764a17..3143b42 100644 --- a/crates/kebab-parse-image/src/ocr.rs +++ b/crates/kebab-parse-image/src/ocr.rs @@ -64,11 +64,7 @@ pub trait OcrEngine: Send + Sync { /// Run OCR on `image_bytes`. `lang_hint` (BCP-47) can be passed /// through to engines that benefit from it (Tesseract languages, /// LLM prompt steering); ignore otherwise. 
- fn recognize( - &self, - image_bytes: &[u8], - lang_hint: Option<&Lang>, - ) -> Result; + fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result; } /// Mutate `block.ocr` in place by running `engine` over `image_bytes`, @@ -244,11 +240,7 @@ impl OcrEngine for OllamaVisionOcr { format!("ollama/{}", self.model) } - fn recognize( - &self, - image_bytes: &[u8], - lang_hint: Option<&Lang>, - ) -> Result { + fn recognize(&self, image_bytes: &[u8], lang_hint: Option<&Lang>) -> Result { let (prepared, w, h) = image_prep::downscale_to_png(image_bytes, self.max_pixels) .context("preparing image for OCR")?; let b64 = BASE64_STANDARD.encode(&prepared); @@ -280,9 +272,8 @@ impl OcrEngine for OllamaVisionOcr { truncate(&body_text, 512) ); } - let parsed: OllamaGenerateResponse = resp - .json() - .context("parsing Ollama OCR response as JSON")?; + let parsed: OllamaGenerateResponse = + resp.json().context("parsing Ollama OCR response as JSON")?; if let Some(err) = parsed.error { anyhow::bail!("OllamaVisionOcr: server error — {}", truncate(&err, 512)); } @@ -326,7 +317,10 @@ fn truncate(s: &str, n: usize) -> String { return s.to_string(); } let mut out: String = s.chars().take(n).collect(); - out.push_str(&format!("... (truncated, original {} chars)", s.chars().count())); + out.push_str(&format!( + "... (truncated, original {} chars)", + s.chars().count() + )); out } @@ -395,14 +389,8 @@ mod tests { #[test] fn build_prompt_omits_hint_when_lang_und() { - let engine = OllamaVisionOcr::from_parts( - "http://x", - "m", - vec!["eng".into()], - 1024, - 300, - ) - .unwrap(); + let engine = + OllamaVisionOcr::from_parts("http://x", "m", vec!["eng".into()], 1024, 300).unwrap(); let p = engine.build_prompt(Some(&Lang("und".into()))); assert!(!p.contains("hint:")); } @@ -439,11 +427,9 @@ mod tests { /// tested implicitly (no panic, no error). 
#[test] fn build_clamps_max_pixels_outside_legal_range() { - let too_small = - OllamaVisionOcr::from_parts("http://x", "m", vec![], 1, 300).unwrap(); + let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 1, 300).unwrap(); assert_eq!(too_small.max_pixels(), MIN_LONG_EDGE); - let too_big = - OllamaVisionOcr::from_parts("http://x", "m", vec![], u32::MAX, 300).unwrap(); + let too_big = OllamaVisionOcr::from_parts("http://x", "m", vec![], u32::MAX, 300).unwrap(); assert_eq!(too_big.max_pixels(), MAX_LONG_EDGE); } } diff --git a/crates/kebab-parse-image/tests/caption.rs b/crates/kebab-parse-image/tests/caption.rs index 583f95d..63b7de3 100644 --- a/crates/kebab-parse-image/tests/caption.rs +++ b/crates/kebab-parse-image/tests/caption.rs @@ -13,8 +13,7 @@ use std::sync::{Arc, Mutex}; use kebab_config::Config; use kebab_core::{ AssetId, BlockId, CommonBlock, FinishReason, GenerateRequest, ImageRefBlock, Lang, - LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk, - TokenUsage, + LanguageModel, ModelRef, ProvenanceEvent, ProvenanceKind, SourceSpan, TokenChunk, TokenUsage, }; use kebab_llm::MockLanguageModel; use kebab_parse_image::{apply_caption, caption_image}; @@ -127,7 +126,10 @@ fn apply_caption_sets_block_caption_and_appends_provenance() { assert_eq!(events[0].kind, ProvenanceKind::CaptionApplied); assert_eq!(events[0].agent, "kb-parse-image"); let note = events[0].note.as_deref().unwrap_or(""); - assert!(note.contains("vision-mock:1b") && note.contains("caption-v1"), "{note}"); + assert!( + note.contains("vision-mock:1b") && note.contains("caption-v1"), + "{note}" + ); } // ── Empty token stream → empty caption text ────────────────────────────── @@ -206,10 +208,7 @@ fn caption_image_routes_image_into_request_images_field() { let decoded = base64::engine::general_purpose::STANDARD .decode(&imgs[0]) .expect("base64 decodes"); - assert!( - !decoded.is_empty(), - "decoded image bytes must be non-empty" - ); + 
assert!(!decoded.is_empty(), "decoded image bytes must be non-empty"); let sys = captured_system.lock().unwrap().clone().unwrap(); assert!( diff --git a/crates/kebab-parse-image/tests/common/mod.rs b/crates/kebab-parse-image/tests/common/mod.rs index 1cb502a..c6382f7 100644 --- a/crates/kebab-parse-image/tests/common/mod.rs +++ b/crates/kebab-parse-image/tests/common/mod.rs @@ -47,8 +47,7 @@ pub fn no_exif_png() -> Vec { /// adapter's downscale path. Solid-colour PNGs compress aggressively, so /// the on-disk size stays well under 1 MB despite the large dimensions. pub fn large_blue_4000x3000_png() -> Vec { - let img: ImageBuffer, _> = - ImageBuffer::from_fn(4000, 3000, |_, _| Rgb([0, 0, 255])); + let img: ImageBuffer, _> = ImageBuffer::from_fn(4000, 3000, |_, _| Rgb([0, 0, 255])); let mut buf = Cursor::new(Vec::new()); img.write_to(&mut buf, image::ImageFormat::Png) .expect("encoding 4000x3000 PNG must not fail"); @@ -72,9 +71,7 @@ pub fn hello_world_png() -> anyhow::Result> { ImageBuffer::from_fn(400, 100, |_, _| Rgb([255, 255, 255])); let font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"; let font_bytes = std::fs::read(font_path).with_context(|| { - format!( - "{font_path} not found — only the opt-in OCR integration fixture needs this font" - ) + format!("{font_path} not found — only the opt-in OCR integration fixture needs this font") })?; let font = FontRef::try_from_slice(&font_bytes).context("DejaVu font parses")?; let scaled = font.as_scaled(40.0); @@ -211,7 +208,10 @@ fn build_exif_blob_gps(flavor: GpsFlavor) -> Vec { tag: Tag::GPSLatitude, ifd_num: In::PRIMARY, value: Value::Rational(vec![ - Rational { num: lat_deg, denom: 1 }, + Rational { + num: lat_deg, + denom: 1, + }, Rational { num: 30, denom: 1 }, Rational { num: 0, denom: 1 }, ]), @@ -225,7 +225,10 @@ fn build_exif_blob_gps(flavor: GpsFlavor) -> Vec { tag: Tag::GPSLongitude, ifd_num: In::PRIMARY, value: Value::Rational(vec![ - Rational { num: lon_deg, denom: 1 }, + Rational { + 
num: lon_deg, + denom: 1, + }, Rational { num: 0, denom: 1 }, Rational { num: 0, denom: 1 }, ]), @@ -328,4 +331,3 @@ pub fn strip_dynamic_at(json: &mut serde_json::Value) { } } } - diff --git a/crates/kebab-parse-image/tests/extractor.rs b/crates/kebab-parse-image/tests/extractor.rs index 459f26f..cfc82a2 100644 --- a/crates/kebab-parse-image/tests/extractor.rs +++ b/crates/kebab-parse-image/tests/extractor.rs @@ -80,8 +80,14 @@ fn jpeg_with_exif_gps_captures_whitelisted_tags() { Some(&Value::String("2024-08-15T12:34:56".into())) ); assert_eq!(exif.get("orientation"), Some(&Value::Number(1.into()))); - let lat = exif.get("gps_lat").and_then(serde_json::Value::as_f64).expect("gps_lat"); - let lon = exif.get("gps_lon").and_then(serde_json::Value::as_f64).expect("gps_lon"); + let lat = exif + .get("gps_lat") + .and_then(serde_json::Value::as_f64) + .expect("gps_lat"); + let lon = exif + .get("gps_lon") + .and_then(serde_json::Value::as_f64) + .expect("gps_lon"); assert!((lat - 37.5).abs() < 1e-6, "lat={lat}"); assert!((lon - 127.0).abs() < 1e-6, "lon={lon}"); @@ -118,7 +124,10 @@ fn no_exif_image_yields_empty_exif_map() { .get("exif") .and_then(|v| v.as_object()) .expect("exif object present"); - assert!(exif.is_empty(), "no-EXIF PNG must yield empty exif map: {exif:?}"); + assert!( + exif.is_empty(), + "no-EXIF PNG must yield empty exif map: {exif:?}" + ); } #[test] @@ -147,7 +156,12 @@ fn corrupt_image_emits_warning_no_panic() { let block = extract_block(&doc); assert!(matches!( block.common.source_span, - SourceSpan::Region { x: 0, y: 0, w: 0, h: 0 } + SourceSpan::Region { + x: 0, + y: 0, + w: 0, + h: 0 + } )); // Warning provenance event. 
let warnings: Vec<_> = doc @@ -281,7 +295,10 @@ fn jpeg_with_gps_out_of_range_drops_latitude() { !exif.contains_key("gps_lat"), "out-of-range latitude must be dropped" ); - let lon = exif.get("gps_lon").and_then(serde_json::Value::as_f64).expect("gps_lon"); + let lon = exif + .get("gps_lon") + .and_then(serde_json::Value::as_f64) + .expect("gps_lon"); assert!((lon - 127.0).abs() < 1e-6); } diff --git a/crates/kebab-parse-image/tests/ocr.rs b/crates/kebab-parse-image/tests/ocr.rs index a3f29f1..63d848f 100644 --- a/crates/kebab-parse-image/tests/ocr.rs +++ b/crates/kebab-parse-image/tests/ocr.rs @@ -8,8 +8,7 @@ mod common; use kebab_config::Config; use kebab_core::{ - AssetId, BlockId, CommonBlock, ImageRefBlock, Lang, ProvenanceEvent, ProvenanceKind, - SourceSpan, + AssetId, BlockId, CommonBlock, ImageRefBlock, Lang, ProvenanceEvent, ProvenanceKind, SourceSpan, }; use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_ocr}; use serde_json::json; @@ -82,7 +81,11 @@ async fn ocr_recognize_decodes_response_into_ocr_text() { assert_eq!(text.joined, "Hello World 2026"); assert_eq!(text.engine, "ollama-vision"); assert!(text.engine_version.starts_with("ollama/gemma4:e4b")); - assert_eq!(text.regions.len(), 1, "non-empty joined → exactly one region"); + assert_eq!( + text.regions.len(), + 1, + "non-empty joined → exactly one region" + ); assert_eq!(text.regions[0].text, "Hello World 2026"); assert!((text.regions[0].confidence - 1.0).abs() < 1e-6); // Region bbox covers prepared image dimensions (100×50 < max_pixels @@ -183,23 +186,22 @@ async fn apply_ocr_sets_block_ocr_and_appends_provenance() { let bytes = red_100x50_png(); let cfg = cfg_for_endpoint(&server.uri()); - let (block, events) = - tokio::task::spawn_blocking(move || -> anyhow::Result<_> { - let engine = OllamaVisionOcr::new(&cfg)?; - let mut block = empty_image_block(); - let mut events: Vec = Vec::new(); - apply_ocr( - &engine, - &bytes, - &mut block, - Some(&Lang("ko".to_string())), - &mut events, - )?; 
- Ok((block, events)) - }) - .await - .expect("blocking task panicked") - .expect("apply_ocr must succeed"); + let (block, events) = tokio::task::spawn_blocking(move || -> anyhow::Result<_> { + let engine = OllamaVisionOcr::new(&cfg)?; + let mut block = empty_image_block(); + let mut events: Vec = Vec::new(); + apply_ocr( + &engine, + &bytes, + &mut block, + Some(&Lang("ko".to_string())), + &mut events, + )?; + Ok((block, events)) + }) + .await + .expect("blocking task panicked") + .expect("apply_ocr must succeed"); let ocr = block.ocr.as_ref().expect("ocr Some after apply_ocr"); assert_eq!(ocr.joined, "안녕 2026"); @@ -287,8 +289,7 @@ async fn ocr_downscales_large_image_before_sending() { // Pull the request body, parse JSON, base64-decode the image, and // verify the long edge is at most max_pixels (1024). let raw = captured.lock().unwrap().clone().expect("request captured"); - let value: serde_json::Value = - serde_json::from_slice(&raw).expect("request body is JSON"); + let value: serde_json::Value = serde_json::from_slice(&raw).expect("request body is JSON"); let imgs = value .get("images") .and_then(|v| v.as_array()) @@ -322,8 +323,7 @@ async fn ocr_downscales_large_image_before_sending() { #[test] fn from_parts_clamps_max_pixels_into_legal_range() { // Below MIN_LONG_EDGE — bumped up to the floor. - let too_small = - OllamaVisionOcr::from_parts("http://x", "m", vec![], 10, 300).unwrap(); + let too_small = OllamaVisionOcr::from_parts("http://x", "m", vec![], 10, 300).unwrap(); assert_eq!( too_small.max_pixels(), 256, @@ -331,8 +331,7 @@ fn from_parts_clamps_max_pixels_into_legal_range() { ); // Above MAX_LONG_EDGE — capped at the ceiling. 
- let too_big = - OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999, 300).unwrap(); + let too_big = OllamaVisionOcr::from_parts("http://x", "m", vec![], 99_999, 300).unwrap(); assert_eq!( too_big.max_pixels(), 4096, @@ -340,8 +339,7 @@ fn from_parts_clamps_max_pixels_into_legal_range() { ); // Inside the legal range — pass through untouched. - let in_range = - OllamaVisionOcr::from_parts("http://x", "m", vec![], 1024, 300).unwrap(); + let in_range = OllamaVisionOcr::from_parts("http://x", "m", vec![], 1024, 300).unwrap(); assert_eq!(in_range.max_pixels(), 1024); } @@ -364,8 +362,7 @@ fn from_parts_clamps_max_pixels_into_legal_range() { async fn ocr_integration_real_ollama_transcribes_text() { let endpoint = std::env::var("KEBAB_IMAGE_OCR_ENDPOINT") .unwrap_or_else(|_| "http://192.168.0.47:11434".to_string()); - let model = - std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string()); + let model = std::env::var("KEBAB_IMAGE_OCR_MODEL").unwrap_or_else(|_| "gemma4:e4b".to_string()); // Generate a fixture with known text. If the DejaVu font is // missing from this dev box, skip rather than crash. diff --git a/crates/kebab-parse-md/src/blocks.rs b/crates/kebab-parse-md/src/blocks.rs index 71d03d6..6f545eb 100644 --- a/crates/kebab-parse-md/src/blocks.rs +++ b/crates/kebab-parse-md/src/blocks.rs @@ -33,8 +33,8 @@ use std::ops::Range; -use kebab_core::{Inline, SourceSpan}; use crate::types::{ParsedBlock, ParsedBlockKind, ParsedPayload, Warning, WarningKind}; +use kebab_core::{Inline, SourceSpan}; use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd}; /// Parse a Markdown body into a flat `Vec` plus any warnings. 
@@ -60,7 +60,9 @@ pub fn parse_blocks( let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { parse_blocks_inner(body, body_offset_lines) })); - if let Ok(out) = result { Ok(out) } else { + if let Ok(out) = result { + Ok(out) + } else { tracing::warn!("parse_blocks panicked on adversarial input; returning empty"); Ok(( Vec::new(), @@ -99,7 +101,8 @@ fn parse_blocks_inner(body: &[u8], body_offset_lines: u32) -> (Vec, // possibly-inverted spans would be more harmful than dropping output. if state.overflow_detected { let at = state - .overflow_at_body_line.map_or_else(|| "?".to_string(), |n| n.to_string()); + .overflow_at_body_line + .map_or_else(|| "?".to_string(), |n| n.to_string()); return ( Vec::new(), vec![Warning { @@ -273,7 +276,11 @@ enum InlineFrame { Top(Vec), Strong(Vec), Emph(Vec), - Link { href: String, text: String, kids: Vec }, + Link { + href: String, + text: String, + kids: Vec, + }, } impl InlineBuf { @@ -295,12 +302,16 @@ impl InlineBuf { fn push_text(&mut self, s: &str) { self.text.push_str(s); - self.push_inline(Inline::Text { text: s.to_string() }); + self.push_inline(Inline::Text { + text: s.to_string(), + }); } fn push_code(&mut self, s: &str) { self.text.push_str(s); - self.push_inline(Inline::Code { code: s.to_string() }); + self.push_inline(Inline::Code { + code: s.to_string(), + }); } fn open_strong(&mut self) { @@ -375,7 +386,6 @@ impl InlineBuf { }; (top, self.text) } - } /// Flatten an emitted block into the inline buffer of an enclosing list @@ -413,7 +423,10 @@ fn flatten_block_into_item(block: &ParsedBlock, inlines: &mut InlineBuf) { let hashes = "#".repeat((*level as usize).clamp(1, 6)); inlines.push_text(&format!("\n{hashes} {text}\n")); } - ParsedPayload::Paragraph { text, inlines: child } => { + ParsedPayload::Paragraph { + text, + inlines: child, + } => { // Paragraphs inside list items normally don't reach this path // (the Start(Tag::Paragraph) handler suppresses creating a // Paragraph frame when the parent is 
a ListItem). This branch @@ -526,7 +539,9 @@ impl<'a> WalkState<'a> { if let (Some(start), Some(end)) = ( start_body.checked_add(self.body_offset_lines), end_body.checked_add(self.body_offset_lines), - ) { SourceSpan::Line { start, end } } else { + ) { + SourceSpan::Line { start, end } + } else { if !self.overflow_detected { self.overflow_detected = true; self.overflow_at_body_line = Some(start_body); @@ -604,10 +619,7 @@ impl<'a> WalkState<'a> { }); } Event::Start(Tag::List(start)) => { - let nested_in_item = matches!( - self.frames.last(), - Some(Frame::ListItem { .. }) - ); + let nested_in_item = matches!(self.frames.last(), Some(Frame::ListItem { .. })); self.frames.push(Frame::List { ordered: start.is_some(), range, @@ -630,7 +642,13 @@ impl<'a> WalkState<'a> { } else { // Take only the first whitespace-delimited token, // matching how editors render the info string. - Some(trimmed.split_whitespace().next().unwrap_or(trimmed).to_string()) + Some( + trimmed + .split_whitespace() + .next() + .unwrap_or(trimmed) + .to_string(), + ) } } }; @@ -709,7 +727,12 @@ impl<'a> WalkState<'a> { // The Tag::Heading frame is the source of truth for the // level — `_level` from TagEnd is identical for well-formed // input. We trust the frame. 
- if let Some(Frame::Heading { level: level_to_use, range, inlines }) = self.frames.pop() { + if let Some(Frame::Heading { + level: level_to_use, + range, + inlines, + }) = self.frames.pop() + { let (_inline_vec, text) = inlines.finish(); let text = text.trim().to_string(); @@ -737,7 +760,10 @@ impl<'a> WalkState<'a> { kind: ParsedBlockKind::Heading, heading_path: path, source_span: self.span_for(&range), - payload: ParsedPayload::Heading { level: level_to_use, text }, + payload: ParsedPayload::Heading { + level: level_to_use, + text, + }, }; self.emit_block(block); } @@ -781,7 +807,10 @@ impl<'a> WalkState<'a> { kind: ParsedBlockKind::Paragraph, heading_path: self.heading_path(), source_span: span, - payload: ParsedPayload::Paragraph { text, inlines: inline_vec }, + payload: ParsedPayload::Paragraph { + text, + inlines: inline_vec, + }, }; self.emit_block(block); } @@ -803,8 +832,14 @@ impl<'a> WalkState<'a> { let mut inlines: Vec = Vec::new(); for c in &children { match &c.payload { - ParsedPayload::Paragraph { text: t, inlines: il } - | ParsedPayload::Quote { text: t, inlines: il } => { + ParsedPayload::Paragraph { + text: t, + inlines: il, + } + | ParsedPayload::Quote { + text: t, + inlines: il, + } => { if !text.is_empty() { text.push('\n'); } @@ -831,7 +866,13 @@ impl<'a> WalkState<'a> { } } Event::End(TagEnd::List(_)) => { - if let Some(Frame::List { ordered, range, items, nested_in_item }) = self.frames.pop() { + if let Some(Frame::List { + ordered, + range, + items, + nested_in_item, + }) = self.frames.pop() + { if nested_in_item { // Flatten this sub-list into the enclosing list // item's inline text. 
Each item becomes a line @@ -871,7 +912,12 @@ impl<'a> WalkState<'a> { } } Event::End(TagEnd::CodeBlock) => { - if let Some(Frame::Code { lang, range, mut code }) = self.frames.pop() { + if let Some(Frame::Code { + lang, + range, + mut code, + }) = self.frames.pop() + { // Fenced code blocks include a trailing newline from the // last source line; pulldown-cmark already strips the // closing fence. We trim a single trailing `\n` for @@ -903,11 +949,9 @@ impl<'a> WalkState<'a> { kind: WarningKind::MalformedTable, note, }); - let raw = std::str::from_utf8( - self.body.get(range.clone()).unwrap_or(&[]), - ) - .unwrap_or_default() - .to_string(); + let raw = std::str::from_utf8(self.body.get(range.clone()).unwrap_or(&[])) + .unwrap_or_default() + .to_string(); ParsedBlock { kind: ParsedBlockKind::Paragraph, heading_path: self.heading_path(), @@ -1187,8 +1231,18 @@ mod tests { let body = "# A\n\n## A.1\n\np1\n\n# B\n\np2\n"; let (blocks, _) = parse(body, 1); // p1 under [A, A.1]; p2 under [B]. - let p1 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p1")).unwrap(); - let p2 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p2")).unwrap(); + let p1 = blocks + .iter() + .find( + |b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p1"), + ) + .unwrap(); + let p2 = blocks + .iter() + .find( + |b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p2"), + ) + .unwrap(); assert_eq!(p1.heading_path, vec!["A".to_string(), "A.1".to_string()]); assert_eq!(p2.heading_path, vec!["B".to_string()]); } @@ -1226,9 +1280,9 @@ mod tests { let (blocks, _) = parse(body, 1); let inner = blocks .iter() - .find(|b| { - matches!(b.payload, ParsedPayload::Heading { ref text, .. } if text == "Inner") - }) + .find( + |b| matches!(b.payload, ParsedPayload::Heading { ref text, .. 
} if text == "Inner"), + ) .expect("Inner heading present"); assert_eq!( inner.heading_path, @@ -1282,10 +1336,19 @@ mod tests { assert_eq!(blocks.len(), 1); match &blocks[0].payload { ParsedPayload::Table { headers, rows } => { - assert_eq!(headers, &vec!["a".to_string(), "b".to_string(), "c".to_string()]); + assert_eq!( + headers, + &vec!["a".to_string(), "b".to_string(), "c".to_string()] + ); assert_eq!(rows.len(), 2); - assert_eq!(rows[0], vec!["1".to_string(), "2".to_string(), "3".to_string()]); - assert_eq!(rows[1], vec!["x".to_string(), "y".to_string(), "z".to_string()]); + assert_eq!( + rows[0], + vec!["1".to_string(), "2".to_string(), "3".to_string()] + ); + assert_eq!( + rows[1], + vec!["x".to_string(), "y".to_string(), "z".to_string()] + ); } _ => panic!("expected table, got {:?}", blocks[0].payload), } @@ -1313,7 +1376,11 @@ mod tests { // Synthetic events — fake a 3-column table with a 2-cell header so // the `malformed` branch fires. let aligns = vec![Alignment::None, Alignment::None, Alignment::None]; - state.handle_event(Event::Start(Tag::Table(aligns)), 0..body.len(), &mut warnings); + state.handle_event( + Event::Start(Tag::Table(aligns)), + 0..body.len(), + &mut warnings, + ); state.handle_event(Event::Start(Tag::TableHead), 0..0, &mut warnings); state.handle_event(Event::Start(Tag::TableCell), 0..0, &mut warnings); state.handle_event(Event::Text(CowStr::Borrowed("a")), 0..0, &mut warnings); @@ -1330,7 +1397,10 @@ mod tests { assert_eq!(blocks[0].kind, ParsedBlockKind::Paragraph); match &blocks[0].payload { ParsedPayload::Paragraph { text, .. } => { - assert!(text.contains("| a | b |"), "raw markdown preserved: {text:?}"); + assert!( + text.contains("| a | b |"), + "raw markdown preserved: {text:?}" + ); } _ => panic!("expected paragraph fallback"), } @@ -1470,7 +1540,11 @@ mod tests { assert!( matches!( inl, - Inline::Text { .. } | Inline::Code { .. } | Inline::Link { .. } | Inline::Strong { .. } | Inline::Emph { .. } + Inline::Text { .. 
} + | Inline::Code { .. } + | Inline::Link { .. } + | Inline::Strong { .. } + | Inline::Emph { .. } ), "unexpected inline kind: {inl:?}" ); @@ -1551,7 +1625,10 @@ mod tests { assert_eq!(items.len(), 2); let flat = flatten_inlines_to_text(&items[0]); assert!(flat.contains("item"), "first item missing 'item': {flat:?}"); - assert!(flat.contains("fn f(){}"), "first item missing code body: {flat:?}"); + assert!( + flat.contains("fn f(){}"), + "first item missing code body: {flat:?}" + ); assert!(flat.contains("```"), "first item missing fence: {flat:?}"); assert_eq!(flatten_inlines_to_text(&items[1]).trim(), "next"); } @@ -1658,13 +1735,13 @@ mod tests { b"\0\0\0", b"```\nunclosed", b"# heading\n```\nfn main() {", - b"| a | b |\n|---|---|\n| 1 |\n", // short row - b"| a | b |\n|---|\n| 1 | 2 |\n", // header/sep mismatch + b"| a | b |\n|---|---|\n| 1 |\n", // short row + b"| a | b |\n|---|\n| 1 | 2 |\n", // header/sep mismatch b"![", b"](", b"---\nfm: yes\n", - b"#######", // 7 hashes (invalid heading) - b"\xff\xfe\x00\x00garbage", // non-utf8 + b"#######", // 7 hashes (invalid heading) + b"\xff\xfe\x00\x00garbage", // non-utf8 "# 한글\n\n본문\n".as_bytes(), ]; for c in cases { @@ -1727,13 +1804,16 @@ mod tests { let (blocks, _) = parse(body, 1); match &blocks[0].payload { ParsedPayload::Paragraph { inlines, .. } => { - let kinds: Vec<&'static str> = inlines.iter().map(|i| match i { - Inline::Text { .. } => "Text", - Inline::Code { .. } => "Code", - Inline::Link { .. } => "Link", - Inline::Strong { .. } => "Strong", - Inline::Emph { .. } => "Emph", - }).collect(); + let kinds: Vec<&'static str> = inlines + .iter() + .map(|i| match i { + Inline::Text { .. } => "Text", + Inline::Code { .. } => "Code", + Inline::Link { .. } => "Link", + Inline::Strong { .. } => "Strong", + Inline::Emph { .. 
} => "Emph", + }) + .collect(); assert!(kinds.contains(&"Strong")); assert!(kinds.contains(&"Emph")); assert!(kinds.contains(&"Code")); diff --git a/crates/kebab-parse-md/src/frontmatter.rs b/crates/kebab-parse-md/src/frontmatter.rs index cd27a1c..04b6efd 100644 --- a/crates/kebab-parse-md/src/frontmatter.rs +++ b/crates/kebab-parse-md/src/frontmatter.rs @@ -18,8 +18,8 @@ use std::ops::Range; use std::sync::OnceLock; -use kebab_core::{Metadata, SourceType, TrustLevel}; use crate::types::{Warning, WarningKind}; +use kebab_core::{Metadata, SourceType, TrustLevel}; use lingua::{IsoCode639_1, Language, LanguageDetector, LanguageDetectorBuilder}; use serde::Deserialize; use serde_json::{Map, Value}; @@ -430,25 +430,33 @@ fn derive_metadata( // ---- source_type ---- let source_type = match raw.source_type.as_deref() { None => SourceType::Markdown, - Some(s) => if let Some(st) = parse_source_type(s) { st } else { - warnings.push(Warning { - kind: WarningKind::MalformedFrontmatter, - note: format!("unknown source_type={s}, defaulted to markdown"), - }); - SourceType::Markdown - }, + Some(s) => { + if let Some(st) = parse_source_type(s) { + st + } else { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: format!("unknown source_type={s}, defaulted to markdown"), + }); + SourceType::Markdown + } + } }; // ---- trust_level ---- let trust_level = match raw.trust_level.as_deref() { None => TrustLevel::Primary, - Some(s) => if let Some(tl) = parse_trust_level(s) { tl } else { - warnings.push(Warning { - kind: WarningKind::MalformedFrontmatter, - note: format!("unknown trust_level={s}, defaulted to primary"), - }); - TrustLevel::Primary - }, + Some(s) => { + if let Some(tl) = parse_trust_level(s) { + tl + } else { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: format!("unknown trust_level={s}, defaulted to primary"), + }); + TrustLevel::Primary + } + } }; // ---- id alias ---- @@ -587,11 +595,7 @@ fn iso_code(lang: Language) 
-> &'static str { #[cfg(test)] mod tests { use super::*; - use kebab_core::{ - AssetId, WorkspacePath, - ids::id_for_doc, - versions::ParserVersion, - }; + use kebab_core::{AssetId, WorkspacePath, ids::id_for_doc, versions::ParserVersion}; use time::macros::datetime; fn hints() -> BodyHints { @@ -626,7 +630,10 @@ trust_level: secondary\n\ assert_eq!(meta.trust_level, TrustLevel::Secondary); assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC)); - assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("My Doc")); + assert_eq!( + meta.user.get("title").and_then(|v| v.as_str()), + Some("My Doc") + ); assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en")); assert_eq!(meta.user_id_alias, None); } @@ -679,7 +686,11 @@ source_type: alien\n\ assert_eq!(meta.trust_level, TrustLevel::Primary); assert_eq!(meta.source_type, SourceType::Markdown); assert_eq!(warns.len(), 2); - assert!(warns.iter().all(|w| matches!(w.kind, WarningKind::MalformedFrontmatter))); + assert!( + warns + .iter() + .all(|w| matches!(w.kind, WarningKind::MalformedFrontmatter)) + ); assert!(warns.iter().any(|w| w.note.contains("trust_level=weird"))); assert!(warns.iter().any(|w| w.note.contains("source_type=alien"))); } @@ -776,7 +787,10 @@ source_type: alien\n\ #[test] fn detect_delimiters_no_match_without_leading_marker() { assert!(detect_delimiters(b"# heading\n---\n---\n").is_none()); - assert!(detect_delimiters(b" ---\n---\n").is_none(), "leading whitespace"); + assert!( + detect_delimiters(b" ---\n---\n").is_none(), + "leading whitespace" + ); assert!(detect_delimiters(b"").is_none()); } @@ -857,10 +871,7 @@ updated_at: 2024-03-02T00:00:00Z\r\n\ let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap(); assert!(warns.is_empty(), "warnings: {warns:?}"); assert!(span.is_some()); - assert_eq!( - meta.user.get("title").and_then(|v| v.as_str()), - Some("Doc") - ); + 
assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("Doc")); assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC)); } @@ -874,10 +885,7 @@ created_at = \"2024-03-01T00:00:00Z\"\r\n\ let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap(); assert!(warns.is_empty(), "warnings: {warns:?}"); assert!(span.is_some()); - assert_eq!( - meta.user.get("title").and_then(|v| v.as_str()), - Some("Doc") - ); + assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("Doc")); assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); } @@ -960,10 +968,7 @@ created_at: 2024-03-01T00:00:00Z\n\ assert!(warns.is_empty(), "warnings: {warns:?}"); let span = span.expect("span present"); assert_eq!(span.start, 3); - assert_eq!( - meta.user.get("title").and_then(|v| v.as_str()), - Some("Doc") - ); + assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("Doc")); assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en")); assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); } diff --git a/crates/kebab-parse-md/src/lib.rs b/crates/kebab-parse-md/src/lib.rs index 8e7c083..8f03855 100644 --- a/crates/kebab-parse-md/src/lib.rs +++ b/crates/kebab-parse-md/src/lib.rs @@ -36,10 +36,16 @@ pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter}; // Spec §3.3 의 surface 보존 정책 — explicit (NOT glob) 으로 future addition leak 방지. pub use crate::normalize::{build_canonical_document, derive_title}; pub use crate::types::{ + ParsedAudioSegment, // 5 사용 type - ParsedBlock, ParsedBlockKind, ParsedPayload, Warning, WarningKind, + ParsedBlock, + ParsedBlockKind, // 3 forward-declared struct (보존 — spec §3.3 + §11.5 future surface) - ParsedImageRegion, ParsedPdfPage, ParsedAudioSegment, + ParsedImageRegion, + ParsedPayload, + ParsedPdfPage, + Warning, + WarningKind, }; /// Parser-version label for Markdown files ingested through this crate. 
diff --git a/crates/kebab-parse-md/src/normalize.rs b/crates/kebab-parse-md/src/normalize.rs index 0f45d56..9adf4be 100644 --- a/crates/kebab-parse-md/src/normalize.rs +++ b/crates/kebab-parse-md/src/normalize.rs @@ -22,13 +22,13 @@ use std::collections::HashMap; use std::path::Path; +use crate::types::{ParsedBlock, ParsedPayload, Warning, WarningKind}; use anyhow::Result; use kebab_core::{ Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock, ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, RawAsset, TableBlock, TextBlock, id_for_block, id_for_doc, }; -use crate::types::{ParsedBlock, ParsedPayload, Warning, WarningKind}; use time::OffsetDateTime; use unicode_normalization::UnicodeNormalization; @@ -234,11 +234,9 @@ fn lift_block( // wire form matches ID input). Without this, NFD `\u{1100}\u{1161}` // and NFC `\u{AC00}` (both render as 가) would produce different // `block_id`s for what is logically the same heading. 
- let heading_path_nfc: Vec = - pb.heading_path.iter().map(|s| s.nfc().collect()).collect(); + let heading_path_nfc: Vec = pb.heading_path.iter().map(|s| s.nfc().collect()).collect(); let ordinal = next_ordinal(counters, &heading_path_nfc, kind); - let block_id: BlockId = - id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span); + let block_id: BlockId = id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span); let common = CommonBlock { block_id, heading_path: heading_path_nfc, @@ -426,8 +424,8 @@ fn workspace_path_stem(workspace_path: &str) -> String { mod tests { use super::*; use kebab_core::{ - AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri, - TrustLevel, WorkspacePath, normalize::to_posix, + AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri, TrustLevel, + WorkspacePath, normalize::to_posix, }; use serde_json::Value; use std::path::{Path, PathBuf}; @@ -581,8 +579,7 @@ mod tests { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); // Compute the expected IDs out-of-band so the test pins both // the (heading_path, kind) ordinal grouping AND the value of @@ -647,8 +644,7 @@ mod tests { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect(); assert_eq!( kinds, @@ -665,7 +661,10 @@ mod tests { assert_eq!(events[2].agent, "kb-normalize"); // Pin the implementation invariant that Parsed and Normalized // share the single `now_utc()` reading inside one call. 
- assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc"); + assert_eq!( + events[1].at, events[2].at, + "Parsed and Normalized share now_utc" + ); } /// Warnings carried into `build_canonical_document` are emitted as @@ -679,13 +678,17 @@ mod tests { kind: WarningKind::MalformedFrontmatter, note: "missing closing fence".into(), }]; - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); assert_eq!(doc.provenance.events.len(), 4); let last = doc.provenance.events.last().unwrap(); assert_eq!(last.kind, ProvenanceKind::Warning); assert_eq!(last.agent, "kb-parse-md"); - assert!(last.note.as_deref().unwrap().contains("missing closing fence")); + assert!( + last.note + .as_deref() + .unwrap() + .contains("missing closing fence") + ); } /// `metadata.user["title"]` and `metadata.user["lang"]` are lifted @@ -697,8 +700,7 @@ mod tests { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.title, "Example"); assert_eq!(doc.lang, Lang("en".into())); assert!(!doc.metadata.user.contains_key("title")); @@ -744,14 +746,9 @@ mod tests { // determinism is exercised on a non-empty `lift_block` path // (block_id hashing, NFC normalization, ordinal counters), not // just an empty Vec. 
- let baseline = build_canonical_document( - &asset, - metadata.clone(), - fixture_blocks_five(), - &pv, - vec![], - ) - .unwrap(); + let baseline = + build_canonical_document(&asset, metadata.clone(), fixture_blocks_five(), &pv, vec![]) + .unwrap(); let baseline_json = serde_json::to_string(&strip_dynamic_at(&baseline)).unwrap(); let start = std::time::Instant::now(); @@ -788,8 +785,7 @@ mod tests { kind: WarningKind::ExtractFailed, note: "pulldown-cmark panicked; body discarded".into(), }]; - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); let warning_event = doc .provenance .events @@ -825,14 +821,11 @@ mod tests { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); // No AudioRef block in the canonical output. assert!( - !doc.blocks - .iter() - .any(|b| matches!(b, Block::AudioRef(_))), + !doc.blocks.iter().any(|b| matches!(b, Block::AudioRef(_))), "AudioRef block should be skipped pre-P8" ); @@ -908,8 +901,7 @@ mod tests { .user .insert("title".into(), Value::String(String::new())); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); // workspace_path = "notes/example.md" → stem "example". 
assert_eq!(doc.title, "example"); } @@ -926,8 +918,7 @@ mod tests { .user .insert("title".into(), Value::Number(42.into())); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.title, "example"); } @@ -940,8 +931,7 @@ mod tests { let mut metadata = fixture_metadata(); metadata.user.insert("lang".into(), Value::Array(vec![])); let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.lang, Lang(String::new())); } @@ -995,14 +985,22 @@ mod tests { /// Step 2 — first H1 wins when frontmatter empty. #[test] fn derive_title_uses_h1_when_no_frontmatter() { - let blocks = vec![paragraph("intro"), heading(1, "Real Title"), heading(2, "Sub")]; + let blocks = vec![ + paragraph("intro"), + heading(1, "Real Title"), + heading(2, "Sub"), + ]; assert_eq!(derive_title("", &blocks, "stem"), "Real Title"); } /// Step 3 — first H2 wins when no H1. #[test] fn derive_title_uses_h2_when_no_h1() { - let blocks = vec![heading(2, "First H2"), heading(2, "Second H2"), heading(1, "")]; + let blocks = vec![ + heading(2, "First H2"), + heading(2, "Second H2"), + heading(1, ""), + ]; assert_eq!(derive_title("", &blocks, "stem"), "First H2"); } @@ -1021,7 +1019,9 @@ mod tests { lang: None, code: "code should be skipped".into(), }), - paragraph("This paragraph wins. Long text that would exceed eighty characters once concatenated end-to-end here."), + paragraph( + "This paragraph wins. 
Long text that would exceed eighty characters once concatenated end-to-end here.", + ), ]; let title = derive_title("", &blocks, "stem"); assert_eq!(title.chars().count(), 80); @@ -1037,7 +1037,10 @@ mod tests { headers: vec!["a".into()], rows: vec![vec!["1".into()]], })]; - assert_eq!(derive_title("", &blocks, "table-only-doc"), "table-only-doc"); + assert_eq!( + derive_title("", &blocks, "table-only-doc"), + "table-only-doc" + ); } /// Step 5 sentinel — empty file_stem AND no usable blocks falls back @@ -1076,8 +1079,7 @@ mod tests { }, }]; let pv = parser_version(); - let doc = - build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); assert_eq!(doc.title, "Lifted From H1"); } diff --git a/crates/kebab-parse-md/tests/blocks_snapshots.rs b/crates/kebab-parse-md/tests/blocks_snapshots.rs index 914240c..708568f 100644 --- a/crates/kebab-parse-md/tests/blocks_snapshots.rs +++ b/crates/kebab-parse-md/tests/blocks_snapshots.rs @@ -47,8 +47,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) { }; let actual: Value = serde_json::to_value(&snap).unwrap(); - let expected_text = - fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); + let expected_text = fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json"); if actual != expected { @@ -64,18 +63,12 @@ fn assert_snapshot(fixture: &str, baseline: &str) { #[test] fn nested_headings_blocks_snapshot() { - assert_snapshot( - "nested-headings.md", - "nested-headings.blocks.snapshot.json", - ); + assert_snapshot("nested-headings.md", "nested-headings.blocks.snapshot.json"); } #[test] fn code_and_table_blocks_snapshot() { - assert_snapshot( - "code-and-table.md", - "code-and-table.blocks.snapshot.json", - ); + assert_snapshot("code-and-table.md", "code-and-table.blocks.snapshot.json"); } 
/// Run with `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture` diff --git a/crates/kebab-parse-md/tests/frontmatter_snapshots.rs b/crates/kebab-parse-md/tests/frontmatter_snapshots.rs index ce9c6ae..cf9c473 100644 --- a/crates/kebab-parse-md/tests/frontmatter_snapshots.rs +++ b/crates/kebab-parse-md/tests/frontmatter_snapshots.rs @@ -52,8 +52,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) { }; let actual: Value = serde_json::to_value(&snap).unwrap(); - let expected_text = - fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); + let expected_text = fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json"); if actual != expected { @@ -107,5 +106,8 @@ fn snapshot_is_deterministic_across_runs() { let bytes = fs::read(dir.join("frontmatter-only.md")).unwrap(); let (a, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); let (b, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); - assert_eq!(serde_json::to_value(&a).unwrap(), serde_json::to_value(&b).unwrap()); + assert_eq!( + serde_json::to_value(&a).unwrap(), + serde_json::to_value(&b).unwrap() + ); } diff --git a/crates/kebab-parse-md/tests/normalize_snapshot.rs b/crates/kebab-parse-md/tests/normalize_snapshot.rs index 27dab88..3c84997 100644 --- a/crates/kebab-parse-md/tests/normalize_snapshot.rs +++ b/crates/kebab-parse-md/tests/normalize_snapshot.rs @@ -16,8 +16,7 @@ use std::path::PathBuf; use kebab_core::{ - AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri, - WorkspacePath, + AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri, WorkspacePath, }; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; use serde_json::Value; @@ -101,8 +100,7 @@ fn code_and_table_canonical_snapshot() { Some(span) => 
bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1, None => 1, }; - let (blocks, parse_warns) = - parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); + let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into()); let mut metadata = metadata; @@ -111,14 +109,8 @@ fn code_and_table_canonical_snapshot() { metadata.aliases.sort(); metadata.tags.sort(); - let doc = build_canonical_document( - &asset, - metadata, - blocks, - &parser_version, - parse_warns, - ) - .expect("build_canonical_document"); + let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns) + .expect("build_canonical_document"); // Assert the BodyHints → first_h1 → user.title → CanonicalDocument.title // lift chain end-to-end. Pinned in the snapshot too, but the explicit @@ -140,8 +132,7 @@ fn code_and_table_canonical_snapshot() { baseline_path.display() ), }; - let expected: Value = - serde_json::from_str(&baseline_text).expect("baseline parses as json"); + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); if actual != expected { if std::env::var("UPDATE_SNAPSHOTS").is_ok() { diff --git a/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs b/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs index 450273c..bf56a6e 100644 --- a/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs +++ b/crates/kebab-parse-pdf/examples/gen_smoke_pdf.rs @@ -11,7 +11,9 @@ use std::io::Write; fn main() { let mut args = std::env::args().skip(1); - let out = args.next().expect("usage: gen_smoke_pdf "); + let out = args + .next() + .expect("usage: gen_smoke_pdf "); let pages: Vec = args.collect(); if pages.is_empty() { eprintln!("at least one page text required"); @@ -34,10 +36,7 @@ fn main() { operations: vec![ Operation::new("BT", vec![]), Operation::new("Tf", vec!["F1".into(), 14.into()]), - Operation::new( - "Td", - 
vec![Object::Integer(72), Object::Integer(720)], - ), + Operation::new("Td", vec![Object::Integer(72), Object::Integer(720)]), Operation::new("Tj", vec![Object::string_literal(text.as_str())]), Operation::new("ET", vec![]), ], @@ -65,8 +64,7 @@ fn main() { Object::Integer(842), ], }; - doc.objects - .insert(pages_id, Object::Dictionary(pages_dict)); + doc.objects.insert(pages_id, Object::Dictionary(pages_dict)); let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id, diff --git a/crates/kebab-parse-pdf/src/info.rs b/crates/kebab-parse-pdf/src/info.rs index 4b3eb02..f107b63 100644 --- a/crates/kebab-parse-pdf/src/info.rs +++ b/crates/kebab-parse-pdf/src/info.rs @@ -29,10 +29,7 @@ pub(crate) fn extract_info(doc: &lopdf::Document) -> InfoDict { let dict = match info_obj { lopdf::Object::Dictionary(d) => Some(d), - lopdf::Object::Reference(id) => doc - .get_object(*id) - .ok() - .and_then(|o| o.as_dict().ok()), + lopdf::Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()), _ => None, }; diff --git a/crates/kebab-parse-pdf/src/page_image.rs b/crates/kebab-parse-pdf/src/page_image.rs index 900b8e5..8b09dcb 100644 --- a/crates/kebab-parse-pdf/src/page_image.rs +++ b/crates/kebab-parse-pdf/src/page_image.rs @@ -11,12 +11,10 @@ use anyhow::{Context, Result}; use lopdf::{Document, Object}; -pub fn extract_dctdecode_page_image( - pdf_doc: &Document, - page_num: u32, -) -> Result>> { +pub fn extract_dctdecode_page_image(pdf_doc: &Document, page_num: u32) -> Result>> { let pages = pdf_doc.get_pages(); - let &page_oid = pages.get(&page_num) + let &page_oid = pages + .get(&page_num) .with_context(|| format!("page {page_num} not in get_pages()"))?; // page → /Resources → /XObject → traverse for first /Subtype /Image with /Filter == /DCTDecode. 
@@ -27,12 +25,18 @@ pub fn extract_dctdecode_page_image( Some(Object::Reference(r)) => pdf_doc.get_dictionary(*r).ok().cloned(), _ => None, }; - let resources = match resources { Some(r) => r, None => return Ok(None) }; + let resources = match resources { + Some(r) => r, + None => return Ok(None), + }; let xobject_obj = resources.get(b"XObject").ok(); let xobject = match xobject_obj { Some(Object::Dictionary(d)) => d.clone(), - Some(Object::Reference(r)) => match pdf_doc.get_dictionary(*r) { Ok(d) => d.clone(), Err(_) => return Ok(None) }, + Some(Object::Reference(r)) => match pdf_doc.get_dictionary(*r) { + Ok(d) => d.clone(), + Err(_) => return Ok(None), + }, _ => return Ok(None), }; @@ -45,20 +49,31 @@ pub fn extract_dctdecode_page_image( Ok(Object::Stream(s)) => s.clone(), _ => continue, }; - let subtype_is_image = stream.dict.get(b"Subtype") + let subtype_is_image = stream + .dict + .get(b"Subtype") .ok() - .and_then(|o| match o { Object::Name(n) => Some(n.as_slice()), _ => None }) + .and_then(|o| match o { + Object::Name(n) => Some(n.as_slice()), + _ => None, + }) .is_some_and(|n| n == b"Image"); - if !subtype_is_image { continue; } + if !subtype_is_image { + continue; + } let filter_obj = stream.dict.get(b"Filter").ok(); let is_dct_only = match filter_obj { Some(Object::Name(n)) => n.as_slice() == b"DCTDecode", - Some(Object::Array(arr)) => arr.len() == 1 - && matches!(arr.first(), Some(Object::Name(n)) if n.as_slice() == b"DCTDecode"), + Some(Object::Array(arr)) => { + arr.len() == 1 + && matches!(arr.first(), Some(Object::Name(n)) if n.as_slice() == b"DCTDecode") + } _ => false, }; - if !is_dct_only { continue; } + if !is_dct_only { + continue; + } // raw bytes — lopdf 의 stream.content 는 already-encoded (filter 적용 // 후). DCTDecode 의 경우 raw JPEG bytes. 
diff --git a/crates/kebab-parse-pdf/src/text_quality.rs b/crates/kebab-parse-pdf/src/text_quality.rs index 4996b08..2a9ca19 100644 --- a/crates/kebab-parse-pdf/src/text_quality.rs +++ b/crates/kebab-parse-pdf/src/text_quality.rs @@ -9,9 +9,7 @@ // encodings fall through to `String::from_utf8_lossy(bytes)`, which yields // PUA / replacement-char territory already covered by `pure_pua_zero`. // Re-verify on lopdf dependency upgrade. -const MOJIBAKE_MARKERS: &[&str] = &[ - "?Identity-H Unimplemented?", -]; +const MOJIBAKE_MARKERS: &[&str] = &["?Identity-H Unimplemented?"]; /// Valid char ratio (0.0..=1.0). 빈 string → 0.0. /// valid := ASCII printable + Hangul (Jamo/Compatibility/Syllables) + CJK + Latin Extended + common Korean punctuation. @@ -53,7 +51,11 @@ pub fn compute_valid_char_ratio(s: &str) -> f32 { valid += 1; } } - let raw_ratio = if total == 0 { 0.0 } else { valid as f32 / total as f32 }; + let raw_ratio = if total == 0 { + 0.0 + } else { + valid as f32 / total as f32 + }; return raw_ratio.min(0.3); } } @@ -75,18 +77,30 @@ pub fn compute_valid_char_ratio(s: &str) -> f32 { fn is_valid_text_char(c: char) -> bool { let cp = c as u32; match cp { - 0x0009 | 0x000A | 0x000D => true, // tab / LF / CR - 0x0020..=0x007E => true, // ASCII printable - 0x00A0..=0x024F => true, // Latin-1 Supplement + Latin Extended-A/B - 0x1100..=0x11FF => true, // Hangul Jamo - 0x3130..=0x318F => true, // Hangul Compatibility Jamo - 0x4E00..=0x9FFF => true, // CJK Unified Ideographs - 0xAC00..=0xD7A3 => true, // Hangul Syllables - 0x2010..=0x205F => matches!(c, - '\u{2010}' | '\u{2013}' | '\u{2014}' | '\u{2015}' | - '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' | - '\u{201E}' | '\u{2026}' | '\u{2027}' | '\u{2032}' | '\u{2033}' - | '\u{00B7}'), + 0x0009 | 0x000A | 0x000D => true, // tab / LF / CR + 0x0020..=0x007E => true, // ASCII printable + 0x00A0..=0x024F => true, // Latin-1 Supplement + Latin Extended-A/B + 0x1100..=0x11FF => true, // Hangul Jamo + 0x3130..=0x318F => true, // 
Hangul Compatibility Jamo + 0x4E00..=0x9FFF => true, // CJK Unified Ideographs + 0xAC00..=0xD7A3 => true, // Hangul Syllables + 0x2010..=0x205F => matches!( + c, + '\u{2010}' + | '\u{2013}' + | '\u{2014}' + | '\u{2015}' + | '\u{2018}' + | '\u{2019}' + | '\u{201C}' + | '\u{201D}' + | '\u{201E}' + | '\u{2026}' + | '\u{2027}' + | '\u{2032}' + | '\u{2033}' + | '\u{00B7}' + ), _ => false, } } @@ -116,7 +130,9 @@ mod tests { fn pure_pua_zero() { // Private Use Area codepoints — mojibake 의 patten. // U+E000..U+F8FF 가 valid char list 에 없음. - let s: String = (0xE000u32..0xE010).map(|c| char::from_u32(c).unwrap()).collect(); + let s: String = (0xE000u32..0xE010) + .map(|c| char::from_u32(c).unwrap()) + .collect(); let r = compute_valid_char_ratio(&s); assert_eq!(r, 0.0); } @@ -125,7 +141,9 @@ mod tests { fn mixed_half() { // 5 valid ASCII + 5 PUA → 0.5 let mut s = String::from("ABCDE"); - for c in 0xE000u32..0xE005 { s.push(char::from_u32(c).unwrap()); } + for c in 0xE000u32..0xE005 { + s.push(char::from_u32(c).unwrap()); + } let r = compute_valid_char_ratio(&s); assert!((r - 0.5).abs() < 1e-6, "got {r}"); } @@ -138,7 +156,7 @@ mod tests { #[test] fn hangul_jamo_valid() { - let r = compute_valid_char_ratio("\u{1100}\u{1161}"); // Jamo ㄱㅏ + let r = compute_valid_char_ratio("\u{1100}\u{1161}"); // Jamo ㄱㅏ assert!((r - 1.0).abs() < 1e-6, "got {r}"); } @@ -153,7 +171,10 @@ mod tests { let doc = Document::load_mem(bytes).unwrap(); let text = doc.extract_text(&[1]).unwrap_or_default(); let r = compute_valid_char_ratio(&text); - assert!(r < 0.5, "F4 mojibake fixture 의 valid_ratio < 0.5 (production OCR trigger threshold — got {r})"); + assert!( + r < 0.5, + "F4 mojibake fixture 의 valid_ratio < 0.5 (production OCR trigger threshold — got {r})" + ); } #[test] @@ -163,15 +184,18 @@ mod tests { // With dominance heuristic: ratio ≤ 0.3 (triggers OCR fallback). 
let s = format!("Page 1 of 5 {}", "?Identity-H Unimplemented?".repeat(20)); let r = compute_valid_char_ratio(&s); - assert!(r <= 0.3, "marker-dominant mixed page → ratio ≤ 0.3 (OCR fallback); got {r}"); + assert!( + r <= 0.3, + "marker-dominant mixed page → ratio ≤ 0.3 (OCR fallback); got {r}" + ); } #[test] fn identity_h_marker_minority_with_long_valid_text_keeps_high_ratio() { // Inverse case: short marker noise + long valid text → ratio stays high // (no false OCR trigger on otherwise-good pages). - let header = "x".repeat(200); // 200 char valid ASCII - let s = format!("{header} ?Identity-H Unimplemented?"); // 1× marker = 26 char + let header = "x".repeat(200); // 200 char valid ASCII + let s = format!("{header} ?Identity-H Unimplemented?"); // 1× marker = 26 char let r = compute_valid_char_ratio(&s); assert!(r > 0.9, "marker-minority page keeps high ratio; got {r}"); } diff --git a/crates/kebab-parse-pdf/tests/common/mod.rs b/crates/kebab-parse-pdf/tests/common/mod.rs index f1dd7e6..6b18baa 100644 --- a/crates/kebab-parse-pdf/tests/common/mod.rs +++ b/crates/kebab-parse-pdf/tests/common/mod.rs @@ -56,17 +56,13 @@ pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec< operations: vec![ Operation::new("BT", vec![]), Operation::new("Tf", vec!["F1".into(), 24.into()]), - Operation::new( - "Td", - vec![Object::Integer(100), Object::Integer(700)], - ), + Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]), Operation::new("Tj", vec![Object::string_literal(*text)]), Operation::new("ET", vec![]), ], }; let stream_data = content.encode().expect("content encode"); - let content_id = - doc.add_object(Stream::new(dictionary! {}, stream_data)); + let content_id = doc.add_object(Stream::new(dictionary! 
{}, stream_data)); page_dict.set("Contents", content_id); } let page_id = doc.add_object(page_dict); @@ -86,8 +82,7 @@ pub fn build_text_pdf_with_info(pages: &[Option<&str>], info: &InfoDict) -> Vec< Object::Integer(842), ], }; - doc.objects - .insert(pages_id, Object::Dictionary(pages_dict)); + doc.objects.insert(pages_id, Object::Dictionary(pages_dict)); let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", diff --git a/crates/kebab-parse-pdf/tests/extractor.rs b/crates/kebab-parse-pdf/tests/extractor.rs index 32ff352..755039b 100644 --- a/crates/kebab-parse-pdf/tests/extractor.rs +++ b/crates/kebab-parse-pdf/tests/extractor.rs @@ -36,7 +36,10 @@ fn three_page_pdf_emits_one_paragraph_block_per_page() { assert_eq!(doc.title, "three"); assert_eq!(doc.lang.0, "und"); assert_eq!(doc.parser_version.0, kebab_parse_pdf::PARSER_VERSION); - assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(3.into())); + assert_eq!( + doc.metadata.user["pdf"]["page_count"], + Value::Number(3.into()) + ); let blocks = paragraph_blocks(&doc); assert_eq!(blocks.len(), 3); @@ -149,7 +152,10 @@ fn page_count_matches_actual_count() { .extract(&fx.ctx(), &bytes) .expect("5-page extraction must succeed"); - assert_eq!(doc.metadata.user["pdf"]["page_count"], Value::Number(5.into())); + assert_eq!( + doc.metadata.user["pdf"]["page_count"], + Value::Number(5.into()) + ); assert_eq!(doc.blocks.len(), 5); } @@ -277,7 +283,10 @@ fn snapshot_three_page_canonical_document_stable() { Value::Number(((i as u64) + 1).into()) ); } - assert_eq!(json["metadata"]["source_type"], Value::String("paper".into())); + assert_eq!( + json["metadata"]["source_type"], + Value::String("paper".into()) + ); assert_eq!( json["metadata"]["trust_level"], Value::String("primary".into()) diff --git a/crates/kebab-parse-pdf/tests/page_image.rs b/crates/kebab-parse-pdf/tests/page_image.rs index b1a16d4..091be7c 100644 --- a/crates/kebab-parse-pdf/tests/page_image.rs +++ 
b/crates/kebab-parse-pdf/tests/page_image.rs @@ -1,7 +1,7 @@ // crates/kebab-parse-pdf/tests/page_image.rs (신규) -use lopdf::Document; use kebab_parse_pdf::extract_dctdecode_page_image; +use lopdf::Document; // happy path — F1 fixture (DCTDecode JPEG passthrough) #[test] @@ -11,7 +11,11 @@ fn f1_fixture_yields_dctdecode_jpeg_bytes() { let result = extract_dctdecode_page_image(&doc, 1).unwrap(); let jpeg = result.expect("F1 의 page 1 이 DCTDecode image 보유"); assert!(jpeg.starts_with(b"\xFF\xD8"), "JPEG magic missing"); - assert!(jpeg.len() > 1000, "JPEG bytes too small (got {})", jpeg.len()); + assert!( + jpeg.len() > 1000, + "JPEG bytes too small (got {})", + jpeg.len() + ); } // negative path — F6 fixture (FlateDecode raw pixel — Ok(None)) @@ -20,5 +24,8 @@ fn flate_raw_fixture_yields_none() { let bytes = include_bytes!("fixtures/flate_raw.pdf"); let doc = Document::load_mem(bytes).unwrap(); let result = extract_dctdecode_page_image(&doc, 1).unwrap(); - assert!(result.is_none(), "FlateDecode page 가 Ok(None) 반환 — DCTDecode-only v1 invariant"); + assert!( + result.is_none(), + "FlateDecode page 가 Ok(None) 반환 — DCTDecode-only v1 invariant" + ); } diff --git a/crates/kebab-rag/src/lib.rs b/crates/kebab-rag/src/lib.rs index 33122ca..bfc04a3 100644 --- a/crates/kebab-rag/src/lib.rs +++ b/crates/kebab-rag/src/lib.rs @@ -23,7 +23,6 @@ pub use kebab_core::{Answer, AnswerCitation, AnswerRetrievalSummary, RefusalReas mod pipeline; pub use pipeline::{ - AskOpts, MAX_NLI_HYPOTHESIS_CHARS_INITIAL, MAX_NLI_HYPOTHESIS_CHARS_MIN, - MAX_NLI_PREMISE_CHARS, RagPipeline, StreamEvent, truncate_for_nli, - truncate_hypothesis_for_nli_with_budget, + AskOpts, MAX_NLI_HYPOTHESIS_CHARS_INITIAL, MAX_NLI_HYPOTHESIS_CHARS_MIN, MAX_NLI_PREMISE_CHARS, + RagPipeline, StreamEvent, truncate_for_nli, truncate_hypothesis_for_nli_with_budget, }; diff --git a/crates/kebab-rag/src/pipeline.rs b/crates/kebab-rag/src/pipeline.rs index cf9a028..18c1ef0 100644 --- a/crates/kebab-rag/src/pipeline.rs +++ 
b/crates/kebab-rag/src/pipeline.rs @@ -38,13 +38,12 @@ use std::sync::Arc; use anyhow::{Context, Result}; -use kebab_core::{ - Answer, AnswerCitation, AnswerRetrievalSummary, Citation, FinishReason, - GenerateRequest, HopKind, HopRecord, LanguageModel, ModelRef, RefusalReason, - Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, TokenChunk, - TokenUsage, TraceId, Turn, VerificationSummary, -}; use kebab_core::versions::PromptTemplateVersion; +use kebab_core::{ + Answer, AnswerCitation, AnswerRetrievalSummary, Citation, FinishReason, GenerateRequest, + HopKind, HopRecord, LanguageModel, ModelRef, RefusalReason, Retriever, SearchFilters, + SearchHit, SearchMode, SearchQuery, TokenChunk, TokenUsage, TraceId, Turn, VerificationSummary, +}; use kebab_store_sqlite::SqliteStore; use regex::Regex; use std::sync::OnceLock; @@ -313,9 +312,7 @@ impl RagPipeline { // here — if the caller already dropped the receiver we just // skip and let the LLM-loop SendError handle it consistently. if let Some(sink) = &opts.stream_sink { - let _ = sink.send(StreamEvent::RetrievalDone { - hits: hits.clone(), - }); + let _ = sink.send(StreamEvent::RetrievalDone { hits: hits.clone() }); } let chunks_returned = u32::try_from(hits.len()).unwrap_or(u32::MAX); let top_score = hits.first().map_or(0.0, |h| h.retrieval.fusion_score); @@ -356,8 +353,7 @@ impl RagPipeline { } // ── 4. Render prompt ─────────────────────────────────────────────── - let system = system_prompt_for(&self.config.rag.prompt_template_version)? - .to_string(); + let system = system_prompt_for(&self.config.rag.prompt_template_version)?.to_string(); // p9-fb-15: prepend `[이전 대화]` block when history is // present. 
`serialize_history` enforces the spec §3.8 // priority — system+question stay untouched, retrieved @@ -373,9 +369,7 @@ impl RagPipeline { let user = if history_block.is_empty() { format!("[질문]\n{query}\n\n[근거]\n{packed_text}") } else { - format!( - "{history_block}\n\n[질문]\n{query}\n\n[근거]\n{packed_text}" - ) + format!("{history_block}\n\n[질문]\n{query}\n\n[근거]\n{packed_text}") }; // ── 5. Generate ──────────────────────────────────────────────────── @@ -472,14 +466,12 @@ impl RagPipeline { // observable in tracing so operators can distinguish "model // said `근거가 부족`" from "model produced unmarked/unknown // text" in logs without recomputing the regex downstream. - let refusal_phrase = REFUSAL_PHRASE.get_or_init(|| { - Regex::new(r"근거(가|이)\s*부족").expect("static regex compiles") - }); + let refusal_phrase = REFUSAL_PHRASE + .get_or_init(|| Regex::new(r"근거(가|이)\s*부족").expect("static regex compiles")); let trimmed_answer = acc.trim(); let matched_refusal_phrase = refusal_phrase.is_match(&acc); - let grounded_unaware = !trimmed_answer.is_empty() - && unknown_markers.is_empty() - && !extracted.is_empty(); + let grounded_unaware = + !trimmed_answer.is_empty() && unknown_markers.is_empty() && !extracted.is_empty(); // p9-fb-33: cancel takes priority over LlmSelfJudge — the // caller bailed mid-stream, so the recorded reason should // reflect that, not "model didn't cite". @@ -580,9 +572,7 @@ impl RagPipeline { // p9-fb-33: emit final on the success path. On cancel we // skip Final — the receiver is gone and persistence still // records the partial answer below. 
- if !cancelled - && let Some(sink) = &opts.stream_sink - { + if !cancelled && let Some(sink) = &opts.stream_sink { let _ = sink.send(StreamEvent::Final { answer: answer.clone(), }); @@ -605,8 +595,9 @@ impl RagPipeline { } else { None }; - if let Err(e) = - self.docs.put_answer(&answer, query, packed_chunks_json.as_deref()) + if let Err(e) = self + .docs + .put_answer(&answer, query, packed_chunks_json.as_deref()) { tracing::warn!( target: "kebab-rag", @@ -689,14 +680,7 @@ impl RagPipeline { return self.refuse_no_chunks(query, &opts, k_effective, started, None); } if probe_hits[0].retrieval.fusion_score < self.config.rag.score_gate { - return self.refuse_score_gate( - query, - &opts, - &probe_hits, - k_effective, - started, - None, - ); + return self.refuse_score_gate(query, &opts, &probe_hits, k_effective, started, None); } // probe_hits are inspected for the gate decision only — the @@ -774,8 +758,7 @@ impl RagPipeline { break; } } - let chunks_added = - u32::try_from(pool.len() - pool_before).unwrap_or(u32::MAX); + let chunks_added = u32::try_from(pool.len() - pool_before).unwrap_or(u32::MAX); // Two caps that bypass the decide LLM call: hitting // `max_depth` (this iter is the last) and `max_pool_chunks` @@ -785,43 +768,38 @@ impl RagPipeline { let forced_stop = depth_force_stop || pool_cap_hit; // Decide LLM call (skip when forced_stop OR pool empty). - let (new_sub_queries, decide_ms): (Vec, u32) = - if forced_stop || pool.is_empty() { - (Vec::new(), 0) - } else { - // Snippet-based preview: each pool entry contributes - // its `SearchHit.snippet` (already truncated upstream - // by the retriever). `max_pool_chunks` acts as the - // implicit cap on this string's length — the loop - // breaks before we accumulate more pool entries. 
- // We intentionally do NOT route this through - // `pack_context` (no full chunk text fetch, no - // marker numbering): decide only needs gist to - // judge sufficiency, and full text is reserved for - // the terminal synthesize call. - let preview = pool - .iter() - .enumerate() - .map(|(i, h)| format!("[{}] {}", i + 1, h.snippet)) - .collect::>() - .join("\n\n"); - let depth_remaining = max_depth - iter; - let (decide_result, ms) = self.multi_hop_decide( - query, - &preview, - pool.len(), - depth_remaining, - &opts, - )?; - // `parse_decompose_response` post-condition: when - // it returns `Some(qs)`, `qs` is guaranteed - // non-empty (and trimmed + hard-capped). `None` - // covers both "parse failure" and "empty array - // after trim" — both mean stop. Parse failure is - // NOT a refusal here (spec §9 — graceful degrade - // to early synthesize on the decide hop only). - (decide_result.unwrap_or_default(), ms) - }; + let (new_sub_queries, decide_ms): (Vec, u32) = if forced_stop || pool.is_empty() + { + (Vec::new(), 0) + } else { + // Snippet-based preview: each pool entry contributes + // its `SearchHit.snippet` (already truncated upstream + // by the retriever). `max_pool_chunks` acts as the + // implicit cap on this string's length — the loop + // breaks before we accumulate more pool entries. + // We intentionally do NOT route this through + // `pack_context` (no full chunk text fetch, no + // marker numbering): decide only needs gist to + // judge sufficiency, and full text is reserved for + // the terminal synthesize call. + let preview = pool + .iter() + .enumerate() + .map(|(i, h)| format!("[{}] {}", i + 1, h.snippet)) + .collect::>() + .join("\n\n"); + let depth_remaining = max_depth - iter; + let (decide_result, ms) = + self.multi_hop_decide(query, &preview, pool.len(), depth_remaining, &opts)?; + // `parse_decompose_response` post-condition: when + // it returns `Some(qs)`, `qs` is guaranteed + // non-empty (and trimmed + hard-capped). 
`None` + // covers both "parse failure" and "empty array + // after trim" — both mean stop. Parse failure is + // NOT a refusal here (spec §9 — graceful degrade + // to early synthesize on the decide hop only). + (decide_result.unwrap_or_default(), ms) + }; hops.push(HopRecord { iter, @@ -851,9 +829,7 @@ impl RagPipeline { // is ready. The downstream synthesize call still uses // `stream_sink` for token streaming if set. if let Some(sink) = &opts.stream_sink { - let _ = sink.send(StreamEvent::RetrievalDone { - hits: pool.clone(), - }); + let _ = sink.send(StreamEvent::RetrievalDone { hits: pool.clone() }); } let chunks_returned = u32::try_from(pool.len()).unwrap_or(u32::MAX); let top_score = pool.first().map_or(0.0, |h| h.retrieval.fusion_score); @@ -863,23 +839,10 @@ impl RagPipeline { // a `--multi-hop` user can still see which decompose / decide // signals fired before the score-gate / no-chunks bailout. if pool.is_empty() { - return self.refuse_no_chunks( - query, - &opts, - k_effective, - started, - Some(hops), - ); + return self.refuse_no_chunks(query, &opts, k_effective, started, Some(hops)); } if top_score < self.config.rag.score_gate { - return self.refuse_score_gate( - query, - &opts, - &pool, - k_effective, - started, - Some(hops), - ); + return self.refuse_score_gate(query, &opts, &pool, k_effective, started, Some(hops)); } // ── 4. Pack context ──────────────────────────────────────────────── @@ -891,13 +854,7 @@ impl RagPipeline { pool_size = pool.len(), "kb-rag: multi-hop pool chunks all unfetchable; falling back to NoChunks" ); - return self.refuse_no_chunks( - query, - &opts, - k_effective, - started, - Some(hops), - ); + return self.refuse_no_chunks(query, &opts, k_effective, started, Some(hops)); } // ── 5. 
Synthesize prompt ─────────────────────────────────────────── @@ -1008,14 +965,12 @@ impl RagPipeline { .filter(|n| !valid_markers.contains(n)) .collect(); - let refusal_phrase = REFUSAL_PHRASE.get_or_init(|| { - Regex::new(r"근거(가|이)\s*부족").expect("static regex compiles") - }); + let refusal_phrase = REFUSAL_PHRASE + .get_or_init(|| Regex::new(r"근거(가|이)\s*부족").expect("static regex compiles")); let trimmed_answer = acc.trim(); let matched_refusal_phrase = refusal_phrase.is_match(&acc); - let grounded_unaware = !trimmed_answer.is_empty() - && unknown_markers.is_empty() - && !extracted.is_empty(); + let grounded_unaware = + !trimmed_answer.is_empty() && unknown_markers.is_empty() && !extracted.is_empty(); let (grounded, refusal_reason) = if matches!(finish_reason, FinishReason::Cancelled) { (false, Some(RefusalReason::LlmStreamAborted)) } else if grounded_unaware { @@ -1125,8 +1080,7 @@ impl RagPipeline { // p9-fb-41 PR-3b: append the terminal Synthesize HopRecord // before building the Answer. `iter` is the position in the // hops vector (0=decompose, 1..N=decide, N+1=synthesize). 
- let synth_ms = - u32::try_from(synthesize_started.elapsed().as_millis()).unwrap_or(u32::MAX); + let synth_ms = u32::try_from(synthesize_started.elapsed().as_millis()).unwrap_or(u32::MAX); hops.push(HopRecord { iter: u32::try_from(hops.len()).unwrap_or(u32::MAX), kind: HopKind::Synthesize, @@ -1182,9 +1136,7 @@ impl RagPipeline { "kb-rag: multi-hop ask done" ); - if !cancelled - && let Some(sink) = &opts.stream_sink - { + if !cancelled && let Some(sink) = &opts.stream_sink { let _ = sink.send(StreamEvent::Final { answer: answer.clone(), }); @@ -1205,7 +1157,10 @@ impl RagPipeline { } else { None }; - if let Err(e) = self.docs.put_answer(&answer, query, packed_chunks_json.as_deref()) { + if let Err(e) = self + .docs + .put_answer(&answer, query, packed_chunks_json.as_deref()) + { tracing::warn!( target: "kebab-rag", error = %e, @@ -1403,8 +1358,7 @@ impl RagPipeline { fn pack_context(&self, query: &str, hits: &[SearchHit]) -> Result { // Hard ceiling for the packed-context section in tokens (≈ chars / 4). let cap = self.config.rag.max_context_tokens; - let system_prompt_text = - system_prompt_for(&self.config.rag.prompt_template_version)?; + let system_prompt_text = system_prompt_for(&self.config.rag.prompt_template_version)?; let prompt_overhead_tokens = est_tokens(system_prompt_text) + est_tokens(query) + 64; let budget_tokens = cap.saturating_sub(prompt_overhead_tokens); @@ -1417,7 +1371,9 @@ impl RagPipeline { let chunk_full = ::get_chunk(&self.docs, &hit.chunk_id) .context("kb-rag: docs.get_chunk")?; - let chunk_text = if let Some(c) = chunk_full { c.text } else { + let chunk_text = if let Some(c) = chunk_full { + c.text + } else { tracing::warn!( target: "kebab-rag", chunk_id = %hit.chunk_id.0, @@ -1543,9 +1499,7 @@ impl RagPipeline { let gate = self.config.rag.score_gate; let mut text = String::new(); text.push_str("근거 부족. 
KB에 해당 내용 없음.\n"); - text.push_str(&format!( - "가까운 후보 (모두 임계 {gate:.2} 미만):\n" - )); + text.push_str(&format!("가까운 후보 (모두 임계 {gate:.2} 미만):\n")); let preview: Vec<&SearchHit> = hits.iter().take(3).collect(); for h in &preview { text.push_str(&format!( @@ -1772,11 +1726,7 @@ fn embedding_ref_for(mode: SearchMode, cfg: &kebab_config::Config) -> Option threshold_days * 24h`, /// strict `>` so exactly-threshold hits stay fresh, and /// `threshold_days = 0` short-circuits to `false` (feature off). -fn compute_stale( - indexed_at: OffsetDateTime, - now: OffsetDateTime, - threshold_days: u32, -) -> bool { +fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool { if threshold_days == 0 { return false; } @@ -1934,9 +1884,9 @@ fn system_prompt_for(version: &str) -> anyhow::Result<&'static str> { match version { "rag-v1" => Ok(SYSTEM_PROMPT_RAG_V1), "rag-v2" => Ok(SYSTEM_PROMPT_RAG_V2), - other => anyhow::bail!( - "unknown prompt_template_version: {other:?} (expected rag-v1 or rag-v2)" - ), + other => { + anyhow::bail!("unknown prompt_template_version: {other:?} (expected rag-v1 or rag-v2)") + } } } @@ -2031,8 +1981,8 @@ static MARKER_REGEX: OnceLock = OnceLock::new(); static REFUSAL_PHRASE: OnceLock = OnceLock::new(); fn extract_markers(s: &str) -> Vec { - let re = MARKER_REGEX - .get_or_init(|| Regex::new(r"\[#(\d{1,3})\]").expect("static regex compiles")); + let re = + MARKER_REGEX.get_or_init(|| Regex::new(r"\[#(\d{1,3})\]").expect("static regex compiles")); re.captures_iter(s) .filter_map(|c| c.get(1).and_then(|m| m.as_str().parse::().ok())) .collect() @@ -2272,10 +2222,7 @@ mod tests { let h = vec![fake_turn("Q1", "first answer body")]; let expanded = expand_query_with_history("follow-up", &h); assert!(expanded.starts_with("follow-up "), "got: {expanded}"); - assert!( - expanded.contains("first answer body"), - "got: {expanded}" - ); + assert!(expanded.contains("first answer body"), "got: {expanded}"); } #[test] @@ -2436,13 
+2383,12 @@ mod compute_stale_mirror_tests { #[cfg(test)] mod stream_event_serde_tests { use super::*; - use kebab_core::{ - AnswerRetrievalSummary, ChunkId, ChunkerVersion, Citation, - DocumentId, IndexVersion, ModelRef, RetrievalDetail, SearchHit, SearchMode, - TokenUsage, TraceId, - }; use kebab_core::asset::WorkspacePath; use kebab_core::versions::PromptTemplateVersion; + use kebab_core::{ + AnswerRetrievalSummary, ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, + ModelRef, RetrievalDetail, SearchHit, SearchMode, TokenUsage, TraceId, + }; use time::macros::datetime; fn mk_hit() -> SearchHit { @@ -2481,7 +2427,10 @@ mod stream_event_serde_tests { #[test] fn stream_event_token_serializes_with_kind_discriminator() { - let ev = StreamEvent::Token { delta: "안녕".into(), turn_index: Some(0) }; + let ev = StreamEvent::Token { + delta: "안녕".into(), + turn_index: Some(0), + }; let v = serde_json::to_value(&ev).unwrap(); assert_eq!(v["kind"], "token"); assert_eq!(v["delta"], "안녕"); @@ -2490,7 +2439,9 @@ mod stream_event_serde_tests { #[test] fn stream_event_retrieval_done_serializes_hits() { - let ev = StreamEvent::RetrievalDone { hits: vec![mk_hit()] }; + let ev = StreamEvent::RetrievalDone { + hits: vec![mk_hit()], + }; let v = serde_json::to_value(&ev).unwrap(); assert_eq!(v["kind"], "retrieval_done"); assert_eq!(v["hits"].as_array().unwrap().len(), 1); @@ -2503,16 +2454,27 @@ mod stream_event_serde_tests { citations: vec![], grounded: true, refusal_reason: None, - model: ModelRef { id: "m".into(), provider: "p".into(), dimensions: None }, + model: ModelRef { + id: "m".into(), + provider: "p".into(), + dimensions: None, + }, embedding: None, prompt_template_version: PromptTemplateVersion("rag-v2".into()), retrieval: AnswerRetrievalSummary { trace_id: TraceId("t".into()), mode: SearchMode::Hybrid, - k: 10, score_gate: 0.3, top_score: 0.5, - chunks_returned: 1, chunks_used: 1, + k: 10, + score_gate: 0.3, + top_score: 0.5, + chunks_returned: 1, + chunks_used: 
1, + }, + usage: TokenUsage { + prompt_tokens: 0, + completion_tokens: 0, + latency_ms: 0, }, - usage: TokenUsage { prompt_tokens: 0, completion_tokens: 0, latency_ms: 0 }, created_at: datetime!(2026-05-09 12:00:00 UTC), conversation_id: None, turn_index: None, diff --git a/crates/kebab-rag/tests/common/mod.rs b/crates/kebab-rag/tests/common/mod.rs index ed7bee0..49813f5 100644 --- a/crates/kebab-rag/tests/common/mod.rs +++ b/crates/kebab-rag/tests/common/mod.rs @@ -16,8 +16,8 @@ use std::sync::{Arc, Mutex}; use kebab_config::Config; use kebab_core::{ - ChunkerVersion, ChunkId, Citation, DocumentId, IndexVersion, RetrievalDetail, - Retriever, SearchHit, SearchMode, SearchQuery, WorkspacePath, + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, Retriever, + SearchHit, SearchMode, SearchQuery, WorkspacePath, }; use kebab_nli::{NliScores, NliVerifier}; use kebab_store_sqlite::SqliteStore; @@ -147,7 +147,10 @@ pub fn mk_hit_with_indexed_at( chunk_id: ChunkId(chunk_id.to_string()), doc_id: DocumentId(doc_id.to_string()), doc_path: p.clone(), - heading_path: heading.iter().map(std::string::ToString::to_string).collect(), + heading_path: heading + .iter() + .map(std::string::ToString::to_string) + .collect(), section_label: None, snippet: "snippet".to_string(), citation: Citation::Line { @@ -238,9 +241,7 @@ impl ScriptedRetriever { impl Retriever for ScriptedRetriever { fn search(&self, _q: &SearchQuery) -> anyhow::Result> { - let idx = self - .next - .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let idx = self.next.fetch_add(1, std::sync::atomic::Ordering::SeqCst); Ok(self.hits_per_call.get(idx).cloned().unwrap_or_default()) } fn index_version(&self) -> IndexVersion { @@ -356,12 +357,9 @@ impl kebab_core::LanguageModel for ScriptedLm { fn generate_stream( &self, req: kebab_core::GenerateRequest, - ) -> anyhow::Result< - Box> + Send>, - > { - let idx = self - .next - .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + ) -> 
anyhow::Result> + Send>> + { + let idx = self.next.fetch_add(1, std::sync::atomic::Ordering::SeqCst); let canned = self.responses.get(idx).unwrap_or_else(|| { panic!( "ScriptedLm exhausted: call #{idx} requested but only {} responses scripted", @@ -495,8 +493,14 @@ impl SpyNliVerifier { impl NliVerifier for SpyNliVerifier { fn score(&self, premise: &str, hypothesis: &str) -> anyhow::Result { - self.received_premises.lock().unwrap().push(premise.to_string()); - self.received_hypotheses.lock().unwrap().push(hypothesis.to_string()); + self.received_premises + .lock() + .unwrap() + .push(premise.to_string()); + self.received_hypotheses + .lock() + .unwrap() + .push(hypothesis.to_string()); (self.score_fn)(premise, hypothesis) } diff --git a/crates/kebab-rag/tests/multi_hop.rs b/crates/kebab-rag/tests/multi_hop.rs index 71678df..4ee3c0b 100644 --- a/crates/kebab-rag/tests/multi_hop.rs +++ b/crates/kebab-rag/tests/multi_hop.rs @@ -73,8 +73,12 @@ fn multi_hop_decide_stop_triggers_synthesize() { ])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -136,8 +140,12 @@ fn multi_hop_decide_continue_adds_more_chunks() { ])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -159,7 +167,11 @@ fn multi_hop_decide_continue_adds_more_chunks() { ); let hops = answer.hops.expect("happy path stamps hops"); - assert_eq!(hops.len(), 4, "[Decompose, Decide(continue), Decide(stop), Synthesize]"); + assert_eq!( + hops.len(), + 4, + 
"[Decompose, Decide(continue), Decide(stop), Synthesize]" + ); assert_eq!(hops[0].kind, HopKind::Decompose); assert_eq!(hops[1].kind, HopKind::Decide); assert_eq!(hops[1].sub_queries, vec!["q2"], "iter 1 decide emits q2"); @@ -199,10 +211,7 @@ fn multi_hop_max_depth_force_stops() { // Only 2 LLM calls scripted — decompose + synthesize. If the // pipeline tries to call decide (a bug), ScriptedLm panics on // exhaustion and the test fails loudly with the call index. - let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, - "answer [#1]", - ])); + let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, "answer [#1]"])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; let pipeline = RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()); @@ -218,7 +227,11 @@ fn multi_hop_max_depth_force_stops() { assert_eq!(retriever_handle.calls(), 2, "probe + 1 decompose retrieve"); let hops = answer.hops.expect("happy path stamps hops"); - assert_eq!(hops.len(), 3, "[Decompose, Decide(forced_stop), Synthesize]"); + assert_eq!( + hops.len(), + 3, + "[Decompose, Decide(forced_stop), Synthesize]" + ); assert_eq!(hops[1].kind, HopKind::Decide); assert!( hops[1].forced_stop, @@ -260,8 +273,12 @@ fn multi_hop_pool_chunks_dedup_by_chunk_id() { ])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("q", multi_hop_opts()).unwrap(); @@ -277,11 +294,7 @@ fn multi_hop_pool_chunks_dedup_by_chunk_id() { ); assert_eq!(answer.citations.len(), 1, "only one chunk cited as [#1]"); assert_eq!(answer.citations[0].marker.as_deref(), Some("[1]")); - assert_eq!( - lm_handle.calls(), - 3, - "decompose + decide + synthesize = 3" - ); + assert_eq!(lm_handle.calls(), 3, "decompose + decide + synthesize = 3"); let hops = answer.hops.expect("happy path stamps hops"); 
assert_eq!(hops.len(), 3, "[Decompose, Decide, Synthesize]"); @@ -316,8 +329,12 @@ fn multi_hop_decide_parse_failure_falls_through_to_synthesize() { ])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("q", multi_hop_opts()).unwrap(); @@ -337,7 +354,11 @@ fn multi_hop_decide_parse_failure_falls_through_to_synthesize() { ); let hops = answer.hops.expect("happy path stamps hops"); - assert_eq!(hops.len(), 3, "[Decompose, Decide(parse-fail→stop), Synthesize]"); + assert_eq!( + hops.len(), + 3, + "[Decompose, Decide(parse-fail→stop), Synthesize]" + ); assert_eq!(hops[1].kind, HopKind::Decide); assert!( hops[1].sub_queries.is_empty(), @@ -383,8 +404,12 @@ fn multi_hop_refuse_no_chunks_preserves_hops_trace() { let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("q", multi_hop_opts()).unwrap(); @@ -395,7 +420,11 @@ fn multi_hop_refuse_no_chunks_preserves_hops_trace() { 2, "probe (passes) + 1 decompose-driven retrieve (empty)" ); - assert_eq!(lm_handle.calls(), 1, "decompose only — decide skipped (empty pool), no synthesize"); + assert_eq!( + lm_handle.calls(), + 1, + "decompose only — decide skipped (empty pool), no synthesize" + ); let hops = answer .hops @@ -433,23 +462,44 @@ fn multi_hop_refuse_score_gate_preserves_hops_trace() { let (low_cid, low_did) = seed_low_score_chunk(&env); let high_cid = id32("c_high"); let high_did = id32("d_high"); - env.seed_chunk(&high_cid, &high_did, "notes/high.md", "high score body", &["High"]); + env.seed_chunk( + 
&high_cid, + &high_did, + "notes/high.md", + "high score body", + &["High"], + ); - let probe_hits = vec![mk_hit(1, &high_cid, &high_did, "notes/high.md", 0.85, &["High"])]; - let decompose_hits = vec![mk_hit(1, &low_cid, &low_did, "notes/low.md", 0.10, &["Low"])]; + let probe_hits = vec![mk_hit( + 1, + &high_cid, + &high_did, + "notes/high.md", + 0.85, + &["High"], + )]; + let decompose_hits = vec![mk_hit( + 1, + &low_cid, + &low_did, + "notes/low.md", + 0.10, + &["Low"], + )]; let retriever = Arc::new(ScriptedRetriever::new(vec![probe_hits, decompose_hits])); let retriever_dyn: Arc = retriever; // decompose + decide (pool not empty so decide fires) — synthesize // never runs because we refuse before pack_context. - let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, - r"[]", - ])); + let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, r"[]"])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("q", multi_hop_opts()).unwrap(); @@ -517,10 +567,16 @@ fn multi_hop_below_probe_gate_refuses_before_any_llm_call() { let lm = Arc::new(ScriptedLm::new(vec![])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); - let answer = pipeline.ask("out-of-corpus query", multi_hop_opts()).unwrap(); + let answer = pipeline + .ask("out-of-corpus query", multi_hop_opts()) + .unwrap(); assert!(!answer.grounded); assert_eq!(answer.refusal_reason, Some(RefusalReason::ScoreGate)); @@ -554,8 +610,12 @@ fn multi_hop_empty_probe_pool_refuses_before_any_llm_call() { let lm = Arc::new(ScriptedLm::new(vec![])); let lm_handle = lm.clone(); let lm_dyn: Arc 
= lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("q", multi_hop_opts()).unwrap(); @@ -592,15 +652,15 @@ fn multi_hop_above_probe_gate_proceeds_to_decompose() { let retriever_handle = retriever.clone(); let retriever_dyn: Arc = retriever; - let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, - r"[]", - "answer [#1]", - ])); + let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, r"[]", "answer [#1]"])); let lm_handle = lm.clone(); let lm_dyn: Arc = lm; - let pipeline = - RagPipeline::new(env.config.clone(), retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new( + env.config.clone(), + retriever_dyn, + lm_dyn, + env.sqlite.clone(), + ); let answer = pipeline.ask("valid query", multi_hop_opts()).unwrap(); @@ -666,9 +726,8 @@ fn multi_hop_nli_pass_keeps_grounded() { let verifier = MockNliVerifier::pass(); let verifier_handle = verifier.clone(); let verifier_dyn: Arc = verifier; - let pipeline = - RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) - .with_verifier(verifier_dyn); + let pipeline = RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) + .with_verifier(verifier_dyn); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -698,9 +757,8 @@ fn multi_hop_nli_fail_refuses() { let verifier = MockNliVerifier::fail(); let verifier_handle = verifier.clone(); let verifier_dyn: Arc = verifier; - let pipeline = - RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) - .with_verifier(verifier_dyn); + let pipeline = RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) + .with_verifier(verifier_dyn); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -732,8 +790,7 @@ fn multi_hop_nli_disabled_skip_verify() { let retriever_dyn: Arc = retriever; let lm_dyn: Arc = lm; // No 
`with_verifier` call — pipeline.verifier stays None. - let pipeline = - RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()); + let pipeline = RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -756,9 +813,8 @@ fn multi_hop_nli_model_unavailable_refuses() { let verifier = MockNliVerifier::err(); let verifier_handle = verifier.clone(); let verifier_dyn: Arc = verifier; - let pipeline = - RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) - .with_verifier(verifier_dyn); + let pipeline = RagPipeline::new(cfg, retriever_dyn, lm_dyn, env.sqlite.clone()) + .with_verifier(verifier_dyn); let answer = pipeline.ask("compound", multi_hop_opts()).unwrap(); @@ -767,7 +823,11 @@ fn multi_hop_nli_model_unavailable_refuses() { answer.refusal_reason, Some(RefusalReason::NliModelUnavailable) ); - assert_eq!(verifier_handle.calls(), 1, "verifier was invoked once before failing"); + assert_eq!( + verifier_handle.calls(), + 1, + "verifier was invoked once before failing" + ); assert!( answer.verification.is_none(), "NliModelUnavailable: can't summarize a verification that didn't happen" diff --git a/crates/kebab-rag/tests/multi_hop_nli_panic.rs b/crates/kebab-rag/tests/multi_hop_nli_panic.rs index 3bf621c..b3586b0 100644 --- a/crates/kebab-rag/tests/multi_hop_nli_panic.rs +++ b/crates/kebab-rag/tests/multi_hop_nli_panic.rs @@ -58,8 +58,8 @@ fn setup_happy_pipeline_no_verifier(nli_threshold: f32) -> (RagPipeline, RagEnv) // Three LLM calls: decompose → decide (stop) → synthesize. // Synthesize emits a non-empty answer so step 8.5 is reached. 
let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, // decompose - r"[]", // decide: stop signal + r#"["q1"]"#, // decompose + r"[]", // decide: stop signal "answer body [#1]", // synthesize: non-empty → step 8.5 entered ])); let lm_dyn: Arc = lm; diff --git a/crates/kebab-rag/tests/multi_hop_nli_truncate.rs b/crates/kebab-rag/tests/multi_hop_nli_truncate.rs index aa86e38..07ae839 100644 --- a/crates/kebab-rag/tests/multi_hop_nli_truncate.rs +++ b/crates/kebab-rag/tests/multi_hop_nli_truncate.rs @@ -70,11 +70,7 @@ fn long_en_synth_answer_truncated_before_nli_call() { // Synthesize answer ~6000 chars (lorem ipsum, 12 chars × 500 reps), // citation marker appended after so `grounded_unaware` 통과. let long_answer = format!("{} [#1]", "lorem ipsum ".repeat(500)); - let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, - r"[]", - &long_answer, - ])); + let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, r"[]", &long_answer])); let lm_dyn: Arc = lm; let verifier = SpyNliVerifier::new( @@ -140,11 +136,7 @@ fn long_kr_synth_answer_retries_with_smaller_budget() { // ~2500-char KR-sim answer (한국어 6 chars × 416 reps ≈ 2496 chars) // + citation marker. 
let kr_long_answer = format!("{} [#1]", "한국어 본문 ".repeat(416)); - let lm = Arc::new(ScriptedLm::new(vec![ - r#"["q1"]"#, - r"[]", - &kr_long_answer, - ])); + let lm = Arc::new(ScriptedLm::new(vec![r#"["q1"]"#, r"[]", &kr_long_answer])); let lm_dyn: Arc = lm; let token_count_call_count = Arc::new(Mutex::new(0_usize)); diff --git a/crates/kebab-rag/tests/pipeline.rs b/crates/kebab-rag/tests/pipeline.rs index ae0016c..4e335b0 100644 --- a/crates/kebab-rag/tests/pipeline.rs +++ b/crates/kebab-rag/tests/pipeline.rs @@ -10,9 +10,7 @@ use std::sync::Arc; use std::sync::atomic::Ordering; use common::{MockRetriever, RagEnv, id32, mk_hit, mk_hit_with_indexed_at}; -use kebab_core::{ - FinishReason, LanguageModel, Retriever, SearchMode, TokenChunk, TokenUsage, -}; +use kebab_core::{FinishReason, LanguageModel, Retriever, SearchMode, TokenChunk, TokenUsage}; use kebab_llm::MockLanguageModel; use kebab_rag::{AskOpts, RagPipeline, RefusalReason, StreamEvent}; @@ -115,7 +113,11 @@ fn top_below_gate_refuses_score_gate_without_llm_call() { let answer = pipeline.ask("q", default_opts()).unwrap(); assert_eq!(answer.refusal_reason, Some(RefusalReason::ScoreGate)); assert!(!answer.grounded); - assert_eq!(answer.citations.len(), 2, "all near-miss candidates surfaced"); + assert_eq!( + answer.citations.len(), + 2, + "all near-miss candidates surfaced" + ); for c in &answer.citations { assert!(c.marker.is_none(), "ScoreGate citations have no marker"); } @@ -132,7 +134,13 @@ fn grounded_happy_path_marker_one() { let env = RagEnv::new(); let cid = id32("c1"); let did = id32("d1"); - env.seed_chunk(&cid, &did, "notes/a.md", "Rust is a systems language.", &["Intro"]); + env.seed_chunk( + &cid, + &did, + "notes/a.md", + "Rust is a systems language.", + &["Intro"], + ); let hits = vec![mk_hit(1, &cid, &did, "notes/a.md", 0.85, &["Intro"])]; let retriever: Arc = Arc::new(MockRetriever::new(hits)); let canned = "Rust is a systems language. 
[#1]"; @@ -240,8 +248,21 @@ fn packing_stops_before_budget_overflow() { for i in 0..3_u32 { let cid = id32(&format!("c{i}")); let did = id32(&format!("d{i}")); - env.seed_chunk(&cid, &did, &format!("notes/a{i}.md"), &huge_text, &["Intro"]); - hits.push(mk_hit(i + 1, &cid, &did, &format!("notes/a{i}.md"), 0.9, &["Intro"])); + env.seed_chunk( + &cid, + &did, + &format!("notes/a{i}.md"), + &huge_text, + &["Intro"], + ); + hits.push(mk_hit( + i + 1, + &cid, + &did, + &format!("notes/a{i}.md"), + 0.9, + &["Intro"], + )); } let retriever: Arc = Arc::new(MockRetriever::new(hits)); let lm: Arc = Arc::new(CountingLm::new("ok [#1]")); @@ -456,7 +477,13 @@ fn grounded_citations_inherit_indexed_at_and_stale_from_hit() { let now = time::OffsetDateTime::now_utc(); let sixty_days_ago = now - time::Duration::days(60); let hits = vec![mk_hit_with_indexed_at( - 1, &cid, &did, "notes/a.md", 0.85, &["Intro"], sixty_days_ago, + 1, + &cid, + &did, + "notes/a.md", + 0.85, + &["Intro"], + sixty_days_ago, )]; let retriever: Arc = Arc::new(MockRetriever::new(hits)); let lm: Arc = Arc::new(CountingLm::new("apples are fruit. [#1]")); @@ -489,7 +516,13 @@ fn grounded_citations_not_stale_for_fresh_hit() { let now = time::OffsetDateTime::now_utc(); let one_day_ago = now - time::Duration::days(1); let hits = vec![mk_hit_with_indexed_at( - 1, &cid, &did, "notes/a.md", 0.85, &["Intro"], one_day_ago, + 1, + &cid, + &did, + "notes/a.md", + 0.85, + &["Intro"], + one_day_ago, )]; let retriever: Arc = Arc::new(MockRetriever::new(hits)); let lm: Arc = Arc::new(CountingLm::new("apples are fruit. 
[#1]")); @@ -513,7 +546,13 @@ fn answer_json_serializes_with_expected_keys() { let env = RagEnv::new(); let cid = id32("c1"); let did = id32("d1"); - env.seed_chunk(&cid, &did, "notes/a.md", "Rust is a systems language.", &["Intro"]); + env.seed_chunk( + &cid, + &did, + "notes/a.md", + "Rust is a systems language.", + &["Intro"], + ); let hits = vec![mk_hit(1, &cid, &did, "notes/a.md", 0.85, &["Intro"])]; let retriever: Arc = Arc::new(MockRetriever::new(hits)); let lm: Arc = Arc::new(CountingLm::new("Rust is. [#1]")); @@ -521,7 +560,12 @@ fn answer_json_serializes_with_expected_keys() { let answer = pipeline.ask("what", default_opts()).unwrap(); let v: serde_json::Value = serde_json::to_value(&answer).unwrap(); // Stable top-level key set per `answer.v1` (§2.3). - let keys: Vec<&str> = v.as_object().unwrap().keys().map(std::string::String::as_str).collect(); + let keys: Vec<&str> = v + .as_object() + .unwrap() + .keys() + .map(std::string::String::as_str) + .collect(); for needed in [ "answer", "citations", @@ -614,7 +658,13 @@ fn ask_with_multi_hop_false_keeps_single_pass_path() { let env = RagEnv::new(); let cid = id32("c1"); let did = id32("d1"); - env.seed_chunk(&cid, &did, "notes/a.md", "Rust is a systems language.", &["Intro"]); + env.seed_chunk( + &cid, + &did, + "notes/a.md", + "Rust is a systems language.", + &["Intro"], + ); let hits = vec![mk_hit(1, &cid, &did, "notes/a.md", 0.85, &["Intro"])]; let retriever: Arc = Arc::new(MockRetriever::new(hits)); let lm: Arc = Arc::new(CountingLm::new("Rust is. 
[#1]")); diff --git a/crates/kebab-search/src/citation_helper.rs b/crates/kebab-search/src/citation_helper.rs index 8771468..e4d6506 100644 --- a/crates/kebab-search/src/citation_helper.rs +++ b/crates/kebab-search/src/citation_helper.rs @@ -49,7 +49,12 @@ pub(crate) fn citation_from_first_span( end_ms: *end_ms, speaker: None, }, - Some(SourceSpan::Code { line_start, line_end, symbol, lang }) => Citation::Code { + Some(SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + }) => Citation::Code { path, line_start: *line_start, line_end: *line_end, diff --git a/crates/kebab-search/src/hybrid.rs b/crates/kebab-search/src/hybrid.rs index 306afb0..d2307a8 100644 --- a/crates/kebab-search/src/hybrid.rs +++ b/crates/kebab-search/src/hybrid.rs @@ -25,7 +25,7 @@ use kebab_core::{ IndexVersion, RetrievalDetail, Retriever, SearchHit, SearchMode, SearchQuery, SearchTrace, }; -use crate::trace::{build_fusion_input_skeleton, candidates_from_hits, ScoreKind, TraceBuilder}; +use crate::trace::{ScoreKind, TraceBuilder, build_fusion_input_skeleton, candidates_from_hits}; /// Default `k_rrf` if `kb-config::SearchCfg::rrf_k` is misconfigured. /// Matches §6.4's documented default (60). @@ -147,7 +147,11 @@ impl Retriever for HybridRetriever { impl HybridRetriever { fn fuse(&self, query: &SearchQuery) -> Result> { - let target_k = if query.k == 0 { self.default_k } else { query.k }; + let target_k = if query.k == 0 { + self.default_k + } else { + query.k + }; let fanout_k = target_k.saturating_mul(HYBRID_FANOUT_MULTIPLIER); let lex_query = SearchQuery { k: fanout_k, @@ -291,9 +295,11 @@ impl HybridRetriever { // one retriever, RRF sums a single term so `fusion_score` // already equals that side's normalized score, making the // fallback harmless. 
- let lex_score = lex_index - .get(&s.chunk_id) - .map(|(_, h)| h.retrieval.lexical_score.unwrap_or(h.retrieval.fusion_score)); + let lex_score = lex_index.get(&s.chunk_id).map(|(_, h)| { + h.retrieval + .lexical_score + .unwrap_or(h.retrieval.fusion_score) + }); let vec_score = vec_index .get(&s.chunk_id) .map(|(_, h)| h.retrieval.vector_score.unwrap_or(h.retrieval.fusion_score)); @@ -331,7 +337,11 @@ impl HybridRetriever { query: &SearchQuery, ) -> anyhow::Result<(Vec, SearchTrace)> { let start_total = Instant::now(); - let target_k = if query.k == 0 { self.default_k } else { query.k }; + let target_k = if query.k == 0 { + self.default_k + } else { + query.k + }; let fanout_k = target_k.saturating_mul(HYBRID_FANOUT_MULTIPLIER); let fanout_query = SearchQuery { k: fanout_k, @@ -425,8 +435,8 @@ fn parse_fusion(name: &str, k_rrf: u32) -> FusionPolicy { mod tests { use super::*; use kebab_core::{ - ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, SearchFilters, - SearchHit, SearchMode, WorkspacePath, + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, SearchFilters, SearchHit, + SearchMode, WorkspacePath, }; use std::sync::Mutex; @@ -466,12 +476,7 @@ mod tests { /// because the hybrid logic only reads `chunk_id`, `rank`, /// `retrieval.{lexical,vector}_score`, and (transitively) the rest /// when building the fused output. 
- fn mk_hit( - chunk_id: &str, - rank: u32, - method: SearchMode, - score: f32, - ) -> SearchHit { + fn mk_hit(chunk_id: &str, rank: u32, method: SearchMode, score: f32) -> SearchHit { let cid = ChunkId(chunk_id.to_string()); let did = DocumentId(format!("d-{chunk_id}")); let path = wp(&format!("notes/{chunk_id}.md")); @@ -654,10 +659,7 @@ mod tests { tied_a.retrieval.lexical_rank = Some(2); let mut tied_b = mk_hit("bbbb", 2, SearchMode::Lexical, 0.4); tied_b.retrieval.lexical_rank = Some(2); - let lex3 = Arc::new(CannedRetriever::new( - vec![tied_a, tied_b], - "lex-v1", - )); + let lex3 = Arc::new(CannedRetriever::new(vec![tied_a, tied_b], "lex-v1")); let vec3 = Arc::new(CannedRetriever::new(vec![], "vec-v1")); let h3 = HybridRetriever::with_policy(lex3, vec3, rrf_policy(60), 5); let out3 = h3.search(&make_query(SearchMode::Hybrid, 5)).unwrap(); @@ -728,9 +730,10 @@ mod tests { #[test] fn search_with_trace_returns_lex_and_vec_lists() { - use kebab_core::{ChunkId, DocumentId, IndexVersion, ChunkerVersion, - RetrievalDetail, SearchHit, SearchMode, SearchQuery, - WorkspacePath, Citation}; + use kebab_core::{ + ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, + SearchHit, SearchMode, SearchQuery, WorkspacePath, + }; use std::sync::Arc; fn mk_hit(rank: u32, chunk: &str, score: f32, mode: SearchMode) -> SearchHit { @@ -751,10 +754,26 @@ mod tests { retrieval: RetrievalDetail { method: mode, fusion_score: score, - lexical_score: if mode == SearchMode::Lexical { Some(score) } else { None }, - vector_score: if mode == SearchMode::Vector { Some(score) } else { None }, - lexical_rank: if mode == SearchMode::Lexical { Some(rank) } else { None }, - vector_rank: if mode == SearchMode::Vector { Some(rank) } else { None }, + lexical_score: if mode == SearchMode::Lexical { + Some(score) + } else { + None + }, + vector_score: if mode == SearchMode::Vector { + Some(score) + } else { + None + }, + lexical_rank: if mode == SearchMode::Lexical { + 
Some(rank) + } else { + None + }, + vector_rank: if mode == SearchMode::Vector { + Some(rank) + } else { + None + }, }, index_version: IndexVersion("v1".into()), embedding_model: None, @@ -767,12 +786,16 @@ mod tests { } } - struct Stub { hits: Vec } + struct Stub { + hits: Vec, + } impl Retriever for Stub { fn search(&self, _q: &SearchQuery) -> anyhow::Result> { Ok(self.hits.clone()) } - fn index_version(&self) -> IndexVersion { IndexVersion("v1".into()) } + fn index_version(&self) -> IndexVersion { + IndexVersion("v1".into()) + } } let lex = Arc::new(Stub { @@ -816,7 +839,9 @@ mod tests { fn search(&self, _q: &SearchQuery) -> anyhow::Result> { Ok(vec![]) } - fn index_version(&self) -> IndexVersion { IndexVersion("v1".into()) } + fn index_version(&self) -> IndexVersion { + IndexVersion("v1".into()) + } } let lex = Arc::new(EmptyR); let vec_r = Arc::new(EmptyR); @@ -855,12 +880,7 @@ mod tests { let vec_r = Arc::new(Stub { hits: vec![mk_hit("c1", 1, SearchMode::Vector, 0.8)], }); - let hybrid = HybridRetriever::with_policy( - lex, - vec_r, - FusionPolicy::Rrf { k_rrf: 60 }, - 2, - ); + let hybrid = HybridRetriever::with_policy(lex, vec_r, FusionPolicy::Rrf { k_rrf: 60 }, 2); let q = SearchQuery { text: "x".into(), mode: SearchMode::Hybrid, @@ -892,14 +912,11 @@ mod tests { // mk_hit defaults to Rrf; override per spec for this test. 
let mut lex_hit = mk_hit("c1", 1, SearchMode::Lexical, 0.5); lex_hit.score_kind = ScoreKind::Bm25; - let lex = Arc::new(Stub { hits: vec![lex_hit] }); + let lex = Arc::new(Stub { + hits: vec![lex_hit], + }); let vec_r = Arc::new(Stub { hits: vec![] }); - let hybrid = HybridRetriever::with_policy( - lex, - vec_r, - FusionPolicy::Rrf { k_rrf: 60 }, - 2, - ); + let hybrid = HybridRetriever::with_policy(lex, vec_r, FusionPolicy::Rrf { k_rrf: 60 }, 2); let q = SearchQuery { text: "x".into(), mode: SearchMode::Lexical, diff --git a/crates/kebab-search/src/lexical.rs b/crates/kebab-search/src/lexical.rs index 678c2d1..8b30b3c 100644 --- a/crates/kebab-search/src/lexical.rs +++ b/crates/kebab-search/src/lexical.rs @@ -10,12 +10,11 @@ use std::sync::Arc; use anyhow::{Context, Result}; use globset::GlobMatcher; use kebab_core::{ - ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, Retriever, - ScoreKind, SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, TrustLevel, - WorkspacePath, + ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, Retriever, ScoreKind, + SearchFilters, SearchHit, SearchMode, SearchQuery, SourceSpan, TrustLevel, WorkspacePath, }; use kebab_store_sqlite::SqliteStore; -use rusqlite::{params_from_iter, Connection, Row, ToSql}; +use rusqlite::{Connection, Row, ToSql, params_from_iter}; use crate::citation_helper::citation_from_first_span; @@ -124,13 +123,7 @@ impl Retriever for LexicalRetriever { }; let conn = self.store.read_conn(); - let raw_rows = run_query( - &conn, - &match_str, - self.snippet_words, - filters, - fetch_limit, - )?; + let raw_rows = run_query(&conn, &match_str, self.snippet_words, filters, fetch_limit)?; let mut hits: Vec = Vec::with_capacity(raw_rows.len().min(k)); let mut rank: u32 = 0; @@ -224,8 +217,8 @@ fn build_match_string(text: &str) -> Option { const MIN_TRIGRAM_CHARS: usize = 3; - let whole_candidate: Option = (trimmed.chars().count() >= MIN_TRIGRAM_CHARS) - .then(|| 
escape_fts5_token(trimmed)); + let whole_candidate: Option = + (trimmed.chars().count() >= MIN_TRIGRAM_CHARS).then(|| escape_fts5_token(trimmed)); let token_and_candidate: Option = { let toks: Vec = trimmed @@ -332,9 +325,7 @@ fn run_query( // tags_any: doc must own at least one of the requested tags. if !filters.tags_any.is_empty() { - sql.push_str( - " AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN (", - ); + sql.push_str(" AND f.doc_id IN (SELECT doc_id FROM document_tags WHERE tag IN ("); for (i, tag) in filters.tags_any.iter().enumerate() { if i > 0 { sql.push(','); @@ -378,8 +369,7 @@ fn run_query( // ELSE (first object key) // END IN (?, ...) if !filters.media.is_empty() { - let placeholders: Vec<&str> = - std::iter::repeat_n("?", filters.media.len()).collect(); + let placeholders: Vec<&str> = std::iter::repeat_n("?", filters.media.len()).collect(); let placeholders = placeholders.join(","); sql.push_str(&format!( " AND f.doc_id IN (\ @@ -457,7 +447,10 @@ fn run_query( .prepare(&sql) .context("kb-search lexical: prepare FTS5 statement")?; let rows = stmt - .query_map(params_from_iter(params.iter().map(std::convert::AsRef::as_ref)), row_from_sql) + .query_map( + params_from_iter(params.iter().map(std::convert::AsRef::as_ref)), + row_from_sql, + ) .context("kb-search lexical: execute FTS5 query")?; let mut out: Vec = Vec::new(); for r in rows { @@ -682,7 +675,10 @@ mod tests { /// wrapped in `text : (...)`. #[test] fn build_match_string_single_long_token_no_duplicate_or() { - assert_eq!(build_match_string("러스트").unwrap(), r#"text : ("러스트")"#); + assert_eq!( + build_match_string("러스트").unwrap(), + r#"text : ("러스트")"# + ); assert_eq!(build_match_string("rust").unwrap(), r#"text : ("rust")"#); } @@ -772,9 +768,7 @@ mod tests { let c = citation_from_first_span("c1", p, Some("Intro".to_string()), Some(&span)); match c { Citation::Page { - page, - ref section, - .. + page, ref section, .. 
} => { assert_eq!(page, 4); assert_eq!(section.as_deref(), Some("Intro")); diff --git a/crates/kebab-search/src/trace.rs b/crates/kebab-search/src/trace.rs index 5ddbf9c..fbe3f71 100644 --- a/crates/kebab-search/src/trace.rs +++ b/crates/kebab-search/src/trace.rs @@ -2,9 +2,7 @@ use std::collections::BTreeMap; -use kebab_core::{ - SearchHit, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming, -}; +use kebab_core::{SearchHit, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming}; /// Build a `TraceCandidate` from a `SearchHit`. The score field reflects /// each side's score (lexical / vector / fusion) — caller selects which @@ -34,10 +32,7 @@ pub enum ScoreKind { /// each side's rank captured. `fusion_score` is filled by the caller /// (RRF computes it during fusion, this helper just pre-builds the /// rank table — caller overwrites fusion_score in a second pass). -pub fn build_fusion_input_skeleton( - lex: &[SearchHit], - vec: &[SearchHit], -) -> Vec { +pub fn build_fusion_input_skeleton(lex: &[SearchHit], vec: &[SearchHit]) -> Vec { let mut by_chunk: BTreeMap = BTreeMap::new(); for h in lex { by_chunk diff --git a/crates/kebab-search/src/vector.rs b/crates/kebab-search/src/vector.rs index 3975c2e..f61ad8a 100644 --- a/crates/kebab-search/src/vector.rs +++ b/crates/kebab-search/src/vector.rs @@ -20,9 +20,9 @@ use std::sync::Arc; use anyhow::{Context, Result}; use kebab_core::{ - ChunkId, ChunkerVersion, DocumentId, Embedder, EmbeddingInput, EmbeddingKind, - IndexVersion, RetrievalDetail, Retriever, ScoreKind, SearchHit, SearchMode, SearchQuery, - SourceSpan, VectorHit, VectorStore, WorkspacePath, + ChunkId, ChunkerVersion, DocumentId, Embedder, EmbeddingInput, EmbeddingKind, IndexVersion, + RetrievalDetail, Retriever, ScoreKind, SearchHit, SearchMode, SearchQuery, SourceSpan, + VectorHit, VectorStore, WorkspacePath, }; use kebab_store_sqlite::SqliteStore; use rusqlite::params_from_iter; @@ -68,7 +68,13 @@ impl VectorRetriever { index_version: 
IndexVersion, ) -> Self { let cfg = kebab_config::Config::defaults(); - Self::with_settings(store, embed, sqlite, index_version, cfg.search.snippet_chars) + Self::with_settings( + store, + embed, + sqlite, + index_version, + cfg.search.snippet_chars, + ) } /// Construct with explicit `snippet_chars`. Mirrors the lexical @@ -145,8 +151,7 @@ impl Retriever for VectorRetriever { // 3. Hydrate metadata from SQLite for the candidate ids in // one round-trip. Order is preserved by the caller via the // HashMap lookup at hit-construction time. - let candidate_ids: Vec<&str> = - raw_hits.iter().map(|h| h.chunk_id.0.as_str()).collect(); + let candidate_ids: Vec<&str> = raw_hits.iter().map(|h| h.chunk_id.0.as_str()).collect(); let hydration = hydrate_chunks(&self.sqlite, &candidate_ids) .context("kb-search vector: hydrate chunk metadata")?; @@ -201,10 +206,7 @@ struct ChunkMeta { updated_at: String, } -fn hydrate_chunks( - sqlite: &SqliteStore, - chunk_ids: &[&str], -) -> Result> { +fn hydrate_chunks(sqlite: &SqliteStore, chunk_ids: &[&str]) -> Result> { if chunk_ids.is_empty() { return Ok(HashMap::new()); } @@ -259,8 +261,7 @@ fn hydrate_chunks( .context("kb-search vector: execute hydration query")?; let mut out: HashMap = HashMap::with_capacity(unique.len()); for row in rows { - let (chunk_id, meta) = - row.context("kb-search vector: read hydration row")?; + let (chunk_id, meta) = row.context("kb-search vector: read hydration row")?; out.insert(chunk_id, meta); } Ok(out) @@ -279,9 +280,8 @@ fn build_hit( let source_spans: Vec = serde_json::from_str(&meta.source_spans_json) .context("kb-search vector: deserialize source_spans_json")?; - let workspace_path = WorkspacePath::new(meta.workspace_path.clone()).context( - "kb-search vector: documents.workspace_path violates WorkspacePath invariant", - )?; + let workspace_path = WorkspacePath::new(meta.workspace_path.clone()) + .context("kb-search vector: documents.workspace_path violates WorkspacePath invariant")?; let citation = 
citation_from_first_span( &hit.chunk_id.0, workspace_path.clone(), diff --git a/crates/kebab-search/tests/common/mod.rs b/crates/kebab-search/tests/common/mod.rs index 354f141..0b2909b 100644 --- a/crates/kebab-search/tests/common/mod.rs +++ b/crates/kebab-search/tests/common/mod.rs @@ -18,10 +18,9 @@ use std::sync::Arc; use kebab_config::Config; use kebab_core::{ - ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind, - EmbeddingModelId, EmbeddingVersion, IndexVersion, MediaType, - Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, - VectorRecord, VectorStore, + ChunkId, DocumentId, EmbeddingId, EmbeddingInput, EmbeddingKind, EmbeddingModelId, + EmbeddingVersion, IndexVersion, MediaType, Retriever, SearchFilters, SearchHit, SearchMode, + SearchQuery, VectorRecord, VectorStore, }; use kebab_embed::{Embedder, MockEmbedder}; use kebab_search::{LexicalRetriever, VectorRetriever}; @@ -37,7 +36,8 @@ use tempfile::TempDir; pub fn require_avx_or_panic() { #[cfg(target_arch = "x86_64")] { - assert!(std::is_x86_feature_detected!("avx"), + assert!( + std::is_x86_feature_detected!("avx"), "kb-search hybrid integration test requires AVX-capable hardware; \ host CPU lacks AVX. Run on an AVX-capable machine." 
); @@ -71,8 +71,7 @@ impl HybridEnv { let sqlite = SqliteStore::open(&config).unwrap(); sqlite.run_migrations().unwrap(); let sqlite = Arc::new(sqlite); - let vector_store = - Arc::new(LanceVectorStore::new(&config, sqlite.clone()).unwrap()); + let vector_store = Arc::new(LanceVectorStore::new(&config, sqlite.clone()).unwrap()); let embedder = Arc::new(MockEmbedder::new( EmbeddingModelId(TEST_MODEL_ID.to_string()), EmbeddingVersion("v1".to_string()), @@ -100,8 +99,7 @@ impl HybridEnv { pub fn vector_retriever(&self) -> VectorRetriever { let store: Arc = Arc::clone(&self.vector_store) as Arc; - let embed: Arc = - Arc::clone(&self.embedder) as Arc; + let embed: Arc = Arc::clone(&self.embedder) as Arc; VectorRetriever::new( store, embed, @@ -183,12 +181,7 @@ impl HybridEnv { /// High-level helper: seed a doc with an explicit `MediaType`. /// The `media_type` is serialized to JSON (mirrors how /// `DocumentStore::put_document` writes it) and stored in `assets`. - pub fn insert_doc_with_media( - &self, - path: &str, - text: &str, - media: MediaType, - ) -> DocumentId { + pub fn insert_doc_with_media(&self, path: &str, text: &str, media: MediaType) -> DocumentId { // Derive deterministic IDs from the path so repeated calls with // the same path are idempotent (INSERT OR IGNORE). 
let path_hash: String = { @@ -211,13 +204,7 @@ impl HybridEnv { ) VALUES (?, ?, ?, ?, 0, 'deadbeefdeadbeefdeadbeefdeadbeef', 'reference', ?, '1970-01-01T00:00:00Z')", - params![ - asset_id, - format!("file:///{path}"), - path, - media_json, - path, - ], + params![asset_id, format!("file:///{path}"), path, media_json, path,], ) .unwrap(); conn.execute( @@ -283,7 +270,10 @@ impl HybridEnv { vector, doc_id: DocumentId(doc_id.to_string()), text: text.to_string(), - heading_path: heading_path.iter().map(std::string::ToString::to_string).collect(), + heading_path: heading_path + .iter() + .map(std::string::ToString::to_string) + .collect(), model_id: EmbeddingModelId(TEST_MODEL_ID.to_string()), model_version: EmbeddingVersion("v1".to_string()), dimensions: TEST_DIMENSIONS, diff --git a/crates/kebab-search/tests/hybrid.rs b/crates/kebab-search/tests/hybrid.rs index c60ce97..84aeec7 100644 --- a/crates/kebab-search/tests/hybrid.rs +++ b/crates/kebab-search/tests/hybrid.rs @@ -12,11 +12,9 @@ use std::path::PathBuf; use std::sync::Arc; use common::{ - HybridEnv, id32, require_avx_or_panic, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION, -}; -use kebab_core::{ - MediaType, Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery, + HybridEnv, TEST_LEX_INDEX_VERSION, TEST_VEC_INDEX_VERSION, id32, require_avx_or_panic, }; +use kebab_core::{MediaType, Retriever, SearchFilters, SearchHit, SearchMode, SearchQuery}; use kebab_search::{FusionPolicy, HybridRetriever}; use rusqlite::params; use serde_json::json; @@ -39,10 +37,34 @@ fn seed_disjoint_corpus(env: &HybridEnv) -> Vec { // a token with the query. 
let chunks = [ // (chunk_id, doc_id, path, text, headings) - (id32("c1"), id32("d1"), "notes/rust1.md", "rust cargo macros", &["A"][..]), - (id32("c2"), id32("d2"), "notes/rust2.md", "rust traits and lifetimes", &["B"][..]), - (id32("c3"), id32("d3"), "notes/python.md", "python dataclasses tutorial", &["C"][..]), - (id32("c4"), id32("d4"), "notes/go.md", "go interfaces and channels", &["D"][..]), + ( + id32("c1"), + id32("d1"), + "notes/rust1.md", + "rust cargo macros", + &["A"][..], + ), + ( + id32("c2"), + id32("d2"), + "notes/rust2.md", + "rust traits and lifetimes", + &["B"][..], + ), + ( + id32("c3"), + id32("d3"), + "notes/python.md", + "python dataclasses tutorial", + &["C"][..], + ), + ( + id32("c4"), + id32("d4"), + "notes/go.md", + "go interfaces and channels", + &["D"][..], + ), ]; let mut ids = Vec::new(); for (cid, did, path, text, headings) in &chunks { @@ -113,7 +135,10 @@ fn hybrid_determinism_same_query_twice() { }; let a = h.search(&q).unwrap(); let b = h.search(&q).unwrap(); - assert_eq!(a, b, "identical query must yield byte-identical Vec"); + assert_eq!( + a, b, + "identical query must yield byte-identical Vec" + ); } #[test] @@ -139,16 +164,18 @@ fn hybrid_snapshot_run_1() { // - that fusion_score is non-increasing // - method = Hybrid for every hit let actual = json!( - hits.iter().map(|h: &SearchHit| json!({ - "chunk_id": h.chunk_id.0, - "rank": h.rank, - "method": h.retrieval.method, - "lexical_rank": h.retrieval.lexical_rank, - "vector_rank": h.retrieval.vector_rank, - "lex_some": h.retrieval.lexical_score.is_some(), - "vec_some": h.retrieval.vector_score.is_some(), - "fusion_score_positive": h.retrieval.fusion_score > 0.0, - })).collect::>() + hits.iter() + .map(|h: &SearchHit| json!({ + "chunk_id": h.chunk_id.0, + "rank": h.rank, + "method": h.retrieval.method, + "lexical_rank": h.retrieval.lexical_rank, + "vector_rank": h.retrieval.vector_rank, + "lex_some": h.retrieval.lexical_score.is_some(), + "vec_some": 
h.retrieval.vector_score.is_some(), + "fusion_score_positive": h.retrieval.fusion_score > 0.0, + })) + .collect::>() ); let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -186,7 +213,8 @@ fn hybrid_snapshot_run_1() { // Refuse to silently "pass" against the committed placeholder. The // placeholder JSON carries a `_comment` field with regeneration // instructions; production fixtures (a captured list) do not. - assert!(!expected.get("_comment").is_some(), + assert!( + !expected.get("_comment").is_some(), "snapshot fixture is a placeholder — regenerate on AVX hardware then commit. \ Path: {}. To regenerate: \ `KEBAB_UPDATE_SNAPSHOTS=1 cargo test -p kb-search -- --ignored hybrid_snapshot`.", @@ -282,11 +310,8 @@ fn vector_hit_carries_indexed_at() { let now_rfc = now.format(&Rfc3339).expect("format now as rfc3339"); { let conn = env.sqlite.read_conn(); - conn.execute( - "UPDATE documents SET updated_at = ?", - params![now_rfc], - ) - .expect("bump documents.updated_at"); + conn.execute("UPDATE documents SET updated_at = ?", params![now_rfc]) + .expect("bump documents.updated_at"); } let r = env.vector_retriever(); diff --git a/crates/kebab-search/tests/lexical.rs b/crates/kebab-search/tests/lexical.rs index 8206265..beb8151 100644 --- a/crates/kebab-search/tests/lexical.rs +++ b/crates/kebab-search/tests/lexical.rs @@ -51,10 +51,7 @@ impl Env { } fn retriever(&self) -> LexicalRetriever { - LexicalRetriever::new( - Arc::clone(&self.store), - IndexVersion("v1.0".to_string()), - ) + LexicalRetriever::new(Arc::clone(&self.store), IndexVersion("v1.0".to_string())) } fn retriever_with_snippet_chars(&self, snippet_chars: usize) -> LexicalRetriever { @@ -208,7 +205,15 @@ fn lexical_empty_query_returns_empty_vec_without_db_hit() { fn lexical_single_doc_match_returns_one_hit_with_citation_round_trip() { let env = Env::new(); let conn = env.raw_conn(); - insert_document(&conn, &id32("d"), "notes/rust.md", "Rust Notes", "en", "primary", &[]); + insert_document( + &conn, + 
&id32("d"), + "notes/rust.md", + "Rust Notes", + "en", + "primary", + &[], + ); insert_chunk( &conn, &id32("c1"), @@ -305,8 +310,24 @@ fn lexical_snippet_length_capped_at_snippet_chars() { fn lexical_filter_tags_any_excludes_untagged_docs() { let env = Env::new(); let conn = env.raw_conn(); - insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &["rust"]); - insert_document(&conn, &id32("d2"), "notes/b.md", "B", "en", "primary", &["python"]); + insert_document( + &conn, + &id32("d1"), + "notes/a.md", + "A", + "en", + "primary", + &["rust"], + ); + insert_document( + &conn, + &id32("d2"), + "notes/b.md", + "B", + "en", + "primary", + &["python"], + ); insert_chunk( &conn, &id32("c1"), @@ -392,7 +413,15 @@ fn lexical_filter_path_glob_does_not_cross_slash() { let env = Env::new(); let conn = env.raw_conn(); insert_document(&conn, &id32("d1"), "notes/a.md", "A", "en", "primary", &[]); - insert_document(&conn, &id32("d2"), "notes/sub/b.md", "B", "en", "primary", &[]); + insert_document( + &conn, + &id32("d2"), + "notes/sub/b.md", + "B", + "en", + "primary", + &[], + ); insert_chunk( &conn, &id32("c1"), @@ -551,7 +580,10 @@ fn lexical_determinism_same_query_twice() { }; let a = r.search(&q).unwrap(); let b = r.search(&q).unwrap(); - assert_eq!(a, b, "same DB + same query must yield identical Vec"); + assert_eq!( + a, b, + "same DB + same query must yield identical Vec" + ); } #[test] @@ -561,7 +593,15 @@ fn lexical_determinism_chunk_id_tiebreaker_on_equal_bm25() { // `chunk_id` ordering so the result is stable across runs. 
let env = Env::new(); let conn = env.raw_conn(); - insert_document(&conn, &id32("d"), "notes/tie.md", "Tie", "en", "primary", &[]); + insert_document( + &conn, + &id32("d"), + "notes/tie.md", + "Tie", + "en", + "primary", + &[], + ); let cid_a = id32("aaaa"); let cid_b = id32("bbbb"); assert!(cid_a < cid_b, "test premise: aaaa-id sorts before bbbb-id"); @@ -690,7 +730,15 @@ fn lexical_retriever_hits_carry_bm25_score_kind() { // relationship: Lexical-only search → Bm25 score semantics. let env = Env::new(); let conn = env.raw_conn(); - insert_document(&conn, &id32("d"), "notes/bm25.md", "Bm25", "en", "primary", &[]); + insert_document( + &conn, + &id32("d"), + "notes/bm25.md", + "Bm25", + "en", + "primary", + &[], + ); for (cid, body) in [ ("c1", "alpha bravo charlie"), ("c2", "alpha delta"), @@ -724,7 +772,8 @@ fn lexical_retriever_hits_carry_bm25_score_kind() { ); for h in &hits { assert_eq!( - h.score_kind, ScoreKind::Bm25, + h.score_kind, + ScoreKind::Bm25, "lexical retriever must label all hits with ScoreKind::Bm25" ); } @@ -848,7 +897,13 @@ impl TestEnv { } /// Insert a code doc with explicit `code_lang` and optional `repo` in metadata. 
- fn insert_code_doc(&self, path: &str, body: &str, code_lang: &str, repo: Option<&str>) -> DocumentId { + fn insert_code_doc( + &self, + path: &str, + body: &str, + code_lang: &str, + repo: Option<&str>, + ) -> DocumentId { let metadata_json = match repo { Some(r) => format!(r#"{{"code_lang":"{code_lang}","repo":"{r}"}}"#), None => format!(r#"{{"code_lang":"{code_lang}"}}"#), @@ -887,7 +942,11 @@ fn lexical_filter_by_media() { }; let hits = env.run_search("rust", &filters); assert_eq!(hits.len(), 1, "only pdf doc should match"); - assert!(hits[0].doc_path.0.ends_with(".pdf"), "got: {}", hits[0].doc_path.0); + assert!( + hits[0].doc_path.0.ends_with(".pdf"), + "got: {}", + hits[0].doc_path.0 + ); } #[test] @@ -921,7 +980,10 @@ fn lexical_filter_by_doc_id() { ..Default::default() }; let hits = env.run_search("shared", &filters); - assert!(!hits.is_empty(), "should get at least one hit for target doc"); + assert!( + !hits.is_empty(), + "should get at least one hit for target doc" + ); for h in &hits { assert_eq!(h.doc_id, target, "all hits must be from target doc"); } @@ -978,7 +1040,11 @@ fn lexical_filter_by_code_lang() { ..Default::default() }; let hits = env.run_search("AsyncClient", &filters); - assert_eq!(hits.len(), 1, "only python doc should match code_lang filter"); + assert_eq!( + hits.len(), + 1, + "only python doc should match code_lang filter" + ); assert!( hits[0].doc_path.0.ends_with(".py"), "expected python path, got: {}", @@ -991,8 +1057,18 @@ fn lexical_filter_by_repo() { // Three docs: one in repo "httpx", one in repo "requests", one with no repo. // Filter repo=["httpx"] → only the httpx doc should match. 
let env = TestEnv::new(); - env.insert_code_doc("httpx/client.py", "session send request", "python", Some("httpx")); - env.insert_code_doc("requests/api.py", "session send request", "python", Some("requests")); + env.insert_code_doc( + "httpx/client.py", + "session send request", + "python", + Some("httpx"), + ); + env.insert_code_doc( + "requests/api.py", + "session send request", + "python", + Some("requests"), + ); env.insert_code_doc("standalone.py", "session send request", "python", None); let filters = SearchFilters { @@ -1017,7 +1093,15 @@ fn lexical_snapshot_run_1() { // Stable because rusqlite ships bundled SQLite — a tokenizer/bm25 algorithm change in a future SQLite bump will require regenerating run-1.json via `KEBAB_UPDATE_SNAPSHOTS=1`. let env = Env::new(); let conn = env.raw_conn(); - insert_document(&conn, &id32("d"), "notes/snap.md", "Snap", "en", "primary", &[]); + insert_document( + &conn, + &id32("d"), + "notes/snap.md", + "Snap", + "en", + "primary", + &[], + ); for (cid, body, span) in [ ( "c1", @@ -1035,7 +1119,16 @@ fn lexical_snapshot_run_1() { r#"[{"kind":"line","start":7,"end":8}]"#, ), ] { - insert_chunk(&conn, &id32(cid), &id32("d"), body, &["Snap"], Some("Snap"), span, "v1"); + insert_chunk( + &conn, + &id32(cid), + &id32("d"), + body, + &["Snap"], + Some("Snap"), + span, + "v1", + ); } drop(conn); @@ -1050,10 +1143,14 @@ fn lexical_snapshot_run_1() { .unwrap(); let actual = serde_json::to_value(&hits).unwrap(); - let baseline_path = - std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/search/lexical/run-1.json"); + let baseline_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/search/lexical/run-1.json"); if std::env::var_os("KEBAB_UPDATE_SNAPSHOTS").is_some() { - std::fs::write(&baseline_path, serde_json::to_string_pretty(&actual).unwrap()).unwrap(); + std::fs::write( + &baseline_path, + serde_json::to_string_pretty(&actual).unwrap(), + ) + .unwrap(); } let baseline_text = 
std::fs::read_to_string(&baseline_path) .expect("baseline snapshot must exist; run with KEBAB_UPDATE_SNAPSHOTS=1 to seed"); diff --git a/crates/kebab-source-fs/src/code_meta.rs b/crates/kebab-source-fs/src/code_meta.rs index d09ec7a..2bc302b 100644 --- a/crates/kebab-source-fs/src/code_meta.rs +++ b/crates/kebab-source-fs/src/code_meta.rs @@ -97,16 +97,19 @@ pub(crate) fn is_generated_file(path: &Path) -> Result { return Ok(false); } let head = std::str::from_utf8(&buf[..n]).unwrap_or(""); - let lower: String = head.lines().take(10).collect::>().join("\n").to_ascii_lowercase(); - Ok( - lower.contains("@generated") - || lower.contains("code generated by") - || lower.contains("do not edit") - || lower.contains("do not modify") - || lower.contains("automatically generated") - || lower.contains("auto-generated") - || lower.contains("autogenerated"), - ) + let lower: String = head + .lines() + .take(10) + .collect::>() + .join("\n") + .to_ascii_lowercase(); + Ok(lower.contains("@generated") + || lower.contains("code generated by") + || lower.contains("do not edit") + || lower.contains("do not modify") + || lower.contains("automatically generated") + || lower.contains("auto-generated") + || lower.contains("autogenerated")) } /// Returns true when `path`'s filename/extension is recognised as a code @@ -154,8 +157,8 @@ mod tests { ("foo.pyi", Some("python")), ("foo.ts", Some("typescript")), ("foo.tsx", Some("typescript")), - ("foo.mts", Some("typescript")), // ESM TS — same grammar - ("foo.cts", Some("typescript")), // CommonJS TS — same grammar + ("foo.mts", Some("typescript")), // ESM TS — same grammar + ("foo.cts", Some("typescript")), // CommonJS TS — same grammar ("foo.js", Some("javascript")), ("foo.mjs", Some("javascript")), ("foo.cjs", Some("javascript")), @@ -192,8 +195,14 @@ mod tests { #[test] fn special_filenames_map_to_identifiers() { - assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile")); - 
assert_eq!(code_lang_for_path(Path::new("foo.dockerfile")), Some("dockerfile")); + assert_eq!( + code_lang_for_path(Path::new("Dockerfile")), + Some("dockerfile") + ); + assert_eq!( + code_lang_for_path(Path::new("foo.dockerfile")), + Some("dockerfile") + ); assert_eq!(code_lang_for_path(Path::new("Makefile")), Some("make")); assert_eq!(code_lang_for_path(Path::new("GNUmakefile")), Some("make")); } @@ -213,26 +222,56 @@ mod tests { #[test] fn tier2_basename_takes_precedence_over_extension() { - assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile")); - assert_eq!(code_lang_for_path(Path::new("foo/Dockerfile.dev")), Some("dockerfile")); - assert_eq!(code_lang_for_path(Path::new("myapp.dockerfile")), Some("dockerfile")); - assert_eq!(code_lang_for_path(Path::new("repo/Cargo.toml")), Some("toml")); - assert_eq!(code_lang_for_path(Path::new("pyproject.toml")), Some("toml")); - assert_eq!(code_lang_for_path(Path::new("repo/package.json")), Some("json")); - assert_eq!(code_lang_for_path(Path::new("tsconfig.json")), Some("json")); - assert_eq!(code_lang_for_path(Path::new("go.mod")), Some("go-mod")); - assert_eq!(code_lang_for_path(Path::new("pom.xml")), Some("xml")); - assert_eq!(code_lang_for_path(Path::new("build.gradle")), Some("groovy")); + assert_eq!( + code_lang_for_path(Path::new("Dockerfile")), + Some("dockerfile") + ); + assert_eq!( + code_lang_for_path(Path::new("foo/Dockerfile.dev")), + Some("dockerfile") + ); + assert_eq!( + code_lang_for_path(Path::new("myapp.dockerfile")), + Some("dockerfile") + ); + assert_eq!( + code_lang_for_path(Path::new("repo/Cargo.toml")), + Some("toml") + ); + assert_eq!( + code_lang_for_path(Path::new("pyproject.toml")), + Some("toml") + ); + assert_eq!( + code_lang_for_path(Path::new("repo/package.json")), + Some("json") + ); + assert_eq!(code_lang_for_path(Path::new("tsconfig.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("go.mod")), Some("go-mod")); + 
assert_eq!(code_lang_for_path(Path::new("pom.xml")), Some("xml")); + assert_eq!( + code_lang_for_path(Path::new("build.gradle")), + Some("groovy") + ); } #[test] fn tier2_extension_fallback() { - assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yaml")), Some("yaml")); - assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yml")), Some("yaml")); - assert_eq!(code_lang_for_path(Path::new("foo/bar.toml")), Some("toml")); - assert_eq!(code_lang_for_path(Path::new("foo/bar.json")), Some("json")); - assert_eq!(code_lang_for_path(Path::new("foo/bar.xml")), Some("xml")); - assert_eq!(code_lang_for_path(Path::new("foo/bar.gradle")), Some("groovy")); + assert_eq!( + code_lang_for_path(Path::new("k8s/deploy.yaml")), + Some("yaml") + ); + assert_eq!( + code_lang_for_path(Path::new("k8s/deploy.yml")), + Some("yaml") + ); + assert_eq!(code_lang_for_path(Path::new("foo/bar.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.xml")), Some("xml")); + assert_eq!( + code_lang_for_path(Path::new("foo/bar.gradle")), + Some("groovy") + ); } // ── is_generated_file + is_oversized tests (ex-kebab-parse-code/tests/skip.rs) ── diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index c37abc4..7c6d2fb 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -23,7 +23,9 @@ use kebab_core::{ use crate::hash::hash_file; use crate::media::media_type_for; -use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips}; +use crate::walker::{ + SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips, +}; /// Local-filesystem `SourceConnector`. Constructed once from `Config`, /// reused across `scan` calls. @@ -56,10 +58,7 @@ impl FsSourceConnector { // + `root = "kb"` reads `/tmp/kb`, not the user's cwd. 
let root = config.resolve_workspace_root(); - let copy_threshold_bytes = config - .storage - .copy_threshold_mb - .saturating_mul(1024 * 1024); + let copy_threshold_bytes = config.storage.copy_threshold_mb.saturating_mul(1024 * 1024); Ok(Self { default_root: root, @@ -72,10 +71,7 @@ impl FsSourceConnector { } /// Resolve the effective root and build the merged + per-source overrides. - fn resolve_scan_params( - &self, - scope: &SourceScope, - ) -> Result<(PathBuf, WalkOverrides)> { + fn resolve_scan_params(&self, scope: &SourceScope) -> Result<(PathBuf, WalkOverrides)> { let root = if scope.root.as_os_str().is_empty() { self.default_root.clone() } else { @@ -97,10 +93,7 @@ impl FsSourceConnector { /// all the information needed to populate `IngestReport.skipped_gitignore`, /// `skipped_kebabignore`, `skipped_builtin_blacklist`, and `skip_examples` /// without a second walker pass. - pub fn scan_with_skips( - &self, - scope: &SourceScope, - ) -> Result<(Vec, FsScanSkips)> { + pub fn scan_with_skips(&self, scope: &SourceScope) -> Result<(Vec, FsScanSkips)> { let (root, overrides) = self.resolve_scan_params(scope)?; let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?; @@ -119,7 +112,9 @@ impl FsSourceConnector { &entry.path, &root, ); - let ext = entry.path.extension() + let ext = entry + .path + .extension() .map(|e| format!(".{}", e.to_string_lossy())) .unwrap_or_default(); fs_skips.events.push(FsSkipEvent { @@ -129,13 +124,8 @@ impl FsSourceConnector { }); } SkipCategory::Gitignore => { - fs_skips.skipped_gitignore = - fs_skips.skipped_gitignore.saturating_add(1); - push_sample( - &mut fs_skips.skip_examples.gitignore, - &entry.path, - &root, - ); + fs_skips.skipped_gitignore = fs_skips.skipped_gitignore.saturating_add(1); + push_sample(&mut fs_skips.skip_examples.gitignore, &entry.path, &root); fs_skips.events.push(FsSkipEvent { doc_path, reason: "gitignore", @@ -143,8 +133,7 @@ impl FsSourceConnector { }); } SkipCategory::Kebabignore => { - 
fs_skips.skipped_kebabignore = - fs_skips.skipped_kebabignore.saturating_add(1); + fs_skips.skipped_kebabignore = fs_skips.skipped_kebabignore.saturating_add(1); // kebabignore intentionally NOT in skip_examples per spec §5.5. fs_skips.events.push(FsSkipEvent { doc_path, @@ -171,13 +160,8 @@ impl FsSourceConnector { if self.skip_generated_header && crate::code_meta::is_generated_file(&abs_path).unwrap_or(false) { - fs_skips.skipped_generated = - fs_skips.skipped_generated.saturating_add(1); - push_sample( - &mut fs_skips.skip_examples.generated, - &abs_path, - &root, - ); + fs_skips.skipped_generated = fs_skips.skipped_generated.saturating_add(1); + push_sample(&mut fs_skips.skip_examples.generated, &abs_path, &root); tracing::debug!( path = %rel_path.display(), "skip: generated-file marker detected" @@ -201,13 +185,8 @@ impl FsSourceConnector { ) .unwrap_or(false) { - fs_skips.skipped_size_exceeded = - fs_skips.skipped_size_exceeded.saturating_add(1); - push_sample( - &mut fs_skips.skip_examples.size_exceeded, - &abs_path, - &root, - ); + fs_skips.skipped_size_exceeded = fs_skips.skipped_size_exceeded.saturating_add(1); + push_sample(&mut fs_skips.skip_examples.size_exceeded, &abs_path, &root); tracing::debug!( path = %rel_path.display(), max_bytes = self.max_file_bytes, @@ -295,8 +274,8 @@ fn build_assets( }; let media_type = media_type_for(abs); - let (byte_len, full_hex) = hash_file(abs) - .with_context(|| format!("hashing {}", abs.display()))?; + let (byte_len, full_hex) = + hash_file(abs).with_context(|| format!("hashing {}", abs.display()))?; let checksum = Checksum(full_hex.clone()); let asset_id = id_for_asset(&full_hex); @@ -325,7 +304,6 @@ fn build_assets( Ok(assets) } - impl SourceConnector for FsSourceConnector { fn scan(&self, scope: &SourceScope) -> Result> { // Delegate to scan_with_skips; discard the skip counts. 
@@ -356,10 +334,7 @@ mod tests { #[test] fn scan_empty_dir_yields_empty_vec() { let dir = tempfile::tempdir().unwrap(); - let conn = FsSourceConnector::new(&cfg_with_root( - dir.path().to_str().unwrap(), - )) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap(); let scope = SourceScope::default(); let v = conn.scan(&scope).unwrap(); assert!(v.is_empty()); @@ -374,9 +349,7 @@ mod tests { std::fs::write(root.join("notes/beta.md"), b"b").unwrap(); std::fs::write(root.join("notes/alpha.md"), b"a").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); assert_eq!( @@ -397,9 +370,7 @@ mod tests { std::fs::write(root.join("a.md"), b"x").unwrap(); std::fs::write(root.join("b.tmp"), b"x").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); // Decision: `.kebabignore` itself IS emitted as a RawAsset (MediaType::Other("")). 
@@ -423,9 +394,7 @@ mod tests { std::fs::write(root.join(".DS_Store"), b"\0\0").unwrap(); std::fs::write(root.join("._sidecar"), b"\0\0").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); assert_eq!(names, vec!["a.md".to_string()]); @@ -447,8 +416,14 @@ mod tests { let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); assert!(names.contains(&"a.md".to_string())); - assert!(!names.contains(&"b.tmp".to_string()), "kbignore should drop *.tmp"); - assert!(!names.contains(&"c.log".to_string()), "config.exclude should drop *.log"); + assert!( + !names.contains(&"b.tmp".to_string()), + "kbignore should drop *.tmp" + ); + assert!( + !names.contains(&"c.log".to_string()), + "config.exclude should drop *.log" + ); } #[test] @@ -457,9 +432,7 @@ mod tests { let root = dir.path(); std::fs::write(root.join("hello.md"), b"hello world").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); assert_eq!(v.len(), 1); let asset = &v[0]; @@ -482,9 +455,7 @@ mod tests { std::fs::write(root.join("notes/a.md"), b"alpha").unwrap(); std::fs::write(root.join("notes/b.md"), b"beta").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v1 = conn.scan(&SourceScope::default()).unwrap(); let v2 = conn.scan(&SourceScope::default()).unwrap(); assert_eq!(v1.len(), v2.len()); @@ -514,9 +485,7 @@ mod tests { 
std::fs::create_dir_all(root.join("a/b/c")).unwrap(); std::fs::write(root.join("a/b/c/d.md"), b"x").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); assert_eq!(v.len(), 1); let p = &v[0].workspace_path.0; @@ -536,9 +505,7 @@ mod tests { std::fs::write(root.join("ok.md"), b"x").unwrap(); std::fs::write(root.join("has#hash.md"), b"y").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); assert_eq!(names, vec!["ok.md".to_string()]); @@ -581,9 +548,7 @@ mod tests { std::fs::write(root.join("ok.md"), b"# ok").unwrap(); std::fs::write(root.join("skipme.log"), b"x").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -592,7 +557,11 @@ mod tests { skips.skipped_gitignore ); assert!( - skips.skip_examples.gitignore.iter().any(|p| p.contains("skipme.log")), + skips + .skip_examples + .gitignore + .iter() + .any(|p| p.contains("skipme.log")), "skip_examples.gitignore should contain 'skipme.log'; got: {:?}", skips.skip_examples.gitignore ); @@ -608,9 +577,7 @@ mod tests { std::fs::write(root.join("node_modules/foo/bar.js"), b"x").unwrap(); std::fs::write(root.join("ok.md"), b"# ok").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let (_assets, skips) = 
conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -619,7 +586,11 @@ mod tests { skips.skipped_builtin_blacklist ); assert!( - skips.skip_examples.builtin_blacklist.iter().any(|p| p.contains("node_modules")), + skips + .skip_examples + .builtin_blacklist + .iter() + .any(|p| p.contains("node_modules")), "skip_examples.builtin_blacklist should contain a node_modules path; got: {:?}", skips.skip_examples.builtin_blacklist ); @@ -633,9 +604,7 @@ mod tests { std::fs::write(root.join("ok.md"), b"x").unwrap(); std::fs::write(root.join("creds.secret"), b"pw").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -667,9 +636,7 @@ mod tests { std::fs::write(root.join("node_modules/pkg/index.js"), b"x").unwrap(); std::fs::write(root.join("ok.md"), b"x").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -695,9 +662,7 @@ mod tests { } std::fs::write(root.join("ok.md"), b"x").unwrap(); - let conn = - FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert_eq!(skips.skipped_gitignore, 7, "should count all 7"); @@ -733,10 +698,7 @@ mod tests { std::fs::write(root.join("normal.md"), "# hi").unwrap(); std::fs::write(root.join("autogen.rs"), "// @generated\nfn x() {}\n").unwrap(); - let conn = FsSourceConnector::new( - &cfg_with_root_defaults(root.to_str().unwrap()), - ) - .unwrap(); + let conn = 
FsSourceConnector::new(&cfg_with_root_defaults(root.to_str().unwrap())).unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -745,15 +707,16 @@ mod tests { skips.skipped_generated ); assert!( - skips.skip_examples.generated.iter().any(|p| p.contains("autogen")), + skips + .skip_examples + .generated + .iter() + .any(|p| p.contains("autogen")), "skip_examples.generated should contain 'autogen'; got: {:?}", skips.skip_examples.generated ); // The normal.md file must NOT be skipped. - let asset_paths: Vec<_> = _assets - .iter() - .map(|a| a.workspace_path.0.clone()) - .collect(); + let asset_paths: Vec<_> = _assets.iter().map(|a| a.workspace_path.0.clone()).collect(); assert!( asset_paths.iter().any(|p| p.contains("normal")), "normal.md should still be emitted; assets: {asset_paths:?}" @@ -769,10 +732,8 @@ mod tests { let big: String = "x\n".repeat(1_000); std::fs::write(root.join("huge.rs"), &big).unwrap(); - let conn = FsSourceConnector::new( - &cfg_with_size_cap(root.to_str().unwrap(), 1024, 5_000), - ) - .unwrap(); + let conn = FsSourceConnector::new(&cfg_with_size_cap(root.to_str().unwrap(), 1024, 5_000)) + .unwrap(); let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -781,7 +742,11 @@ mod tests { skips.skipped_size_exceeded ); assert!( - skips.skip_examples.size_exceeded.iter().any(|p| p.contains("huge")), + skips + .skip_examples + .size_exceeded + .iter() + .any(|p| p.contains("huge")), "skip_examples.size_exceeded should contain 'huge'; got: {:?}", skips.skip_examples.size_exceeded ); @@ -795,10 +760,9 @@ mod tests { let body: String = "x\n".repeat(6_000); std::fs::write(root.join("longfile.rs"), &body).unwrap(); - let conn = FsSourceConnector::new( - &cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000), - ) - .unwrap(); + let conn = + FsSourceConnector::new(&cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000)) + .unwrap(); let (_assets, skips) = 
conn.scan_with_skips(&SourceScope::default()).unwrap(); assert!( @@ -824,9 +788,18 @@ mod tests { let (assets, skips) = connector.scan_with_skips(&SourceScope::default()).unwrap(); let paths: Vec<_> = assets.iter().map(|a| a.workspace_path.0.as_str()).collect(); - assert!(paths.iter().any(|p| p.contains("paper.pdf")), "PDF must pass: {paths:?}"); - assert!(paths.iter().any(|p| p.contains("notes.md")), "MD must pass: {paths:?}"); - assert!(!paths.iter().any(|p| p.contains("big.rs")), "code file must skip: {paths:?}"); + assert!( + paths.iter().any(|p| p.contains("paper.pdf")), + "PDF must pass: {paths:?}" + ); + assert!( + paths.iter().any(|p| p.contains("notes.md")), + "MD must pass: {paths:?}" + ); + assert!( + !paths.iter().any(|p| p.contains("big.rs")), + "code file must skip: {paths:?}" + ); assert_eq!(skips.skip_examples.size_exceeded.len(), 1); assert!(skips.skip_examples.size_exceeded[0].contains("big.rs")); diff --git a/crates/kebab-source-fs/src/hash.rs b/crates/kebab-source-fs/src/hash.rs index 6ab61d0..ac4d09a 100644 --- a/crates/kebab-source-fs/src/hash.rs +++ b/crates/kebab-source-fs/src/hash.rs @@ -83,7 +83,10 @@ mod tests { /// chunk boundary is invisible. 
#[test] fn streaming_matches_oneshot_over_buffer_boundary() { - let bytes: Vec = (0u8..=255u8).cycle().take(READ_BUFFER_BYTES * 3 + 17).collect(); + let bytes: Vec = (0u8..=255u8) + .cycle() + .take(READ_BUFFER_BYTES * 3 + 17) + .collect(); let (n, streamed) = hash_reader(&bytes[..]).unwrap(); assert_eq!(n, bytes.len() as u64); let oneshot = blake3::hash(&bytes).to_hex().to_string(); diff --git a/crates/kebab-source-fs/src/media.rs b/crates/kebab-source-fs/src/media.rs index 7ea35f5..ce856c5 100644 --- a/crates/kebab-source-fs/src/media.rs +++ b/crates/kebab-source-fs/src/media.rs @@ -86,46 +86,97 @@ mod tests { MediaType::Code("rust".to_string()) ); // Cargo.toml is a Tier 2 code manifest (p10-2), handled by code_lang_for_path - assert_eq!(media_type_for(Path::new("Cargo.toml")), MediaType::Code("toml".to_string())); + assert_eq!( + media_type_for(Path::new("Cargo.toml")), + MediaType::Code("toml".to_string()) + ); } #[test] fn py_ts_js_files_map_to_media_code() { - assert_eq!(media_type_for(Path::new("a/b.py")), MediaType::Code("python".into())); - assert_eq!(media_type_for(Path::new("a/b.pyi")), MediaType::Code("python".into())); - assert_eq!(media_type_for(Path::new("a/b.ts")), MediaType::Code("typescript".into())); - assert_eq!(media_type_for(Path::new("a/b.tsx")), MediaType::Code("typescript".into())); - assert_eq!(media_type_for(Path::new("a/b.js")), MediaType::Code("javascript".into())); - assert_eq!(media_type_for(Path::new("a/b.mjs")), MediaType::Code("javascript".into())); - assert_eq!(media_type_for(Path::new("a/b.cjs")), MediaType::Code("javascript".into())); - assert_eq!(media_type_for(Path::new("a/b.jsx")), MediaType::Code("javascript".into())); - assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into())); + assert_eq!( + media_type_for(Path::new("a/b.py")), + MediaType::Code("python".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.pyi")), + MediaType::Code("python".into()) + ); + assert_eq!( + 
media_type_for(Path::new("a/b.ts")), + MediaType::Code("typescript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.tsx")), + MediaType::Code("typescript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.js")), + MediaType::Code("javascript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.mjs")), + MediaType::Code("javascript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.cjs")), + MediaType::Code("javascript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.jsx")), + MediaType::Code("javascript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.rs")), + MediaType::Code("rust".into()) + ); } #[test] fn ts_variants_mts_cts() { // .mts / .cts are TypeScript ESM / CommonJS — same grammar as .ts. - assert_eq!(media_type_for(Path::new("a/b.mts")), MediaType::Code("typescript".into())); - assert_eq!(media_type_for(Path::new("a/b.cts")), MediaType::Code("typescript".into())); + assert_eq!( + media_type_for(Path::new("a/b.mts")), + MediaType::Code("typescript".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.cts")), + MediaType::Code("typescript".into()) + ); } #[test] fn mdx_routes_to_markdown() { // MDX is markdown with JSX islands; the md parser folds the JSX // through as raw passthrough. 
- assert_eq!(media_type_for(Path::new("docs/page.mdx")), MediaType::Markdown); + assert_eq!( + media_type_for(Path::new("docs/page.mdx")), + MediaType::Markdown + ); } #[test] fn go_files_map_to_media_code_go() { - assert_eq!(media_type_for(Path::new("a/b.go")), MediaType::Code("go".into())); + assert_eq!( + media_type_for(Path::new("a/b.go")), + MediaType::Code("go".into()) + ); } #[test] fn java_kotlin_files_map_to_media_code() { - assert_eq!(media_type_for(Path::new("a/b.java")), MediaType::Code("java".into())); - assert_eq!(media_type_for(Path::new("a/b.kt")), MediaType::Code("kotlin".into())); - assert_eq!(media_type_for(Path::new("a/b.kts")), MediaType::Code("kotlin".into())); + assert_eq!( + media_type_for(Path::new("a/b.java")), + MediaType::Code("java".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.kt")), + MediaType::Code("kotlin".into()) + ); + assert_eq!( + media_type_for(Path::new("a/b.kts")), + MediaType::Code("kotlin".into()) + ); } #[test] @@ -142,11 +193,29 @@ mod tests { #[test] fn tier2_files_map_to_media_code() { - assert_eq!(media_type_for(Path::new("a/deploy.yaml")), MediaType::Code("yaml".into())); - assert_eq!(media_type_for(Path::new("a/Dockerfile")), MediaType::Code("dockerfile".into())); - assert_eq!(media_type_for(Path::new("a/Cargo.toml")), MediaType::Code("toml".into())); - assert_eq!(media_type_for(Path::new("a/pom.xml")), MediaType::Code("xml".into())); - assert_eq!(media_type_for(Path::new("a/build.gradle")), MediaType::Code("groovy".into())); - assert_eq!(media_type_for(Path::new("a/go.mod")), MediaType::Code("go-mod".into())); + assert_eq!( + media_type_for(Path::new("a/deploy.yaml")), + MediaType::Code("yaml".into()) + ); + assert_eq!( + media_type_for(Path::new("a/Dockerfile")), + MediaType::Code("dockerfile".into()) + ); + assert_eq!( + media_type_for(Path::new("a/Cargo.toml")), + MediaType::Code("toml".into()) + ); + assert_eq!( + media_type_for(Path::new("a/pom.xml")), + MediaType::Code("xml".into()) + ); + 
assert_eq!( + media_type_for(Path::new("a/build.gradle")), + MediaType::Code("groovy".into()) + ); + assert_eq!( + media_type_for(Path::new("a/go.mod")), + MediaType::Code("go-mod".into()) + ); } } diff --git a/crates/kebab-source-fs/src/walker.rs b/crates/kebab-source-fs/src/walker.rs index 4f8a51e..5c866d7 100644 --- a/crates/kebab-source-fs/src/walker.rs +++ b/crates/kebab-source-fs/src/walker.rs @@ -141,7 +141,9 @@ fn build_builtin_matcher(root: &Path) -> Result { .with_context(|| format!("builtin dir pattern: {dir_pat}"))?; } } - builder.build().context("failed to compile builtin override") + builder + .build() + .context("failed to compile builtin override") } /// Owned-string variant of `build_single_matcher` for caller-supplied @@ -182,8 +184,13 @@ pub(crate) fn build_overrides( let gitignore_patterns = read_gitignore(root)?; // Per-source matchers (for attribution only). - let gitignore = - build_single_matcher(root, &gitignore_patterns.iter().map(std::string::String::as_str).collect::>())?; + let gitignore = build_single_matcher( + root, + &gitignore_patterns + .iter() + .map(std::string::String::as_str) + .collect::>(), + )?; let kebabignore = build_single_matcher_owned(root, kbignore_patterns)?; // Use the directory-aware builtin matcher so that `is_dir=true` checks on // directory entries (e.g., `node_modules/`) are attributed to builtin rather @@ -445,7 +452,6 @@ pub(crate) fn walk_files_with_skips( Ok((accepted, skipped)) } - #[cfg(test)] mod tests { use super::*; @@ -463,13 +469,25 @@ mod tests { fn default_excludes_ds_store_and_resource_forks() { let dir = tempfile::tempdir().unwrap(); let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap(); - assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore()); assert!( - ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore() + ov.combined + .matched(Path::new(".DS_Store"), false) + .is_ignore() ); - assert!(ov.combined.matched(Path::new("._foo.md"), false).is_ignore()); 
assert!( - ov.combined.matched(Path::new("notes/._sidecar"), false).is_ignore() + ov.combined + .matched(Path::new("notes/.DS_Store"), false) + .is_ignore() + ); + assert!( + ov.combined + .matched(Path::new("._foo.md"), false) + .is_ignore() + ); + assert!( + ov.combined + .matched(Path::new("notes/._sidecar"), false) + .is_ignore() ); } @@ -484,12 +502,21 @@ mod tests { ) .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); - assert!(ov.combined.matched(Path::new("notes/x.tmp"), false).is_ignore()); assert!( - ov.combined.matched(Path::new("node_modules/foo/bar.js"), false) + ov.combined + .matched(Path::new("notes/x.tmp"), false) + .is_ignore() + ); + assert!( + ov.combined + .matched(Path::new("node_modules/foo/bar.js"), false) + .is_ignore() + ); + assert!( + !ov.combined + .matched(Path::new("alpha.md"), false) .is_ignore() ); - assert!(!ov.combined.matched(Path::new("alpha.md"), false).is_ignore()); } #[test] @@ -505,7 +532,9 @@ mod tests { .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); assert!( - ov.combined.matched(Path::new("secret/key.md"), false).is_ignore() + ov.combined + .matched(Path::new("secret/key.md"), false) + .is_ignore() ); } @@ -543,10 +572,15 @@ mod tests { let overrides = build_overrides(root, &[], &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. 
let m_in = overrides.combined.matched(Path::new("src/main.rs"), false); - let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false); + let m_out = overrides + .combined + .matched(Path::new("node_modules/foo/bar.js"), false); assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored"); - assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored"); + assert!( + m_out.is_ignore(), + "node_modules/foo/bar.js SHOULD be ignored" + ); } #[test] @@ -594,9 +628,24 @@ mod tests { fs::write(root.join("dist/bundle.js"), "x").unwrap(); let overrides = build_overrides(root, &[], &[], &[]).unwrap(); - assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore()); - assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore()); - assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); + assert!( + overrides + .combined + .matched(Path::new("a.log"), false) + .is_ignore() + ); + assert!( + overrides + .combined + .matched(Path::new("dist/bundle.js"), false) + .is_ignore() + ); + assert!( + !overrides + .combined + .matched(Path::new("src/main.rs"), false) + .is_ignore() + ); } #[test] @@ -612,8 +661,18 @@ mod tests { // No .gitignore present — patterns from .gitignore should not affect overrides. let overrides = build_overrides(root, &[], &[], &[]).unwrap(); - assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore()); - assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); + assert!( + !overrides + .combined + .matched(Path::new("a.log"), false) + .is_ignore() + ); + assert!( + !overrides + .combined + .matched(Path::new("src/main.rs"), false) + .is_ignore() + ); } #[test] @@ -627,7 +686,11 @@ mod tests { fs::write(root.join(".gitignore"), "!keep/\n").unwrap(); // Just verify build_overrides doesn't error. 
let result = build_overrides(root, &[], &[], &[]); - assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err()); + assert!( + result.is_ok(), + "should not error on negation pattern: {:?}", + result.err() + ); } // ── Skip attribution tests ──────────────────────────────────────────────── @@ -646,7 +709,11 @@ mod tests { let ov = build_overrides(root, &[], &[], &[]).unwrap(); // node_modules/ dir itself let cat = classify_skip(Path::new("node_modules"), true, &ov); - assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority"); + assert_eq!( + cat, + SkipCategory::BuiltinBlacklist, + "builtin must have priority" + ); } #[test] @@ -707,8 +774,9 @@ mod tests { .filter(|e| e.category == SkipCategory::Gitignore) .collect(); assert!( - gitignore_skipped.iter().any(|e| e.path.file_name() - .is_some_and(|n| n == "skipme.log")), + gitignore_skipped + .iter() + .any(|e| e.path.file_name().is_some_and(|n| n == "skipme.log")), "skipme.log should appear in gitignore_skipped; skipped: {:?}", skipped_entries.iter().map(|e| &e.path).collect::>() ); @@ -746,8 +814,9 @@ mod tests { "node_modules/ should produce at least one BuiltinBlacklist skip" ); assert!( - builtin_skipped.iter().any(|e| e.path.components() - .any(|c| c.as_os_str() == "node_modules")), + builtin_skipped + .iter() + .any(|e| e.path.components().any(|c| c.as_os_str() == "node_modules")), "skipped path should contain node_modules; got: {:?}", builtin_skipped.iter().map(|e| &e.path).collect::>() ); diff --git a/crates/kebab-source-fs/tests/include_allowlist.rs b/crates/kebab-source-fs/tests/include_allowlist.rs index 3ccf7d1..dacf4f8 100644 --- a/crates/kebab-source-fs/tests/include_allowlist.rs +++ b/crates/kebab-source-fs/tests/include_allowlist.rs @@ -50,10 +50,22 @@ fn include_empty_accepts_all_files() { }; let assets = conn.scan(&scope).unwrap(); let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); - 
assert!(names.contains(&"a.md".to_string()), "a.md missing; got: {names:?}"); - assert!(names.contains(&"b.py".to_string()), "b.py missing; got: {names:?}"); - assert!(names.contains(&"c.png".to_string()), "c.png missing; got: {names:?}"); - assert!(names.contains(&"d.pdf".to_string()), "d.pdf missing; got: {names:?}"); + assert!( + names.contains(&"a.md".to_string()), + "a.md missing; got: {names:?}" + ); + assert!( + names.contains(&"b.py".to_string()), + "b.py missing; got: {names:?}" + ); + assert!( + names.contains(&"c.png".to_string()), + "c.png missing; got: {names:?}" + ); + assert!( + names.contains(&"d.pdf".to_string()), + "d.pdf missing; got: {names:?}" + ); assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}"); } @@ -68,8 +80,14 @@ fn include_nonempty_is_allowlist() { }; let assets = conn.scan(&scope).unwrap(); let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); - assert!(names.contains(&"a.md".to_string()), "a.md should be accepted; got: {names:?}"); - assert!(names.contains(&"b.py".to_string()), "b.py should be accepted; got: {names:?}"); + assert!( + names.contains(&"a.md".to_string()), + "a.md should be accepted; got: {names:?}" + ); + assert!( + names.contains(&"b.py".to_string()), + "b.py should be accepted; got: {names:?}" + ); assert!( !names.contains(&"c.png".to_string()), "c.png must be rejected by include allowlist; got: {names:?}" @@ -99,7 +117,10 @@ fn include_and_exclude_are_anded() { }; let assets = conn.scan(&scope).unwrap(); let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); - assert!(names.contains(&"keep.md".to_string()), "keep.md should be accepted; got: {names:?}"); + assert!( + names.contains(&"keep.md".to_string()), + "keep.md should be accepted; got: {names:?}" + ); assert!( !names.contains(&"drop.md".to_string()), "drop.md should be excluded (matched exclude); got: {names:?}" diff --git a/crates/kebab-source-fs/tests/snapshot_tree1.rs 
b/crates/kebab-source-fs/tests/snapshot_tree1.rs index 3578710..08d89d3 100644 --- a/crates/kebab-source-fs/tests/snapshot_tree1.rs +++ b/crates/kebab-source-fs/tests/snapshot_tree1.rs @@ -117,8 +117,7 @@ fn tree_1_snapshot_matches_baseline() { baseline_path().display() ) }); - let expected: Value = serde_json::from_str(&baseline_text) - .expect("baseline JSON must parse"); + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline JSON must parse"); if actual != expected { let actual_pretty = serde_json::to_string_pretty(&actual).unwrap(); diff --git a/crates/kebab-source-fs/tests/symlink_cycle.rs b/crates/kebab-source-fs/tests/symlink_cycle.rs index 52fbcaa..8250c26 100644 --- a/crates/kebab-source-fs/tests/symlink_cycle.rs +++ b/crates/kebab-source-fs/tests/symlink_cycle.rs @@ -37,8 +37,8 @@ fn symlink_cycle_does_not_loop_or_crash() { // Symlink: root/notes → root (a → a cycle through the link `notes`). symlink(root, root.join("notes")).unwrap(); - let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .expect("connector init"); + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).expect("connector init"); let v = conn .scan(&SourceScope::default()) .expect("scan must return, not loop"); @@ -78,12 +78,14 @@ fn dangling_symlink_pseudo_cycle_does_not_crash() { symlink(root.join("b"), root.join("a")).unwrap(); symlink(root.join("a"), root.join("b")).unwrap(); - let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .expect("connector init"); + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).expect("connector init"); // Even though a→b→a never resolves to a real directory (broken // pseudo-cycle of dangling symlinks), the scan must complete and // surface alpha.md. 
- let v = conn.scan(&SourceScope::default()).expect("scan must return"); + let v = conn + .scan(&SourceScope::default()) + .expect("scan must return"); assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md")); } @@ -113,13 +115,17 @@ fn two_step_directory_cycle_visited_set_breaks_loop() { symlink("../b", root.join("a/loop")).unwrap(); symlink("../a", root.join("b/loop")).unwrap(); - let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) - .expect("connector init"); + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).expect("connector init"); // Run scan twice — both must terminate AND produce identical // workspace_path lists (visited-set is deterministic per scan). - let v1 = conn.scan(&SourceScope::default()).expect("scan must return"); - let v2 = conn.scan(&SourceScope::default()).expect("scan must return"); + let v1 = conn + .scan(&SourceScope::default()) + .expect("scan must return"); + let v2 = conn + .scan(&SourceScope::default()) + .expect("scan must return"); let names1: Vec = v1.iter().map(|a| a.workspace_path.0.clone()).collect(); let names2: Vec = v2.iter().map(|a| a.workspace_path.0.clone()).collect(); @@ -140,11 +146,13 @@ fn two_step_directory_cycle_visited_set_breaks_loop() { // paths depend on which side of the cycle the walker descended into // first; assert by basename to keep the check robust. 
assert!( - v1.iter().any(|a| a.workspace_path.0.ends_with("inside_a.md")), + v1.iter() + .any(|a| a.workspace_path.0.ends_with("inside_a.md")), "expected inside_a.md in scan output, got: {names1:?}" ); assert!( - v1.iter().any(|a| a.workspace_path.0.ends_with("inside_b.md")), + v1.iter() + .any(|a| a.workspace_path.0.ends_with("inside_b.md")), "expected inside_b.md in scan output, got: {names1:?}" ); diff --git a/crates/kebab-store-sqlite/src/answers.rs b/crates/kebab-store-sqlite/src/answers.rs index 3e870bf..4a4899a 100644 --- a/crates/kebab-store-sqlite/src/answers.rs +++ b/crates/kebab-store-sqlite/src/answers.rs @@ -37,14 +37,16 @@ impl SqliteStore { .created_at .format(&time::format_description::well_known::Rfc3339) .context("format answer.created_at")?; - let citations_json = serde_json::to_string(&answer.citations) - .context("serialize answer.citations")?; + let citations_json = + serde_json::to_string(&answer.citations).context("serialize answer.citations")?; let refusal_label: Option<&'static str> = answer.refusal_reason.as_ref().map(refusal_reason_label); let mode_label = search_mode_label(&answer.retrieval.mode); let embedding_id: Option<&str> = answer.embedding.as_ref().map(|m| m.id.as_str()); - let embedding_dim: Option = - answer.embedding.as_ref().and_then(|m| m.dimensions.map(|d| d as i64)); + let embedding_dim: Option = answer + .embedding + .as_ref() + .and_then(|m| m.dimensions.map(|d| d as i64)); let conn = self.lock_conn(); conn.execute( diff --git a/crates/kebab-store-sqlite/src/documents.rs b/crates/kebab-store-sqlite/src/documents.rs index 02b9b26..e09745e 100644 --- a/crates/kebab-store-sqlite/src/documents.rs +++ b/crates/kebab-store-sqlite/src/documents.rs @@ -40,11 +40,7 @@ impl kebab_core::DocumentStore for SqliteStore { } }; let conn = self.lock_conn(); - purge_orphan_at_workspace_path( - &conn, - &asset.workspace_path.0, - &asset.asset_id.0, - )?; + purge_orphan_at_workspace_path(&conn, &asset.workspace_path.0, &asset.asset_id.0)?; 
upsert_asset_row(&conn, asset, storage_kind, &storage_path) } @@ -57,11 +53,7 @@ impl kebab_core::DocumentStore for SqliteStore { Ok(()) } - fn put_blocks( - &self, - doc: &kebab_core::DocumentId, - blocks: &[kebab_core::Block], - ) -> Result<()> { + fn put_blocks(&self, doc: &kebab_core::DocumentId, blocks: &[kebab_core::Block]) -> Result<()> { let mut conn = self.lock_conn(); let tx = conn.transaction().map_err(StoreError::from)?; // DELETE-then-INSERT: §5.4 has no UNIQUE on (doc_id, ordinal) @@ -100,11 +92,7 @@ impl kebab_core::DocumentStore for SqliteStore { Ok(()) } - fn put_chunks( - &self, - doc: &kebab_core::DocumentId, - chunks: &[kebab_core::Chunk], - ) -> Result<()> { + fn put_chunks(&self, doc: &kebab_core::DocumentId, chunks: &[kebab_core::Chunk]) -> Result<()> { let now = OffsetDateTime::now_utc() .format(&time::format_description::well_known::Rfc3339) .context("format chunk created_at")?; @@ -126,8 +114,8 @@ impl kebab_core::DocumentStore for SqliteStore { .context("serialize chunk.heading_path")?; let source_spans = serde_json::to_string(&chunk.source_spans) .context("serialize chunk.source_spans")?; - let block_ids = serde_json::to_string(&chunk.block_ids) - .context("serialize chunk.block_ids")?; + let block_ids = + serde_json::to_string(&chunk.block_ids).context("serialize chunk.block_ids")?; // §5.5 has a `section_label` column but the in-memory Chunk // struct does not carry it (nor does the wire schema §2.6). // Persist NULL until a future bump introduces the field. 
@@ -193,16 +181,15 @@ impl kebab_core::DocumentStore for SqliteStore { let mut blocks: Vec = Vec::new(); for row in block_rows { let payload_json = row.map_err(StoreError::from)?; - let block: kebab_core::Block = serde_json::from_str(&payload_json) - .context("deserialize block payload_json")?; + let block: kebab_core::Block = + serde_json::from_str(&payload_json).context("deserialize block payload_json")?; blocks.push(block); } - let metadata: kebab_core::Metadata = serde_json::from_str(&row.metadata_json) - .context("deserialize metadata_json")?; + let metadata: kebab_core::Metadata = + serde_json::from_str(&row.metadata_json).context("deserialize metadata_json")?; let provenance: kebab_core::Provenance = - serde_json::from_str(&row.provenance_json) - .context("deserialize provenance_json")?; + serde_json::from_str(&row.provenance_json).context("deserialize provenance_json")?; Ok(Some(kebab_core::CanonicalDocument { doc_id: kebab_core::DocumentId(row.doc_id), @@ -248,9 +235,8 @@ impl kebab_core::DocumentStore for SqliteStore { let source_spans: Vec = serde_json::from_str(&row.source_spans_json) .context("deserialize chunk.source_spans_json")?; - let block_ids: Vec = - serde_json::from_str(&row.block_ids_json) - .context("deserialize chunk.block_ids_json")?; + let block_ids: Vec = serde_json::from_str(&row.block_ids_json) + .context("deserialize chunk.block_ids_json")?; Ok(Some(kebab_core::Chunk { chunk_id: kebab_core::ChunkId(row.chunk_id), doc_id: kebab_core::DocumentId(row.doc_id), @@ -264,10 +250,7 @@ impl kebab_core::DocumentStore for SqliteStore { })) } - fn get_asset( - &self, - id: &kebab_core::AssetId, - ) -> Result> { + fn get_asset(&self, id: &kebab_core::AssetId) -> Result> { let conn = self.lock_conn(); let result = conn.query_row( r"SELECT @@ -346,16 +329,15 @@ impl kebab_core::DocumentStore for SqliteStore { let mut blocks: Vec = Vec::new(); for block_row in block_rows { let payload_json = block_row.map_err(StoreError::from)?; - let block: 
kebab_core::Block = serde_json::from_str(&payload_json) - .context("deserialize block payload_json")?; + let block: kebab_core::Block = + serde_json::from_str(&payload_json).context("deserialize block payload_json")?; blocks.push(block); } - let metadata: kebab_core::Metadata = serde_json::from_str(&row.metadata_json) - .context("deserialize metadata_json")?; + let metadata: kebab_core::Metadata = + serde_json::from_str(&row.metadata_json).context("deserialize metadata_json")?; let provenance: kebab_core::Provenance = - serde_json::from_str(&row.provenance_json) - .context("deserialize provenance_json")?; + serde_json::from_str(&row.provenance_json).context("deserialize provenance_json")?; Ok(Some(kebab_core::CanonicalDocument { doc_id, @@ -421,12 +403,14 @@ impl kebab_core::DocumentStore for SqliteStore { if let Some(trust_min) = &filter.trust_min { // Map the enum to its rank: Generated < Secondary < Primary. // (Higher trust strictly contains lower trust.) - sql.push_str(" AND CASE d.trust_level + sql.push_str( + " AND CASE d.trust_level WHEN 'primary' THEN 3 WHEN 'secondary' THEN 2 WHEN 'generated' THEN 1 ELSE 0 - END >= ?"); + END >= ?", + ); let rank: i64 = match trust_min { kebab_core::TrustLevel::Primary => 3, kebab_core::TrustLevel::Secondary => 2, @@ -606,21 +590,27 @@ fn doc_summary_from_sql(row: &rusqlite::Row<'_>) -> rusqlite::Result) -> rusqlite::Result) -> rusqlite::Result, doc: &kebab_core::CanonicalDocument, ) -> Result<()> { - let metadata_json = serde_json::to_string(&doc.metadata) - .context("serialize metadata")?; - let provenance_json = serde_json::to_string(&doc.provenance) - .context("serialize provenance")?; + let metadata_json = serde_json::to_string(&doc.metadata).context("serialize metadata")?; + let provenance_json = serde_json::to_string(&doc.provenance).context("serialize provenance")?; // String form of the lowercase serde representation. 
We avoid // embedding `serde_json::to_string` quotes (`"markdown"` → just // `markdown` for the column). @@ -811,8 +804,11 @@ fn replace_document_tags( doc_id: &kebab_core::DocumentId, tags: &[String], ) -> Result<()> { - tx.execute("DELETE FROM document_tags WHERE doc_id = ?", params![doc_id.0]) - .map_err(StoreError::from)?; + tx.execute( + "DELETE FROM document_tags WHERE doc_id = ?", + params![doc_id.0], + ) + .map_err(StoreError::from)?; let mut stmt = tx .prepare( "INSERT INTO document_tags (doc_id, tag) VALUES (?, ?) diff --git a/crates/kebab-store-sqlite/src/embeddings.rs b/crates/kebab-store-sqlite/src/embeddings.rs index 40b19d0..348740a 100644 --- a/crates/kebab-store-sqlite/src/embeddings.rs +++ b/crates/kebab-store-sqlite/src/embeddings.rs @@ -54,10 +54,7 @@ impl SqliteStore { /// All rows are written in a single transaction; if any row fails /// the entire batch is rolled back and the caller can retry without /// worrying about partial pending state. - pub fn put_embedding_records_pending( - &self, - rows: &[EmbeddingRecordRow], - ) -> Result<()> { + pub fn put_embedding_records_pending(&self, rows: &[EmbeddingRecordRow]) -> Result<()> { if rows.is_empty() { return Ok(()); } @@ -107,10 +104,7 @@ impl SqliteStore { /// WHERE embedding_id IN (?, ?, …)`) inside one transaction — /// avoids the per-row `execute()` round-trip the previous /// implementation paid. 
- pub fn mark_embedding_records_committed( - &self, - embedding_ids: &[String], - ) -> Result<()> { + pub fn mark_embedding_records_committed(&self, embedding_ids: &[String]) -> Result<()> { if embedding_ids.is_empty() { return Ok(()); } @@ -208,7 +202,11 @@ mod tests { source_spans_json, token_estimate, chunker_version, policy_hash, block_ids_json, created_at ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)", - params![chunk_id, "fedcba9876543210fedcba9876543210", "1970-01-01T00:00:00Z"], + params![ + chunk_id, + "fedcba9876543210fedcba9876543210", + "1970-01-01T00:00:00Z" + ], ) .unwrap(); } diff --git a/crates/kebab-store-sqlite/src/eval.rs b/crates/kebab-store-sqlite/src/eval.rs index 007e01f..76b7e44 100644 --- a/crates/kebab-store-sqlite/src/eval.rs +++ b/crates/kebab-store-sqlite/src/eval.rs @@ -209,11 +209,9 @@ impl SqliteStore { Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None), Err(e) => return Err(StoreError::from(e).into()), }; - let created_at = OffsetDateTime::parse( - &created_str, - &time::format_description::well_known::Rfc3339, - ) - .with_context(|| format!("parse eval_runs.created_at for {run_id}"))?; + let created_at = + OffsetDateTime::parse(&created_str, &time::format_description::well_known::Rfc3339) + .with_context(|| format!("parse eval_runs.created_at for {run_id}"))?; Ok(Some(EvalRunRecord { run_id, suite, @@ -228,10 +226,7 @@ impl SqliteStore { /// `query_id` ASC for determinism (the table has no insertion-order /// column; query_id ordering matches the BTreeSet sort the loader /// uses for missing-id reporting). - pub fn load_eval_query_results( - &self, - run_id: &str, - ) -> Result> { + pub fn load_eval_query_results(&self, run_id: &str) -> Result> { let conn = self.lock_conn(); let mut stmt = conn .prepare( @@ -258,11 +253,7 @@ impl SqliteStore { /// `Err` (not `Ok(0)`) if the run is missing — we never want to /// silently drop computed metrics. Called once per run by /// P5-2's `store_aggregate`. 
- pub fn update_eval_run_aggregate( - &self, - run_id: &str, - aggregate_json: &str, - ) -> Result<()> { + pub fn update_eval_run_aggregate(&self, run_id: &str, aggregate_json: &str) -> Result<()> { let conn = self.lock_conn(); let updated = conn .execute( diff --git a/crates/kebab-store-sqlite/src/filters.rs b/crates/kebab-store-sqlite/src/filters.rs index 841f42a..f2e6648 100644 --- a/crates/kebab-store-sqlite/src/filters.rs +++ b/crates/kebab-store-sqlite/src/filters.rs @@ -23,7 +23,7 @@ use std::collections::{HashMap, HashSet}; use anyhow::{Context, Result}; -use rusqlite::{params_from_iter, ToSql}; +use rusqlite::{ToSql, params_from_iter}; use crate::store::SqliteStore; @@ -347,9 +347,7 @@ mod tests { .put_embedding_records_pending(std::slice::from_ref(&embed_row)) .unwrap(); store - .mark_embedding_records_committed(std::slice::from_ref( - &embed_row.embedding_id, - )) + .mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id)) .unwrap(); } @@ -430,9 +428,7 @@ mod tests { .put_embedding_records_pending(std::slice::from_ref(&embed_row)) .unwrap(); store - .mark_embedding_records_committed(std::slice::from_ref( - &embed_row.embedding_id, - )) + .mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id)) .unwrap(); } @@ -502,9 +498,7 @@ mod tests { .put_embedding_records_pending(std::slice::from_ref(&embed_row)) .unwrap(); store - .mark_embedding_records_committed(std::slice::from_ref( - &embed_row.embedding_id, - )) + .mark_embedding_records_committed(std::slice::from_ref(&embed_row.embedding_id)) .unwrap(); } @@ -573,10 +567,38 @@ mod tests { // c3: tags=[ko-style], lang=ko, secondary, notes/c.md // c4: tags=[ko-style], lang=en, generated, src/d.md let chunks = [ - ("11111111111111111111111111111111", "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "en", "primary", &["ko-style"][..]), - ("22222222222222222222222222222222", "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "notes/b.md", "en", "primary", &["other"][..]), - 
("33333333333333333333333333333333", "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "notes/c.md", "ko", "secondary", &["ko-style"][..]), - ("44444444444444444444444444444444", "d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4", "src/d.md", "en", "generated", &["ko-style"][..]), + ( + "11111111111111111111111111111111", + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "notes/a.md", + "en", + "primary", + &["ko-style"][..], + ), + ( + "22222222222222222222222222222222", + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "notes/b.md", + "en", + "primary", + &["other"][..], + ), + ( + "33333333333333333333333333333333", + "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "notes/c.md", + "ko", + "secondary", + &["ko-style"][..], + ), + ( + "44444444444444444444444444444444", + "d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4", + "src/d.md", + "en", + "generated", + &["ko-style"][..], + ), ]; for (c, d, p, l, t, tags) in &chunks { seed_committed(&store, c, d, p, l, tags, t); @@ -588,10 +610,7 @@ mod tests { ..Default::default() }; let out = store - .filter_chunks( - &chunks.iter().map(|c| cid(c.0)).collect::>(), - &f, - ) + .filter_chunks(&chunks.iter().map(|c| cid(c.0)).collect::>(), &f) .unwrap(); let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); got.sort_unstable(); @@ -604,10 +623,7 @@ mod tests { ..Default::default() }; let out = store - .filter_chunks( - &chunks.iter().map(|c| cid(c.0)).collect::>(), - &f, - ) + .filter_chunks(&chunks.iter().map(|c| cid(c.0)).collect::>(), &f) .unwrap(); let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); got.sort_unstable(); @@ -621,10 +637,7 @@ mod tests { ..Default::default() }; let out = store - .filter_chunks( - &chunks.iter().map(|c| cid(c.0)).collect::>(), - &f, - ) + .filter_chunks(&chunks.iter().map(|c| cid(c.0)).collect::>(), &f) .unwrap(); let got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); assert_eq!(got, vec![chunks[0].0]); @@ -635,10 +648,7 @@ mod tests { ..Default::default() }; let out = store - .filter_chunks( - &chunks.iter().map(|c| 
cid(c.0)).collect::>(), - &f, - ) + .filter_chunks(&chunks.iter().map(|c| cid(c.0)).collect::>(), &f) .unwrap(); let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); got.sort_unstable(); @@ -652,9 +662,33 @@ mod tests { let c1 = "11111111111111111111111111111111"; let c2 = "22222222222222222222222222222222"; let c3 = "33333333333333333333333333333333"; - seed_committed(&store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "a.md", "en", &[], "primary"); - seed_committed(&store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "b.md", "en", &[], "primary"); - seed_committed(&store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "c.md", "en", &[], "primary"); + seed_committed( + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "a.md", + "en", + &[], + "primary", + ); + seed_committed( + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "b.md", + "en", + &[], + "primary", + ); + seed_committed( + &store, + c3, + "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "c.md", + "en", + &[], + "primary", + ); // Ask in the order c3, c1, c2; result must preserve that order. 
let out = store @@ -688,14 +722,24 @@ mod tests { let c1 = "11111111111111111111111111111111"; let c2 = "22222222222222222222222222222222"; seed_committed_full( - &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", - "notes/a.md", "en", &[], "primary", + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "notes/a.md", + "en", + &[], + "primary", r#""markdown""#, "1970-01-01T00:00:00Z", ); seed_committed_full( - &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", - "notes/b.pdf", "en", &[], "primary", + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "notes/b.pdf", + "en", + &[], + "primary", r#""pdf""#, "1970-01-01T00:00:00Z", ); @@ -704,10 +748,12 @@ mod tests { media: vec!["pdf".to_string()], ..Default::default() }; - let out = store - .filter_chunks(&[cid(c1), cid(c2)], &f) - .unwrap(); - assert_eq!(out, vec![cid(c2)], "only pdf chunk should survive media filter"); + let out = store.filter_chunks(&[cid(c1), cid(c2)], &f).unwrap(); + assert_eq!( + out, + vec![cid(c2)], + "only pdf chunk should survive media filter" + ); } #[test] @@ -718,14 +764,24 @@ mod tests { let c1 = "11111111111111111111111111111111"; let c2 = "22222222222222222222222222222222"; seed_committed_full( - &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", - "old.md", "en", &[], "primary", + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "old.md", + "en", + &[], + "primary", r#""markdown""#, "2020-01-01T00:00:00Z", ); seed_committed_full( - &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", - "new.md", "en", &[], "primary", + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "new.md", + "en", + &[], + "primary", r#""markdown""#, "2026-01-01T00:00:00Z", ); @@ -734,10 +790,12 @@ mod tests { ingested_after: Some(time::macros::datetime!(2025-01-01 00:00:00 UTC)), ..Default::default() }; - let out = store - .filter_chunks(&[cid(c1), cid(c2)], &f) - .unwrap(); - assert_eq!(out, vec![cid(c2)], "only post-2025 chunk should survive ingested_after filter"); + let out = store.filter_chunks(&[cid(c1), 
cid(c2)], &f).unwrap(); + assert_eq!( + out, + vec![cid(c2)], + "only post-2025 chunk should survive ingested_after filter" + ); } #[test] @@ -749,14 +807,24 @@ mod tests { let c2 = "22222222222222222222222222222222"; let d1 = "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1"; seed_committed_full( - &store, c1, d1, - "a.md", "en", &[], "primary", + &store, + c1, + d1, + "a.md", + "en", + &[], + "primary", r#""markdown""#, "1970-01-01T00:00:00Z", ); seed_committed_full( - &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", - "b.md", "en", &[], "primary", + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "b.md", + "en", + &[], + "primary", r#""markdown""#, "1970-01-01T00:00:00Z", ); @@ -765,10 +833,12 @@ mod tests { doc_id: Some(kebab_core::DocumentId(d1.to_string())), ..Default::default() }; - let out = store - .filter_chunks(&[cid(c1), cid(c2)], &f) - .unwrap(); - assert_eq!(out, vec![cid(c1)], "doc_id filter must scope to the target doc only"); + let out = store.filter_chunks(&[cid(c1), cid(c2)], &f).unwrap(); + assert_eq!( + out, + vec![cid(c1)], + "doc_id filter must scope to the target doc only" + ); } // ── p10-1A-1 new filter arms ───────────────────────────────────────── @@ -783,18 +853,27 @@ mod tests { let c2 = "22222222222222222222222222222222"; let c3 = "33333333333333333333333333333333"; seed_committed_with_metadata( - &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", - "src/main.py", r#""code""#, + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "src/main.py", + r#""code""#, r#"{"code_lang":"python"}"#, ); seed_committed_with_metadata( - &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", - "src/lib.rs", r#""code""#, + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "src/lib.rs", + r#""code""#, r#"{"code_lang":"rust"}"#, ); seed_committed_with_metadata( - &store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", - "README.md", r#""markdown""#, + &store, + c3, + "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "README.md", + r#""markdown""#, r"{}", ); @@ -805,7 +884,11 @@ mod 
tests { let out = store .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) .unwrap(); - assert_eq!(out, vec![cid(c1)], "only python chunk should survive code_lang filter"); + assert_eq!( + out, + vec![cid(c1)], + "only python chunk should survive code_lang filter" + ); } #[test] @@ -818,18 +901,27 @@ mod tests { let c2 = "22222222222222222222222222222222"; let c3 = "33333333333333333333333333333333"; seed_committed_with_metadata( - &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", - "httpx/client.py", r#""code""#, + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "httpx/client.py", + r#""code""#, r#"{"repo":"httpx","code_lang":"python"}"#, ); seed_committed_with_metadata( - &store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", - "requests/api.py", r#""code""#, + &store, + c2, + "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", + "requests/api.py", + r#""code""#, r#"{"repo":"requests","code_lang":"python"}"#, ); seed_committed_with_metadata( - &store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", - "standalone.py", r#""code""#, + &store, + c3, + "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", + "standalone.py", + r#""code""#, r#"{"code_lang":"python"}"#, ); @@ -840,7 +932,11 @@ mod tests { let out = store .filter_chunks(&[cid(c1), cid(c2), cid(c3)], &f) .unwrap(); - assert_eq!(out, vec![cid(c1)], "only httpx chunk should survive repo filter"); + assert_eq!( + out, + vec![cid(c1)], + "only httpx chunk should survive repo filter" + ); } #[test] @@ -865,8 +961,13 @@ mod tests { let store = open_store(&tmp); let c1 = "11111111111111111111111111111111"; seed_committed_full( - &store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", - "doc.md", "en", &[], "primary", + &store, + c1, + "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", + "doc.md", + "en", + &[], + "primary", r#""markdown""#, "2026-04-01T01:00:00Z", ); @@ -883,9 +984,7 @@ mod tests { ingested_after: Some(filter_instant), ..Default::default() }; - let out = store - .filter_chunks(&[cid(c1)], &f) - .unwrap(); + let out = store.filter_chunks(&[cid(c1)], &f).unwrap(); 
assert_eq!( out, vec![cid(c1)], diff --git a/crates/kebab-store-sqlite/src/jobs.rs b/crates/kebab-store-sqlite/src/jobs.rs index c9136ba..395269f 100644 --- a/crates/kebab-store-sqlite/src/jobs.rs +++ b/crates/kebab-store-sqlite/src/jobs.rs @@ -86,11 +86,7 @@ impl SqliteStore { } impl kebab_core::JobRepo for SqliteStore { - fn create( - &self, - kind: kebab_core::JobKind, - payload: Value, - ) -> Result { + fn create(&self, kind: kebab_core::JobKind, payload: Value) -> Result { let now_dt = OffsetDateTime::now_utc(); let now = now_dt .format(&time::format_description::well_known::Rfc3339) @@ -100,8 +96,7 @@ impl kebab_core::JobRepo for SqliteStore { // identical `(kind, payload)` still get distinct IDs. let job_id = mint_job_id(&kind, &payload, now_dt); let kind_label = job_kind_label(&kind); - let payload_json = serde_json::to_string(&payload) - .context("serialize job payload")?; + let payload_json = serde_json::to_string(&payload).context("serialize job payload")?; let conn = self.lock_conn(); conn.execute( "INSERT INTO jobs ( @@ -114,13 +109,8 @@ impl kebab_core::JobRepo for SqliteStore { Ok(job_id) } - fn update_progress( - &self, - id: &kebab_core::JobId, - progress: Value, - ) -> Result<()> { - let progress_json = serde_json::to_string(&progress) - .context("serialize job progress")?; + fn update_progress(&self, id: &kebab_core::JobId, progress: Value) -> Result<()> { + let progress_json = serde_json::to_string(&progress).context("serialize job progress")?; let now = OffsetDateTime::now_utc() .format(&time::format_description::well_known::Rfc3339) .context("format job updated_at")?; @@ -167,10 +157,7 @@ impl kebab_core::JobRepo for SqliteStore { Ok(()) } - fn list( - &self, - filter: &kebab_core::JobFilter, - ) -> Result> { + fn list(&self, filter: &kebab_core::JobFilter) -> Result> { let conn = self.lock_conn(); let mut sql = String::from( "SELECT job_id, kind, status, payload_json, progress_json, @@ -259,11 +246,9 @@ fn job_row_from_sql(row: 
&rusqlite::Row<'_>) -> rusqlite::Result = row.get(8)?; let kind: kebab_core::JobKind = - serde_json::from_value(serde_json::Value::String(kind_raw)) - .map_err(conv_err(1))?; + serde_json::from_value(serde_json::Value::String(kind_raw)).map_err(conv_err(1))?; let status: kebab_core::JobStatus = - serde_json::from_value(serde_json::Value::String(status_raw)) - .map_err(conv_err(2))?; + serde_json::from_value(serde_json::Value::String(status_raw)).map_err(conv_err(2))?; let payload: Value = serde_json::from_str(&payload_json).map_err(conv_err(3))?; let progress: Option = match progress_json { Some(s) => Some(serde_json::from_str(&s).map_err(conv_err(4))?), diff --git a/crates/kebab-store-sqlite/src/lib.rs b/crates/kebab-store-sqlite/src/lib.rs index e1b9fb9..8618900 100644 --- a/crates/kebab-store-sqlite/src/lib.rs +++ b/crates/kebab-store-sqlite/src/lib.rs @@ -27,8 +27,8 @@ mod filters; mod fts; mod jobs; mod schema; -mod store; pub mod stats_ext; +mod store; pub use embeddings::EmbeddingRecordRow; pub use error::StoreError; diff --git a/crates/kebab-store-sqlite/src/stats_ext.rs b/crates/kebab-store-sqlite/src/stats_ext.rs index e6df4e2..8cd8ce5 100644 --- a/crates/kebab-store-sqlite/src/stats_ext.rs +++ b/crates/kebab-store-sqlite/src/stats_ext.rs @@ -19,10 +19,7 @@ pub struct Breakdowns { /// `lang` only contains observed languages; NULL lang is /// keyed as the literal string `"null"`. `stale_doc_count` is 0 when /// `threshold_days == 0` (mirrors fb-32 staleness disable semantics). -pub fn breakdowns( - conn: &Connection, - threshold_days: u64, -) -> rusqlite::Result { +pub fn breakdowns(conn: &Connection, threshold_days: u64) -> rusqlite::Result { // media: dual JSON shape — text variant ("markdown") vs object // variant ({"image":{"format":"png"}}). Same CASE WHEN as fb-36. 
let mut media: BTreeMap = MEDIA_KINDS @@ -40,9 +37,7 @@ pub fn breakdowns( FROM documents d JOIN assets a ON a.asset_id = d.asset_id \ GROUP BY kind", )?; - let rows = stmt.query_map([], |r| { - Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)) - })?; + let rows = stmt.query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)))?; for row in rows { let (kind, n) = row?; media.insert(kind, n); @@ -53,9 +48,7 @@ pub fn breakdowns( "SELECT COALESCE(lang, 'null') AS l, COUNT(*) \ FROM documents GROUP BY l", )?; - let rows = stmt.query_map([], |r| { - Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)) - })?; + let rows = stmt.query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)))?; for row in rows { let (l, n) = row?; lang.insert(l, n); @@ -65,8 +58,7 @@ pub fn breakdowns( 0 } else { let secs = (threshold_days as i64) * 86_400; - let cutoff = time::OffsetDateTime::now_utc() - - time::Duration::seconds(secs); + let cutoff = time::OffsetDateTime::now_utc() - time::Duration::seconds(secs); let cutoff_str = cutoff .format(&time::format_description::well_known::Rfc3339) .expect("RFC3339 format"); @@ -148,7 +140,10 @@ mod tests { fn index_bytes_includes_sqlite_main() { let (dir, _store) = open_fresh(); let b = index_bytes(dir.path()).unwrap(); - assert!(b.sqlite > 0, "main sqlite file should exist after migrations"); + assert!( + b.sqlite > 0, + "main sqlite file should exist after migrations" + ); assert_eq!(b.lancedb, 0); } diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index df0fe7e..fd9e3c3 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -163,7 +163,9 @@ impl SqliteStore { /// safe to reuse — we simply unwrap the inner guard rather than /// propagate the panic to every subsequent call. 
pub(crate) fn lock_conn(&self) -> MutexGuard<'_, Connection> { - self.conn.lock().unwrap_or_else(std::sync::PoisonError::into_inner) + self.conn + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) } /// Read-only borrow of the connection. @@ -179,7 +181,9 @@ impl SqliteStore { /// /// Poisoning is recovered the same way as [`Self::lock_conn`]. pub fn read_conn(&self) -> MutexGuard<'_, Connection> { - self.conn.lock().unwrap_or_else(std::sync::PoisonError::into_inner) + self.conn + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner) } /// Persist a `RawAsset` *with its raw bytes*: row goes into `assets`, @@ -190,11 +194,7 @@ impl SqliteStore { /// In either branch, `blake3(bytes)` is recomputed and compared to /// `asset.checksum.0`. A mismatch returns /// `StoreError::Conflict` wrapped in `anyhow::Error`. - pub fn put_asset_with_bytes( - &self, - asset: &kebab_core::RawAsset, - bytes: &[u8], - ) -> Result<()> { + pub fn put_asset_with_bytes(&self, asset: &kebab_core::RawAsset, bytes: &[u8]) -> Result<()> { // 0. Validate the AssetId shape before any I/O. `kebab_core::AssetId` // is a `pub String` newtype: `FromStr` enforces the 32-hex-char // invariant, but a hand-constructed `AssetId("../etc/passwd…")` @@ -229,9 +229,8 @@ impl SqliteStore { // of the temp file so we never leak bytes on disk. 
let dest = self.assets_path_for(&asset.asset_id); if let Some(parent) = dest.parent() { - std::fs::create_dir_all(parent).with_context(|| { - format!("create asset shard dir {}", parent.display()) - })?; + std::fs::create_dir_all(parent) + .with_context(|| format!("create asset shard dir {}", parent.display()))?; } let temp_path = temp_path_for(&dest); // Inline closure so any `?` in (a)/(b) cleans up the temp @@ -242,9 +241,8 @@ impl SqliteStore { format!("create temp asset file {}", temp_path.display()) })?; use std::io::Write; - f.write_all(bytes).with_context(|| { - format!("write asset bytes to {}", temp_path.display()) - })?; + f.write_all(bytes) + .with_context(|| format!("write asset bytes to {}", temp_path.display()))?; f.sync_all().with_context(|| { format!("fsync temp asset file {}", temp_path.display()) })?; @@ -255,9 +253,8 @@ impl SqliteStore { use std::os::unix::fs::PermissionsExt; let mut perms = std::fs::metadata(&temp_path)?.permissions(); perms.set_mode(0o644); - std::fs::set_permissions(&temp_path, perms).with_context(|| { - format!("chmod 0o644 on {}", temp_path.display()) - })?; + std::fs::set_permissions(&temp_path, perms) + .with_context(|| format!("chmod 0o644 on {}", temp_path.display()))?; } // UPSERT the row first; only after a successful row write // do we publish the file via rename. 
A second @@ -270,12 +267,7 @@ impl SqliteStore { &asset.workspace_path.0, &asset.asset_id.0, )?; - upsert_asset_row( - &conn, - asset, - "copied", - &dest.to_string_lossy(), - )?; + upsert_asset_row(&conn, asset, "copied", &dest.to_string_lossy())?; } std::fs::rename(&temp_path, &dest).with_context(|| { format!( @@ -305,11 +297,7 @@ impl SqliteStore { kebab_core::SourceUri::Kb(u) => u.clone(), }; let conn = self.lock_conn(); - purge_orphan_at_workspace_path( - &conn, - &asset.workspace_path.0, - &asset.asset_id.0, - )?; + purge_orphan_at_workspace_path(&conn, &asset.workspace_path.0, &asset.asset_id.0)?; upsert_asset_row(&conn, asset, "reference", &storage_path)?; Ok(()) } @@ -338,9 +326,7 @@ impl SqliteStore { /// permits hand-construction, so any function that turns an `AssetId` /// into a filesystem path must call this first. pub(crate) fn validate_asset_id(asset_id: &kebab_core::AssetId) -> Result<()> { - if asset_id.0.len() != ASSET_ID_HEX_LEN - || !asset_id.0.bytes().all(|b| b.is_ascii_hexdigit()) - { + if asset_id.0.len() != ASSET_ID_HEX_LEN || !asset_id.0.bytes().all(|b| b.is_ascii_hexdigit()) { anyhow::bail!( "invalid AssetId shape (expected {} ASCII hex chars): {:?}", ASSET_ID_HEX_LEN, @@ -359,7 +345,8 @@ fn temp_path_for(dest: &Path) -> PathBuf { let n = TEMP_SUFFIX_COUNTER.fetch_add(1, Ordering::Relaxed); let parent = dest.parent().unwrap_or_else(|| Path::new(".")); let file_name = dest - .file_name().map_or_else(|| "asset".to_string(), |s| s.to_string_lossy().into_owned()); + .file_name() + .map_or_else(|| "asset".to_string(), |s| s.to_string_lossy().into_owned()); parent.join(format!("{file_name}.tmp.{pid}.{n}")) } @@ -742,8 +729,7 @@ pub(crate) fn upsert_asset_row( kebab_core::SourceUri::File(p) => format!("file://{}", p.to_string_lossy()), kebab_core::SourceUri::Kb(u) => u.clone(), }; - let media_type = serde_json::to_string(&asset.media_type) - .context("serialize media_type")?; + let media_type = 
serde_json::to_string(&asset.media_type).context("serialize media_type")?; let discovered_at = asset .discovered_at .format(&time::format_description::well_known::Rfc3339) @@ -864,9 +850,7 @@ impl SqliteStore { /// skips rows where `code_lang` is NULL (i.e. non-code documents). /// Returns `BTreeMap` — key is the canonical lowercase /// language identifier (e.g. `"rust"`), value is the doc count. - pub fn code_lang_breakdown( - &self, - ) -> anyhow::Result> { + pub fn code_lang_breakdown(&self) -> anyhow::Result> { use anyhow::Context; let conn = self.read_conn(); let mut stmt = conn @@ -936,9 +920,7 @@ impl SqliteStore { /// where `repo` is NULL (documents without an explicit repo tag). /// Returns `BTreeMap` — key is the repo name as stored in /// frontmatter, value is the doc count. - pub fn repo_breakdown( - &self, - ) -> anyhow::Result> { + pub fn repo_breakdown(&self) -> anyhow::Result> { use anyhow::Context; let conn = self.read_conn(); let mut stmt = conn @@ -1299,4 +1281,3 @@ mod tests { assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}"); } } - diff --git a/crates/kebab-store-sqlite/tests/asset_writer.rs b/crates/kebab-store-sqlite/tests/asset_writer.rs index 8eac8e6..3b1de80 100644 --- a/crates/kebab-store-sqlite/tests/asset_writer.rs +++ b/crates/kebab-store-sqlite/tests/asset_writer.rs @@ -44,8 +44,16 @@ fn copy_mode_writes_file_with_0o644_and_correct_bytes() { // Path: data_dir/assets/aa/aaaaaa…aa let aa = &asset.asset_id.0[..2]; - let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0); - assert!(dest.exists(), "asset file not written at {}", dest.display()); + let dest = env + .data_dir() + .join("assets") + .join(aa) + .join(&asset.asset_id.0); + assert!( + dest.exists(), + "asset file not written at {}", + dest.display() + ); let on_disk = std::fs::read(&dest).unwrap(); assert_eq!(on_disk, bytes); @@ -82,10 +90,16 @@ fn reference_mode_does_not_write_file_but_records_path() { let mut asset = fixed_asset(bytes, 1, 
&cs); asset.source_uri = SourceUri::File(PathBuf::from("/path/to/original.md")); - store.put_asset_with_bytes(&asset, bytes).expect("ref write"); + store + .put_asset_with_bytes(&asset, bytes) + .expect("ref write"); let aa = &asset.asset_id.0[..2]; - let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0); + let dest = env + .data_dir() + .join("assets") + .join(aa) + .join(&asset.asset_id.0); assert!(!dest.exists(), "reference mode must not copy bytes"); let (storage_kind, storage_path): (String, String) = env.with_conn(|c| { @@ -161,11 +175,18 @@ fn put_asset_with_bytes_sweeps_workspace_path_orphan() { |row| row.get(0), ) }); - assert_eq!(new_count, 1, "new asset_id must own the workspace_path slot"); + assert_eq!( + new_count, 1, + "new asset_id must own the workspace_path slot" + ); // New asset's bytes published at the final destination. let aa = &asset.asset_id.0[..2]; - let dest = env.data_dir().join("assets").join(aa).join(&asset.asset_id.0); + let dest = env + .data_dir() + .join("assets") + .join(aa) + .join(&asset.asset_id.0); assert!( dest.exists(), "new asset bytes must be visible at {}", @@ -185,7 +206,11 @@ fn put_asset_with_bytes_rejects_invalid_asset_id() { // 32 chars but contains a `/` — would let `assets_path_for` stitch // together a path outside the shard tree. 
let evil_id = "../etc/passwd_padded_to_xx_xxxxx".to_string(); - assert_eq!(evil_id.len(), 32, "test fixture must be 32 chars to exercise length-only checks"); + assert_eq!( + evil_id.len(), + 32, + "test fixture must be 32 chars to exercise length-only checks" + ); let mut asset = fixed_asset(b"x", 1, &b3_full_hex(b"x")); asset.asset_id = AssetId(evil_id.clone()); diff --git a/crates/kebab-store-sqlite/tests/chat_sessions.rs b/crates/kebab-store-sqlite/tests/chat_sessions.rs index d491229..8c2fcdb 100644 --- a/crates/kebab-store-sqlite/tests/chat_sessions.rs +++ b/crates/kebab-store-sqlite/tests/chat_sessions.rs @@ -49,7 +49,10 @@ fn create_get_roundtrip() { let store = open_store(&tmp); let session = make_session("sess-1"); store.create_session(&session).unwrap(); - let fetched = store.get_session("sess-1").unwrap().expect("session present"); + let fetched = store + .get_session("sess-1") + .unwrap() + .expect("session present"); assert_eq!(fetched, session); } @@ -112,20 +115,16 @@ fn append_turn_bumps_session_updated_at() { let store = open_store(&tmp); let session = make_session("bump"); store.create_session(&session).unwrap(); - let pre = store - .get_session("bump") - .unwrap() - .unwrap() - .updated_at; + let pre = store.get_session("bump").unwrap().unwrap().updated_at; let mut t = make_turn("bump", 0); t.created_at = pre + 100; store.append_turn(&t).unwrap(); - let post = store - .get_session("bump") - .unwrap() - .unwrap() - .updated_at; - assert_eq!(post, pre + 100, "updated_at must follow latest turn's created_at"); + let post = store.get_session("bump").unwrap().unwrap().updated_at; + assert_eq!( + post, + pre + 100, + "updated_at must follow latest turn's created_at" + ); } #[test] @@ -168,7 +167,9 @@ fn list_sessions_respects_limit() { let tmp = TempDir::new().unwrap(); let store = open_store(&tmp); for i in 0..5 { - store.create_session(&make_session(&format!("s{i}"))).unwrap(); + store + .create_session(&make_session(&format!("s{i}"))) + .unwrap(); 
} assert_eq!(store.list_sessions(2).unwrap().len(), 2); assert_eq!(store.list_sessions(100).unwrap().len(), 5); diff --git a/crates/kebab-store-sqlite/tests/contract_roundtrip.rs b/crates/kebab-store-sqlite/tests/contract_roundtrip.rs index d6665c9..bfa6f6f 100644 --- a/crates/kebab-store-sqlite/tests/contract_roundtrip.rs +++ b/crates/kebab-store-sqlite/tests/contract_roundtrip.rs @@ -10,7 +10,7 @@ use std::path::PathBuf; use kebab_chunk::MdHeadingV1Chunker; use kebab_core::{ - AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, DocumentStore, + AssetId, AssetStorage, Checksum, ChunkPolicy, Chunker, ChunkerVersion, DocumentStore, MediaType, ParserVersion, RawAsset, SourceUri, WorkspacePath, }; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; @@ -58,8 +58,7 @@ fn document_and_chunks_round_trip_through_sqlite() { fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), }; - let (mut metadata, _fm_span, _fm_warns) = - parse_frontmatter(&bytes, &hints).unwrap(); + let (mut metadata, _fm_span, _fm_warns) = parse_frontmatter(&bytes, &hints).unwrap(); let (parsed_blocks, parse_warns) = parse_blocks(&bytes, 1).unwrap(); metadata.aliases.sort(); @@ -91,9 +90,7 @@ fn document_and_chunks_round_trip_through_sqlite() { store .put_blocks(&doc.doc_id, &doc.blocks) .expect("put_blocks"); - store - .put_chunks(&doc.doc_id, &chunks) - .expect("put_chunks"); + store.put_chunks(&doc.doc_id, &chunks).expect("put_chunks"); // ── Read back ──────────────────────────────────────────────────── let loaded = store diff --git a/crates/kebab-store-sqlite/tests/fts.rs b/crates/kebab-store-sqlite/tests/fts.rs index c0dc660..5d9d978 100644 --- a/crates/kebab-store-sqlite/tests/fts.rs +++ b/crates/kebab-store-sqlite/tests/fts.rs @@ -334,9 +334,7 @@ fn normalize_ws(s: &str) -> String { /// - no `CREATE VIRTUAL TABLE chunks_fts` inside that block /// - no `END;` after the virtual-table line fn extract_design_5_5_fts_block() -> 
String { - let doc = include_str!( - "../../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md" - ); + let doc = include_str!("../../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md"); let heading_idx = doc .find("### 5.5 Chunks + FTS5") .expect("design doc must contain `### 5.5 Chunks + FTS5` heading"); @@ -394,9 +392,7 @@ fn extract_migration_5_5_verbatim_block() -> String { .expect("V007 must carry the `End §5.5 verbatim block` closing anchor") + after_open_line; // Walk back from the close marker to the start of its comment line. - let close_line_start = migration[..close_idx] - .rfind('\n') - .map_or(0, |n| n + 1); + let close_line_start = migration[..close_idx].rfind('\n').map_or(0, |n| n + 1); migration[after_open_line..close_line_start].to_string() } @@ -476,8 +472,7 @@ fn fts_store_drop_releases_wal_files() { } // The main DB file should likewise be removable. if db_path.exists() { - std::fs::remove_file(&db_path) - .expect("main DB file should be removable after store drop"); + std::fs::remove_file(&db_path).expect("main DB file should be removable after store drop"); } } @@ -584,11 +579,7 @@ fn fts_trigram_english_substring_hits() { 1, "substring of 'tokenizer' — trigram recall" ); - assert_eq!( - count_match(&conn, "izer"), - 1, - "substring of 'tokenizer'" - ); + assert_eq!(count_match(&conn, "izer"), 1, "substring of 'tokenizer'"); // 3-char-minimum applies to English too. 
assert_eq!(count_match(&conn, "to"), 0, "2-char English query"); } diff --git a/crates/kebab-store-sqlite/tests/idempotency.rs b/crates/kebab-store-sqlite/tests/idempotency.rs index 85471e7..faa2bd6 100644 --- a/crates/kebab-store-sqlite/tests/idempotency.rs +++ b/crates/kebab-store-sqlite/tests/idempotency.rs @@ -5,10 +5,9 @@ use std::path::PathBuf; use kebab_core::{ - AssetId, AssetStorage, Block, CanonicalDocument, Checksum, Chunk, ChunkerVersion, - CommonBlock, DocumentId, DocumentStore, HeadingBlock, Lang, MediaType, Metadata, - ParserVersion, Provenance, RawAsset, SourceSpan, SourceType, SourceUri, TextBlock, - TrustLevel, WorkspacePath, + AssetId, AssetStorage, Block, CanonicalDocument, Checksum, Chunk, ChunkerVersion, CommonBlock, + DocumentId, DocumentStore, HeadingBlock, Lang, MediaType, Metadata, ParserVersion, Provenance, + RawAsset, SourceSpan, SourceType, SourceUri, TextBlock, TrustLevel, WorkspacePath, }; use kebab_store_sqlite::SqliteStore; use time::OffsetDateTime; @@ -66,7 +65,7 @@ fn make_doc() -> CanonicalDocument { block_id: kebab_core::BlockId("c".repeat(32)), heading_path: vec!["Title".into()], source_span: span, - }, + }, text: "body".into(), inlines: vec![], }); @@ -138,8 +137,7 @@ fn put_document_idempotent_bumps_doc_version() { // Tags were re-derived: still exactly the two original tags. let tags: Vec = env.with_conn(|c| { - let mut stmt = - c.prepare("SELECT tag FROM document_tags WHERE doc_id = ? ORDER BY tag")?; + let mut stmt = c.prepare("SELECT tag FROM document_tags WHERE doc_id = ? 
ORDER BY tag")?; let rows = stmt.query_map([&doc.doc_id.0], |r| r.get::<_, String>(0))?; rows.collect::>>() }); @@ -158,7 +156,9 @@ fn put_blocks_and_put_chunks_replace_not_duplicate() { store.put_document(&doc).unwrap(); store.put_blocks(&doc.doc_id, &doc.blocks).unwrap(); - store.put_chunks(&doc.doc_id, &make_chunks(&doc.doc_id)).unwrap(); + store + .put_chunks(&doc.doc_id, &make_chunks(&doc.doc_id)) + .unwrap(); let (b1, ch1): (i64, i64) = env.with_conn(|c| { Ok(( @@ -179,7 +179,9 @@ fn put_blocks_and_put_chunks_replace_not_duplicate() { // Re-put same data → counts unchanged (DELETE-then-INSERT). store.put_blocks(&doc.doc_id, &doc.blocks).unwrap(); - store.put_chunks(&doc.doc_id, &make_chunks(&doc.doc_id)).unwrap(); + store + .put_chunks(&doc.doc_id, &make_chunks(&doc.doc_id)) + .unwrap(); let (b2, ch2): (i64, i64) = env.with_conn(|c| { Ok(( c.query_row( @@ -214,9 +216,8 @@ fn put_blocks_transactional_rollback_on_fk_violation() { store.put_document(&doc).unwrap(); // Establish a baseline row in `blocks`. store.put_blocks(&doc.doc_id, &doc.blocks).unwrap(); - let baseline: i64 = env.with_conn(|c| { - c.query_row("SELECT COUNT(*) FROM blocks", [], |r| r.get(0)) - }); + let baseline: i64 = + env.with_conn(|c| c.query_row("SELECT COUNT(*) FROM blocks", [], |r| r.get(0))); assert_eq!(baseline, 2); // Now ask put_blocks to write to a doc_id that does NOT exist. 
@@ -237,9 +238,8 @@ fn put_blocks_transactional_rollback_on_fk_violation() { let res = store.put_blocks(&phantom, &phantom_blocks); assert!(res.is_err(), "FK violation must surface as Err"); - let after: i64 = env.with_conn(|c| { - c.query_row("SELECT COUNT(*) FROM blocks", [], |r| r.get(0)) - }); + let after: i64 = + env.with_conn(|c| c.query_row("SELECT COUNT(*) FROM blocks", [], |r| r.get(0))); assert_eq!( after, baseline, "transaction must roll back; blocks count must be unchanged" diff --git a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs index 0fe1866..91741fb 100644 --- a/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs +++ b/crates/kebab-store-sqlite/tests/ingest_report_snapshot.rs @@ -9,8 +9,8 @@ use std::path::PathBuf; use kebab_core::{ - AssetId, ChunkerVersion, DocumentId, IngestItem, IngestItemKind, IngestReport, - ParserVersion, SourceScope, WorkspacePath, + AssetId, ChunkerVersion, DocumentId, IngestItem, IngestItemKind, IngestReport, ParserVersion, + SourceScope, WorkspacePath, }; use serde_json::Value; diff --git a/crates/kebab-store-sqlite/tests/jobs.rs b/crates/kebab-store-sqlite/tests/jobs.rs index 7aa9381..d14370b 100644 --- a/crates/kebab-store-sqlite/tests/jobs.rs +++ b/crates/kebab-store-sqlite/tests/jobs.rs @@ -29,9 +29,7 @@ fn create_then_progress_then_finish() { assert_eq!(row[0].progress.as_ref().unwrap()["total"], json!(10)); // Finish with success. 
- store - .finish(&id, JobStatus::Succeeded, None) - .unwrap(); + store.finish(&id, JobStatus::Succeeded, None).unwrap(); let row = store.list(&JobFilter::default()).unwrap(); assert_eq!(row[0].status, JobStatus::Succeeded); assert!(row[0].finished_at.is_some()); diff --git a/crates/kebab-store-sqlite/tests/list_docs.rs b/crates/kebab-store-sqlite/tests/list_docs.rs index acfad1c..8a536dc 100644 --- a/crates/kebab-store-sqlite/tests/list_docs.rs +++ b/crates/kebab-store-sqlite/tests/list_docs.rs @@ -3,9 +3,9 @@ use std::path::PathBuf; use kebab_core::{ - AssetId, AssetStorage, Block, CanonicalDocument, Checksum, CommonBlock, DocFilter, - DocumentId, DocumentStore, HeadingBlock, Lang, MediaType, Metadata, ParserVersion, - Provenance, RawAsset, SourceSpan, SourceType, SourceUri, TrustLevel, WorkspacePath, + AssetId, AssetStorage, Block, CanonicalDocument, Checksum, CommonBlock, DocFilter, DocumentId, + DocumentStore, HeadingBlock, Lang, MediaType, Metadata, ParserVersion, Provenance, RawAsset, + SourceSpan, SourceType, SourceUri, TrustLevel, WorkspacePath, }; use kebab_store_sqlite::SqliteStore; use time::OffsetDateTime; @@ -84,7 +84,13 @@ fn list_documents_filters_lang_and_tags() { store.run_migrations().unwrap(); for (asset, doc) in [ - make_doc('a', "notes/a.md", "en", vec!["rust", "kb"], TrustLevel::Primary), + make_doc( + 'a', + "notes/a.md", + "en", + vec!["rust", "kb"], + TrustLevel::Primary, + ), make_doc('b', "notes/b.md", "ko", vec!["rust"], TrustLevel::Secondary), make_doc('c', "papers/c.md", "en", vec!["bio"], TrustLevel::Generated), ] { diff --git a/crates/kebab-store-sqlite/tests/not_indexed.rs b/crates/kebab-store-sqlite/tests/not_indexed.rs index 93a1f3f..af8ce5f 100644 --- a/crates/kebab-store-sqlite/tests/not_indexed.rs +++ b/crates/kebab-store-sqlite/tests/not_indexed.rs @@ -23,5 +23,8 @@ fn open_existing_does_not_create_missing_db() { let dir = tempfile::tempdir().unwrap(); let nonexistent_db = dir.path().join("does-not-exist.sqlite"); let _ = 
SqliteStore::open_existing(&nonexistent_db); - assert!(!nonexistent_db.exists(), "open_existing must NOT create the file"); + assert!( + !nonexistent_db.exists(), + "open_existing must NOT create the file" + ); } diff --git a/crates/kebab-store-vector/src/arrow_batch.rs b/crates/kebab-store-vector/src/arrow_batch.rs index 3182b43..0023bd3 100644 --- a/crates/kebab-store-vector/src/arrow_batch.rs +++ b/crates/kebab-store-vector/src/arrow_batch.rs @@ -23,8 +23,7 @@ use std::sync::Arc; use anyhow::{Context, Result}; use arrow_array::{ - ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, StringArray, - TimestampMicrosecondArray, + ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, StringArray, TimestampMicrosecondArray, }; use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; use kebab_core::VectorRecord; @@ -71,21 +70,22 @@ pub(crate) fn build_batch( let schema = schema_for(dim); let chunk_ids = StringArray::from( - recs.iter().map(|r| r.chunk_id.0.as_str()).collect::>(), - ); - let doc_ids = StringArray::from( - recs.iter().map(|r| r.doc_id.0.as_str()).collect::>(), + recs.iter() + .map(|r| r.chunk_id.0.as_str()) + .collect::>(), ); + let doc_ids = StringArray::from(recs.iter().map(|r| r.doc_id.0.as_str()).collect::>()); let model_ids = StringArray::from( - recs.iter().map(|r| r.model_id.0.as_str()).collect::>(), + recs.iter() + .map(|r| r.model_id.0.as_str()) + .collect::>(), ); let model_versions = StringArray::from( recs.iter() .map(|r| r.model_version.0.as_str()) .collect::>(), ); - let texts = - StringArray::from(recs.iter().map(|r| r.text.as_str()).collect::>()); + let texts = StringArray::from(recs.iter().map(|r| r.text.as_str()).collect::>()); // heading_path: serde_json::Value::Array of strings, then to_string. 
let heading_paths: Vec = recs @@ -93,9 +93,8 @@ pub(crate) fn build_batch( .map(|r| serde_json::to_string(&r.heading_path)) .collect::>() .context("serialize heading_path JSON")?; - let heading_path_arr = StringArray::from( - heading_paths.iter().map(String::as_str).collect::>(), - ); + let heading_path_arr = + StringArray::from(heading_paths.iter().map(String::as_str).collect::>()); // Embedding: FixedSizeList. Build from the flat // contiguous f32 buffer. @@ -112,22 +111,14 @@ pub(crate) fn build_batch( flat.extend_from_slice(&r.vector); } let values = Float32Array::from(flat); - let embedding_field = - Arc::new(Field::new("item", DataType::Float32, true)); - let embedding = FixedSizeListArray::try_new( - embedding_field, - dim as i32, - Arc::new(values), - None, - ) - .context("build FixedSizeList embedding column")?; + let embedding_field = Arc::new(Field::new("item", DataType::Float32, true)); + let embedding = + FixedSizeListArray::try_new(embedding_field, dim as i32, Arc::new(values), None) + .context("build FixedSizeList embedding column")?; // created_at: microseconds since Unix epoch, UTC. 
- let micros: Vec = std::iter::repeat_n( - (now.unix_timestamp_nanos() / 1_000) as i64, - recs.len(), - ) - .collect(); + let micros: Vec = + std::iter::repeat_n((now.unix_timestamp_nanos() / 1_000) as i64, recs.len()).collect(); let created_at = TimestampMicrosecondArray::from(micros).with_timezone("UTC"); let arrays: Vec = vec![ diff --git a/crates/kebab-store-vector/src/store.rs b/crates/kebab-store-vector/src/store.rs index 007670f..1d607e1 100644 --- a/crates/kebab-store-vector/src/store.rs +++ b/crates/kebab-store-vector/src/store.rs @@ -12,8 +12,8 @@ use arrow_array::{Array, Float32Array, RecordBatch, StringArray}; use arrow_schema::SchemaRef; use futures::TryStreamExt; use kebab_core::{ - ChunkId, DocumentId, EmbeddingModelId, IndexId, SearchFilters, - VectorHit, VectorRecord, VectorStore, + ChunkId, DocumentId, EmbeddingModelId, IndexId, SearchFilters, VectorHit, VectorRecord, + VectorStore, }; use kebab_store_sqlite::{EmbeddingRecordRow, SqliteStore}; use lancedb::Connection; @@ -95,8 +95,7 @@ impl LanceVectorStore { /// section. 
pub fn new(config: &kebab_config::Config, sqlite: Arc) -> Result { let data_dir = expand_path(&config.storage.data_dir, ""); - let vector_dir = - expand_path(&config.storage.vector_dir, &data_dir.to_string_lossy()); + let vector_dir = expand_path(&config.storage.vector_dir, &data_dir.to_string_lossy()); std::fs::create_dir_all(&vector_dir) .with_context(|| format!("create vector_dir {}", vector_dir.display()))?; @@ -108,13 +107,12 @@ impl LanceVectorStore { .context("build tokio runtime for kb-store-vector")?; let uri = vector_dir.to_string_lossy().into_owned(); - let connection = runtime - .block_on(async { - lancedb::connect(&uri) - .execute() - .await - .context("lancedb::connect") - })?; + let connection = runtime.block_on(async { + lancedb::connect(&uri) + .execute() + .await + .context("lancedb::connect") + })?; tracing::debug!( target: "kebab-store-vector", @@ -174,19 +172,13 @@ impl LanceVectorStore { } Ok(()) } - other => anyhow::bail!( - "embedding column has unexpected Arrow type {other:?}" - ), + other => anyhow::bail!("embedding column has unexpected Arrow type {other:?}"), } } } impl VectorStore for LanceVectorStore { - fn ensure_table( - &self, - model: &EmbeddingModelId, - dim: usize, - ) -> Result { + fn ensure_table(&self, model: &EmbeddingModelId, dim: usize) -> Result { let table_name = lance_table_name(&model.0, dim); // The trait method only needs the IndexId — we don't return the // Lance handle. 
Open (or create) the table to enforce idempotence @@ -220,10 +212,7 @@ impl VectorStore for LanceVectorStore { let model_version = recs[0].model_version.clone(); let dim = recs[0].dimensions; for r in recs { - if r.model_id != model_id - || r.model_version != model_version - || r.dimensions != dim - { + if r.model_id != model_id || r.model_version != model_version || r.dimensions != dim { anyhow::bail!( "kb-store-vector::upsert called with mixed (model_id, model_version, dim) — caller must bucket per table" ); @@ -264,15 +253,13 @@ impl VectorStore for LanceVectorStore { // Phase 2: Lance MergeInsert keyed on chunk_id. let batch = build_batch(recs, dim, now)?; - merge_insert_batch(&self.runtime, &table, batch) - .context("phase 2: Lance MergeInsert")?; + merge_insert_batch(&self.runtime, &table, batch).context("phase 2: Lance MergeInsert")?; // Phase 3: flip rows to status='committed'. If we crashed // between phase 2 and phase 3, the rows stay 'pending' and a // future upsert call retries them (Lance MergeInsert dedupes // on chunk_id, so the retry is a no-op on the Lance side). - let embedding_ids: Vec = - recs.iter().map(|r| r.embedding_id.0.clone()).collect(); + let embedding_ids: Vec = recs.iter().map(|r| r.embedding_id.0.clone()).collect(); self.sqlite .mark_embedding_records_committed(&embedding_ids) .context("phase 3: mark embedding_records committed")?; @@ -356,9 +343,7 @@ impl VectorStore for LanceVectorStore { table .delete(&predicate) .await - .with_context(|| { - format!("Lance delete on {name} ({} ids)", batch.len()) - })?; + .with_context(|| format!("Lance delete on {name} ({} ids)", batch.len()))?; } } anyhow::Ok(()) @@ -389,7 +374,10 @@ impl VectorStore for LanceVectorStore { let dim = query_vec.len(); let table_name = if let Some(name) = self .runtime - .block_on(async { find_matching_table(&self.connection, dim).await })? { name } else { + .block_on(async { find_matching_table(&self.connection, dim).await })? 
+ { + name + } else { tracing::debug!( target: "kebab-store-vector", dim, @@ -403,8 +391,7 @@ impl VectorStore for LanceVectorStore { // exclude tombstoned / pending rows. let overfetch = k.saturating_mul(OVERFETCH_MULTIPLIER).max(k); let raw_hits = self.runtime.block_on(async { - let table = match self.connection.open_table(&table_name).execute().await - { + let table = match self.connection.open_table(&table_name).execute().await { Ok(t) => t, Err(lancedb::Error::TableNotFound { .. }) => return Ok(Vec::new()), Err(e) => return Err(anyhow::Error::from(e)), @@ -536,10 +523,8 @@ fn decode_lance_hits(batches: &[RecordBatch]) -> Result> { for i in 0..batch.num_rows() { let dist = distances.value(i); let score = score_from_distance(dist); - let heading_path: Vec = serde_json::from_str( - heading_path_str.value(i), - ) - .unwrap_or_default(); + let heading_path: Vec = + serde_json::from_str(heading_path_str.value(i)).unwrap_or_default(); out.push(LanceCandidate { chunk_id: ChunkId(chunk_ids.value(i).to_string()), doc_id: DocumentId(doc_ids.value(i).to_string()), @@ -571,10 +556,7 @@ fn score_from_distance(distance: f32) -> f32 { } /// Find a Lance table whose embedding column is FixedSizeList. -async fn find_matching_table( - connection: &Connection, - dim: usize, -) -> Result> { +async fn find_matching_table(connection: &Connection, dim: usize) -> Result> { let names = connection .table_names() .execute() @@ -588,9 +570,7 @@ async fn find_matching_table( Ok(t) => { let schema = t.schema().await.context("schema for table")?; if let Ok(field) = schema.field_with_name("embedding") { - if let arrow_schema::DataType::FixedSizeList(_, table_dim) = - field.data_type() - { + if let arrow_schema::DataType::FixedSizeList(_, table_dim) = field.data_type() { if (*table_dim as usize) == dim { return Ok(Some(name)); } @@ -612,17 +592,10 @@ async fn find_matching_table( /// Run the Lance MergeInsert under our embedded runtime. 
Pulled out /// of `upsert` so the trait method stays compact. -fn merge_insert_batch( - runtime: &Runtime, - table: &lancedb::Table, - batch: RecordBatch, -) -> Result<()> { +fn merge_insert_batch(runtime: &Runtime, table: &lancedb::Table, batch: RecordBatch) -> Result<()> { let schema = batch.schema(); runtime.block_on(async move { - let reader = arrow_array::RecordBatchIterator::new( - vec![Ok(batch)].into_iter(), - schema, - ); + let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema); let mut builder = table.merge_insert(&["chunk_id"]); builder .when_matched_update_all(None) @@ -634,4 +607,3 @@ fn merge_insert_batch( Result::<()>::Ok(()) }) } - diff --git a/crates/kebab-store-vector/tests/common/mod.rs b/crates/kebab-store-vector/tests/common/mod.rs index 7bb5a2a..1ac123b 100644 --- a/crates/kebab-store-vector/tests/common/mod.rs +++ b/crates/kebab-store-vector/tests/common/mod.rs @@ -49,7 +49,8 @@ use std::sync::Arc; pub fn require_avx_or_panic() { #[cfg(target_arch = "x86_64")] { - assert!(std::is_x86_feature_detected!("avx"), + assert!( + std::is_x86_feature_detected!("avx"), "kb-store-vector integration test requires AVX-capable hardware; \ host CPU lacks AVX. Run on an AVX-capable machine. \ See crates/kb-store-vector/tests/common/mod.rs." 
@@ -167,15 +168,17 @@ pub fn make_record( let dim = vector.len(); let chunk_id = ChunkId(format!("{:032x}", 0x1100u32 + u32::from(chunk_idx))); let doc_id = DocumentId(format!("{:032x}", 0xd0c0u32 + u32::from(doc_idx))); - let embedding_id = - EmbeddingId(format!("{:032x}", 0xeeee0000u32 + u32::from(chunk_idx))); + let embedding_id = EmbeddingId(format!("{:032x}", 0xeeee0000u32 + u32::from(chunk_idx))); VectorRecord { chunk_id, embedding_id, vector, doc_id, text: text.to_string(), - heading_path: heading.iter().map(std::string::ToString::to_string).collect(), + heading_path: heading + .iter() + .map(std::string::ToString::to_string) + .collect(), model_id: EmbeddingModelId(model.to_string()), model_version: EmbeddingVersion("v1".to_string()), dimensions: dim, diff --git a/crates/kebab-store-vector/tests/snapshot.rs b/crates/kebab-store-vector/tests/snapshot.rs index 05c63da..0189603 100644 --- a/crates/kebab-store-vector/tests/snapshot.rs +++ b/crates/kebab-store-vector/tests/snapshot.rs @@ -60,11 +60,13 @@ fn vector_hits_snapshot_run_1() { // - payload shape: { doc_id, text, heading_path } // - that scores live in [0, 1] and are sorted descending let actual = json!( - hits.iter().map(|h| json!({ - "chunk_id": h.chunk_id.0, - "score_in_unit_interval": (0.0..=1.0).contains(&h.score), - "payload": h.payload, - })).collect::>() + hits.iter() + .map(|h| json!({ + "chunk_id": h.chunk_id.0, + "score_in_unit_interval": (0.0..=1.0).contains(&h.score), + "payload": h.payload, + })) + .collect::>() ); let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -75,25 +77,25 @@ fn vector_hits_snapshot_run_1() { if std::env::var_os("KEBAB_UPDATE_SNAPSHOTS").is_some() { std::fs::create_dir_all(fixture.parent().unwrap()).unwrap(); - std::fs::write(&fixture, serde_json::to_string_pretty(&actual).unwrap()) - .unwrap(); + std::fs::write(&fixture, serde_json::to_string_pretty(&actual).unwrap()).unwrap(); return; } let expected: serde_json::Value = - 
serde_json::from_str(&std::fs::read_to_string(&fixture).unwrap_or_else( - |_| panic!( + serde_json::from_str(&std::fs::read_to_string(&fixture).unwrap_or_else(|_| { + panic!( "missing snapshot fixture at {}; run with KEBAB_UPDATE_SNAPSHOTS=1 to create", fixture.display() - ), - )) + ) + })) .unwrap(); // Refuse to silently "pass" when the fixture is the committed // placeholder. The placeholder JSON carries a `_comment` field // with regeneration instructions; production fixtures (a captured // hits array) do not. - assert!(!expected.get("_comment").is_some(), + assert!( + !expected.get("_comment").is_some(), "snapshot fixture is a placeholder — regenerate on AVX hardware then commit. \ Path: {}. To regenerate: \ `KEBAB_UPDATE_SNAPSHOTS=1 cargo test -p kb-store-vector -- --ignored snapshot`.", diff --git a/crates/kebab-store-vector/tests/upsert_search.rs b/crates/kebab-store-vector/tests/upsert_search.rs index cdd83f5..3f6ea3b 100644 --- a/crates/kebab-store-vector/tests/upsert_search.rs +++ b/crates/kebab-store-vector/tests/upsert_search.rs @@ -124,9 +124,19 @@ fn dimension_mismatch_errors_and_writes_nothing() { // First populate a 4-D table with one row so it exists on disk. 
let r0 = make_record(0, 0, dir(0), "first", &[], MODEL); - env.seed_chunk(&r0.chunk_id.0, &r0.doc_id.0, "notes/0.md", "en", &[], "primary"); + env.seed_chunk( + &r0.chunk_id.0, + &r0.doc_id.0, + "notes/0.md", + "en", + &[], + "primary", + ); env.vector.upsert(&[r0]).unwrap(); - assert_eq!(env.vector.ensure_table(&model, 4).unwrap(), env.vector.ensure_table(&model, 4).unwrap()); + assert_eq!( + env.vector.ensure_table(&model, 4).unwrap(), + env.vector.ensure_table(&model, 4).unwrap() + ); // Now manually open the same table_name path and try to upsert // an 8-D vector through `upsert` — the table name function bakes @@ -138,14 +148,20 @@ fn dimension_mismatch_errors_and_writes_nothing() { // Pretend this is a 4-D vector for table-name purposes; the // build_batch then enforces that vector.len() == dim and bails. bad.dimensions = 4; - env.seed_chunk(&bad.chunk_id.0, &bad.doc_id.0, "notes/1.md", "en", &[], "primary"); + env.seed_chunk( + &bad.chunk_id.0, + &bad.doc_id.0, + "notes/1.md", + "en", + &[], + "primary", + ); let bad_chunk = bad.chunk_id.0.clone(); let err = env.vector.upsert(&[bad]).unwrap_err(); let msg = format!("{err:#}"); assert!( - msg.to_lowercase().contains("dim") - || msg.contains("does not match table dim"), + msg.to_lowercase().contains("dim") || msg.contains("does not match table dim"), "unexpected error message: {msg}" ); @@ -161,7 +177,10 @@ fn dimension_mismatch_errors_and_writes_nothing() { |r| r.get(0), ) .unwrap(); - assert_eq!(committed, 0, "bad record reached committed state despite dim mismatch"); + assert_eq!( + committed, 0, + "bad record reached committed state despite dim mismatch" + ); } #[test] @@ -221,9 +240,7 @@ fn model_isolation_two_models_two_directories() { // Same chunk_id, different model — should land in a separate table. 
let mut r2 = make_record(0xaa, 0xaa, dir(0), "alpha", &[], "model-B"); - r2.embedding_id = kebab_core::EmbeddingId( - "ee01ee01ee01ee01ee01ee01ee01ee01".to_string(), - ); + r2.embedding_id = kebab_core::EmbeddingId("ee01ee01ee01ee01ee01ee01ee01ee01".to_string()); env.vector.upsert(&[r2]).unwrap(); // Two on-disk Lance directories, distinguished by table name. @@ -233,14 +250,8 @@ fn model_isolation_two_models_two_directories() { .filter_map(Result::ok) .map(|e| e.file_name().to_string_lossy().into_owned()) .collect(); - let a_count = entries - .iter() - .filter(|e| e.contains("model-A")) - .count(); - let b_count = entries - .iter() - .filter(|e| e.contains("model-B")) - .count(); + let a_count = entries.iter().filter(|e| e.contains("model-A")).count(); + let b_count = entries.iter().filter(|e| e.contains("model-B")).count(); assert!(a_count >= 1, "model-A table missing: {entries:?}"); assert!(b_count >= 1, "model-B table missing: {entries:?}"); @@ -351,7 +362,10 @@ fn upsert_retry_promotes_pending_to_committed() { |r| Ok((r.get(0)?, r.get(1)?)), ) .unwrap(); - assert_eq!(status, "pending", "row should be at status=pending after phase-1-only"); + assert_eq!( + status, "pending", + "row should be at status=pending after phase-1-only" + ); assert_eq!(committed, 0); } diff --git a/crates/kebab-tui/src/app.rs b/crates/kebab-tui/src/app.rs index 8201780..50ca759 100644 --- a/crates/kebab-tui/src/app.rs +++ b/crates/kebab-tui/src/app.rs @@ -289,7 +289,6 @@ impl Default for AskState { } } - /// What the Inspect pane is currently showing — owned by p9-4. #[derive(Clone, Debug)] pub enum InspectTarget { @@ -512,10 +511,7 @@ impl App { /// Marked `#[doc(hidden)]` because it is a test seam, not part /// of the official UI API. 
#[doc(hidden)] - pub fn populate_library_for_testing( - &mut self, - docs: Vec, - ) { + pub fn populate_library_for_testing(&mut self, docs: Vec) { self.library.inner.docs = docs; self.library.inner.needs_refresh = false; let len = self.library.inner.docs.len(); diff --git a/crates/kebab-tui/src/ask.rs b/crates/kebab-tui/src/ask.rs index 836cb4b..287b7c2 100644 --- a/crates/kebab-tui/src/ask.rs +++ b/crates/kebab-tui/src/ask.rs @@ -92,7 +92,8 @@ fn render_input(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme::T // place_cursor_x sums in usize (avoiding u16 wrap) and clamps to // the right edge of the inner area. let prompt_w = crate::input::display_width(PROMPT); - let cursor_x = crate::input::place_cursor_x(inner.x, inner.width, prompt_w, s.input.cursor_col()); + let cursor_x = + crate::input::place_cursor_x(inner.x, inner.width, prompt_w, s.input.cursor_col()); f.set_cursor_position((cursor_x, inner.y)); } @@ -101,7 +102,11 @@ fn render_answer(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme:: "transcript".to_string() } else { let count = s.turns.len() + usize::from(s.streaming); - format!("transcript ({} turn{})", count, if count == 1 { "" } else { "s" }) + format!( + "transcript ({} turn{})", + count, + if count == 1 { "" } else { "s" } + ) }; let block = Block::default().title(title).borders(Borders::ALL); @@ -165,8 +170,7 @@ fn render_answer(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme:: let para = Paragraph::new(lines).wrap(Wrap { trim: false }); let scroll = if s.follow_tail { let total_lines = para.line_count(inner.width); - u16::try_from(total_lines.saturating_sub(inner.height as usize)) - .unwrap_or(u16::MAX) + u16::try_from(total_lines.saturating_sub(inner.height as usize)).unwrap_or(u16::MAX) } else { s.scroll }; @@ -265,17 +269,16 @@ fn render_status(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme:: // live answers (PR-9c-2 wires the gate), but the // match must stay exhaustive so the new 
variants // compile without `_ => unreachable!()`. - Some(RefusalReason::NliVerificationFailed) => { - " refusal=nli_verification_failed" - } - Some(RefusalReason::NliModelUnavailable) => { - " refusal=nli_model_unavailable" - } + Some(RefusalReason::NliVerificationFailed) => " refusal=nli_verification_failed", + Some(RefusalReason::NliModelUnavailable) => " refusal=nli_model_unavailable", None => "", }; let mut lines = vec![ Line::from(format!("grounded {grounded} model {}", a.model.id)), - Line::from(format!("prompt {} mode {mode}", a.prompt_template_version.0)), + Line::from(format!( + "prompt {} mode {mode}", + a.prompt_template_version.0 + )), Line::from(format!( "k={} used={}/{}{refusal}", a.retrieval.k, a.retrieval.chunks_used, a.retrieval.chunks_returned @@ -305,8 +308,17 @@ fn render_status(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme:: f.render_widget(Paragraph::new(lines).block(block), area); } -fn render_citations_or_explain(f: &mut Frame, area: Rect, s: &AskState, theme: &crate::theme::Theme) { - let title = if s.explain { "explain (per-claim)" } else { "citations" }; +fn render_citations_or_explain( + f: &mut Frame, + area: Rect, + s: &AskState, + theme: &crate::theme::Theme, +) { + let title = if s.explain { + "explain (per-claim)" + } else { + "citations" + }; let block = Block::default().title(title).borders(Borders::ALL); let lines: Vec = match &s.last_answer { None => vec![Line::from(Span::styled( @@ -314,7 +326,11 @@ fn render_citations_or_explain(f: &mut Frame, area: Rect, s: &AskState, theme: & theme.style(crate::theme::Role::Hint), ))], Some(a) if a.citations.is_empty() => vec![Line::from(Span::styled( - if a.grounded { "(no citations)" } else { "(가까운 후보 없음)" }, + if a.grounded { + "(no citations)" + } else { + "(가까운 후보 없음)" + }, theme.style(crate::theme::Role::Hint), ))], Some(a) => a @@ -406,11 +422,9 @@ pub fn handle_key_ask(state: &mut App, key: KeyEvent) -> KeyOutcome { // no-op. 
Otherwise the new worker would race the // detached one against the same Ollama endpoint and // the stream output would interleave. - if state - .ask - .as_ref() - .is_none_or(|s| s.streaming || s.thread.is_some() || s.input.as_str().trim().is_empty()) - { + if state.ask.as_ref().is_none_or(|s| { + s.streaming || s.thread.is_some() || s.input.as_str().trim().is_empty() + }) { return KeyOutcome::Continue; } spawn_ask_worker(state); @@ -573,8 +587,7 @@ fn spawn_ask_worker(state: &mut App) { turn_index: Some(turn_index), multi_hop, }; - let handle = - thread::spawn(move || kebab_app::ask_with_config(cfg, &query, opts)); + let handle = thread::spawn(move || kebab_app::ask_with_config(cfg, &query, opts)); s.thread = Some(handle); } @@ -645,8 +658,7 @@ pub(crate) fn poll_worker(state: &mut App) { } Ok(Err(e)) => { s.last_error = Some(format!("{e:#}")); - state.error_overlay = - Some(crate::error_popup::ErrorOverlay::from_anyhow(&e)); + state.error_overlay = Some(crate::error_popup::ErrorOverlay::from_anyhow(&e)); } Err(panic_payload) => { let msg = panic_payload @@ -655,11 +667,10 @@ pub(crate) fn poll_worker(state: &mut App) { .or_else(|| panic_payload.downcast_ref::().cloned()) .unwrap_or_else(|| "ask worker panicked".to_string()); s.last_error = Some(msg.clone()); - state.error_overlay = - Some(crate::error_popup::ErrorOverlay::from_message( - "ask worker panic", - msg, - )); + state.error_overlay = Some(crate::error_popup::ErrorOverlay::from_message( + "ask worker panic", + msg, + )); } } } diff --git a/crates/kebab-tui/src/cheatsheet.rs b/crates/kebab-tui/src/cheatsheet.rs index 4a69f6b..4f26e8d 100644 --- a/crates/kebab-tui/src/cheatsheet.rs +++ b/crates/kebab-tui/src/cheatsheet.rs @@ -46,67 +46,99 @@ pub fn render_cheatsheet(f: &mut Frame, area: Rect, app: &App) { let mut lines: Vec = Vec::new(); lines.push(Line::from(Span::styled( "kebab TUI — keymap (F1 / Esc to close)", - app.theme - .style(Role::Heading) - .add_modifier(Modifier::BOLD), + 
app.theme.style(Role::Heading).add_modifier(Modifier::BOLD), ))); lines.push(Line::from("")); - push_section(&mut lines, &app.theme, "Global", &[ - ("i", "Normal → Insert (every pane — p9-fb-21)"), - ("Esc", "Insert → Normal (any pane)"), - ("F1", "toggle this cheatsheet"), - ("Tab / Shift-Tab", "(future) cycle pane"), - ]); + push_section( + &mut lines, + &app.theme, + "Global", + &[ + ("i", "Normal → Insert (every pane — p9-fb-21)"), + ("Esc", "Insert → Normal (any pane)"), + ("F1", "toggle this cheatsheet"), + ("Tab / Shift-Tab", "(future) cycle pane"), + ], + ); - push_section(&mut lines, &app.theme, "Library", &[ - ("j / k", "move selection (Normal)"), - ("gg / G", "top / bottom"), - ("f", "filter overlay"), - ("/", "switch to Search"), - ("?", "switch to Ask"), - ("Enter", "inspect selected doc"), - ("r", "background ingest"), - ("q", "quit"), - ]); + push_section( + &mut lines, + &app.theme, + "Library", + &[ + ("j / k", "move selection (Normal)"), + ("gg / G", "top / bottom"), + ("f", "filter overlay"), + ("/", "switch to Search"), + ("?", "switch to Ask"), + ("Enter", "inspect selected doc"), + ("r", "background ingest"), + ("q", "quit"), + ], + ); - push_section(&mut lines, &app.theme, "Search", &[ - ("type", "query (Insert)"), - ("Tab", "cycle search mode (lexical / vector / hybrid)"), - ("Enter", "force search now (skip debounce)"), - ("j / k", "move selection (Normal)"), - ("← / →", "move cursor in query (p9-fb-22)"), - ("Home / End", "cursor to start / end of query"), - ("Delete", "remove char at cursor"), - ("g", "open hit's citation in $EDITOR (Normal)"), - ("o", "inspect selected hit's chunk (Normal — was `i` pre-fb-21)"), - ("t", "open retrieval trace popup (Normal — p9-fb-37)"), - ("i", "Normal → Insert (toggle back to typing)"), - ("Esc", "back to Library"), - ]); + push_section( + &mut lines, + &app.theme, + "Search", + &[ + ("type", "query (Insert)"), + ("Tab", "cycle search mode (lexical / vector / hybrid)"), + ("Enter", "force search now 
(skip debounce)"), + ("j / k", "move selection (Normal)"), + ("← / →", "move cursor in query (p9-fb-22)"), + ("Home / End", "cursor to start / end of query"), + ("Delete", "remove char at cursor"), + ("g", "open hit's citation in $EDITOR (Normal)"), + ( + "o", + "inspect selected hit's chunk (Normal — was `i` pre-fb-21)", + ), + ("t", "open retrieval trace popup (Normal — p9-fb-37)"), + ("i", "Normal → Insert (toggle back to typing)"), + ("Esc", "back to Library"), + ], + ); - push_section(&mut lines, &app.theme, "Ask", &[ - ("type", "question (Insert)"), - ("Enter", "submit"), - ("e", "toggle explain mode (Normal)"), - ("F2", "toggle multi-hop pipeline (p9-fb-41 — affects next submission)"), - ("j / k", "scroll transcript (Normal — disengages auto-tail)"), - ("Shift-G", "jump to bottom + re-engage auto-tail (p9-fb-22)"), - ("PgUp / PgDn", "page-scroll the transcript (p9-fb-24, disengages auto-tail)"), - ("← / →", "move cursor in input (p9-fb-22)"), - ("Home / End", "cursor to start / end of input"), - ("Delete", "remove char at cursor"), - ("i", "Normal → Insert (toggle back to typing)"), - ("Ctrl-L", "new conversation (clears turns)"), - ("Esc", "back to Library (cancels in-flight worker)"), - ]); + push_section( + &mut lines, + &app.theme, + "Ask", + &[ + ("type", "question (Insert)"), + ("Enter", "submit"), + ("e", "toggle explain mode (Normal)"), + ( + "F2", + "toggle multi-hop pipeline (p9-fb-41 — affects next submission)", + ), + ("j / k", "scroll transcript (Normal — disengages auto-tail)"), + ("Shift-G", "jump to bottom + re-engage auto-tail (p9-fb-22)"), + ( + "PgUp / PgDn", + "page-scroll the transcript (p9-fb-24, disengages auto-tail)", + ), + ("← / →", "move cursor in input (p9-fb-22)"), + ("Home / End", "cursor to start / end of input"), + ("Delete", "remove char at cursor"), + ("i", "Normal → Insert (toggle back to typing)"), + ("Ctrl-L", "new conversation (clears turns)"), + ("Esc", "back to Library (cancels in-flight worker)"), + ], + ); - 
push_section(&mut lines, &app.theme, "Inspect", &[ - ("j / k", "scroll lines"), - ("PgUp / PgDn", "scroll pages"), - ("c", "collapse / expand all sections"), - ("Esc / q", "back to originating pane"), - ]); + push_section( + &mut lines, + &app.theme, + "Inspect", + &[ + ("j / k", "scroll lines"), + ("PgUp / PgDn", "scroll pages"), + ("c", "collapse / expand all sections"), + ("Esc / q", "back to originating pane"), + ], + ); // Pane footer: which pane is currently focused (helps the // reader correlate \"the keys above\" with their current diff --git a/crates/kebab-tui/src/error_popup.rs b/crates/kebab-tui/src/error_popup.rs index 0e20d94..b8d2d4c 100644 --- a/crates/kebab-tui/src/error_popup.rs +++ b/crates/kebab-tui/src/error_popup.rs @@ -48,7 +48,11 @@ pub fn render_error_overlay(f: &mut Frame, area: Rect, overlay: &ErrorOverlay, t let mut lines: Vec = Vec::with_capacity(overlay.chain.len() + 2); lines.push(Line::from(Span::styled( - format!("{}: {}", overlay.title, overlay.chain.first().map_or("(unknown)", String::as_str)), + format!( + "{}: {}", + overlay.title, + overlay.chain.first().map_or("(unknown)", String::as_str) + ), theme.style(Role::Error).add_modifier(Modifier::BOLD), ))); for cause in overlay.chain.iter().skip(1) { @@ -64,7 +68,9 @@ pub fn render_error_overlay(f: &mut Frame, area: Rect, overlay: &ErrorOverlay, t .title("error") .borders(Borders::ALL) .border_style(theme.style(Role::Error)); - let para = Paragraph::new(lines).block(block).wrap(Wrap { trim: false }); + let para = Paragraph::new(lines) + .block(block) + .wrap(Wrap { trim: false }); f.render_widget(para, popup_area); } diff --git a/crates/kebab-tui/src/ingest_progress.rs b/crates/kebab-tui/src/ingest_progress.rs index 7ce3e75..eab7f78 100644 --- a/crates/kebab-tui/src/ingest_progress.rs +++ b/crates/kebab-tui/src/ingest_progress.rs @@ -110,9 +110,7 @@ fn apply_event(state: &mut IngestState, event: IngestEvent) { state.current_idx = idx; state.current_path = Some(path); } - 
IngestEvent::AssetFinished { - result, chunks, .. - } => { + IngestEvent::AssetFinished { result, chunks, .. } => { // Per-asset counter increments mirror the way // `kebab-app::ingest_with_config_progress` aggregates the // final report — kept in sync so the status bar's running @@ -178,7 +176,9 @@ pub fn status_line(state: &IngestState) -> String { let elapsed = state.started_at.elapsed(); let secs = elapsed.as_secs(); if state.aborted { - let skipped_breakdown = kebab_app::ingest_progress::render_skipped_breakdown(&state.counts.skipped_by_extension); + let skipped_breakdown = kebab_app::ingest_progress::render_skipped_breakdown( + &state.counts.skipped_by_extension, + ); return format!( "✗ ingest aborted at {}/{} after {}s (new={} updated={} unchanged={} skipped={}{} errors={})", state.counts.scanned.saturating_sub(state.counts.errors), @@ -192,7 +192,9 @@ pub fn status_line(state: &IngestState) -> String { state.counts.errors, ); } - let skipped_breakdown = kebab_app::ingest_progress::render_skipped_breakdown(&state.counts.skipped_by_extension); + let skipped_breakdown = kebab_app::ingest_progress::render_skipped_breakdown( + &state.counts.skipped_by_extension, + ); return format!( "✓ ingest: {} docs ({} new, {} updated, {} unchanged, {} skipped{}), {} chunks indexed in {}s", state.counts.scanned, @@ -209,7 +211,8 @@ pub fn status_line(state: &IngestState) -> String { let secs = state.started_at.elapsed().as_secs(); return format!("ingest: scanning… [{secs}s]"); } - let pct = u64::from(state.current_idx).saturating_mul(100) / u64::from(state.counts.scanned.max(1)); + let pct = + u64::from(state.current_idx).saturating_mul(100) / u64::from(state.counts.scanned.max(1)); let elapsed = state.started_at.elapsed(); let mm = elapsed.as_secs() / 60; let ss = elapsed.as_secs() % 60; @@ -295,7 +298,12 @@ mod tests { chunks_indexed: 50, ..Default::default() }; - apply_event(&mut s, IngestEvent::Completed { counts: final_counts.clone() }); + apply_event( + &mut s, + 
IngestEvent::Completed { + counts: final_counts.clone(), + }, + ); assert_eq!(s.counts, final_counts); assert!(s.terminal_at.is_some()); assert!(!s.aborted); @@ -447,8 +455,7 @@ mod tests { #[test] fn status_line_aborted_includes_skipped_breakdown() { let mut s = fresh_state(); - let skipped_by_extension = - std::collections::BTreeMap::from([("pdf".to_string(), 2u32)]); + let skipped_by_extension = std::collections::BTreeMap::from([("pdf".to_string(), 2u32)]); let counts = AggregateCounts { scanned: 5, skipped: 2, diff --git a/crates/kebab-tui/src/input.rs b/crates/kebab-tui/src/input.rs index 7e63e9f..85dff19 100644 --- a/crates/kebab-tui/src/input.rs +++ b/crates/kebab-tui/src/input.rs @@ -48,8 +48,7 @@ pub fn place_cursor_x(inner_x: u16, inner_width: u16, prompt_w: usize, cursor_co let raw = (inner_x as usize) .saturating_add(prompt_w) .saturating_add(cursor_col); - let max = (inner_x as usize) - .saturating_add(inner_width.saturating_sub(1) as usize); + let max = (inner_x as usize).saturating_add(inner_width.saturating_sub(1) as usize); raw.min(max).try_into().unwrap_or(u16::MAX) } @@ -472,9 +471,9 @@ mod tests { fn input_buffer_insert_at_cursor_mid_string() { let mut b = InputBuffer::new(); b.push_str("abc"); - b.move_left(); // cursor between b and c - b.move_left(); // cursor between a and b - b.push_char('X'); // insert X between a and b + b.move_left(); // cursor between b and c + b.move_left(); // cursor between a and b + b.push_char('X'); // insert X between a and b assert_eq!(b.as_str(), "aXbc"); assert_eq!(b.cursor_col(), 2); } @@ -484,9 +483,9 @@ mod tests { fn input_buffer_backspace_at_cursor() { let mut b = InputBuffer::new(); b.push_str("abcde"); - b.move_left(); // cursor between d and e - b.move_left(); // cursor between c and d - b.pop_char(); // delete c + b.move_left(); // cursor between d and e + b.move_left(); // cursor between c and d + b.pop_char(); // delete c assert_eq!(b.as_str(), "abde"); assert_eq!(b.cursor_col(), 2); } @@ -528,13 
+527,13 @@ mod tests { #[test] fn input_buffer_cursor_col_after_mixed_hangul_edits() { let mut b = InputBuffer::new(); - b.push_str("a한b"); // cursor at end, col = 1 + 2 + 1 = 4 + b.push_str("a한b"); // cursor at end, col = 1 + 2 + 1 = 4 assert_eq!(b.cursor_col(), 4); - b.move_left(); // before 'b': col = 3 + b.move_left(); // before 'b': col = 3 assert_eq!(b.cursor_col(), 3); - b.move_left(); // before '한': col = 1 + b.move_left(); // before '한': col = 1 assert_eq!(b.cursor_col(), 1); - b.push_char('글'); // insert 글 → "a글한b", cursor between 글 and 한, col = 1 + 2 = 3 + b.push_char('글'); // insert 글 → "a글한b", cursor between 글 and 한, col = 1 + 2 = 3 assert_eq!(b.as_str(), "a글한b"); assert_eq!(b.cursor_col(), 3); } diff --git a/crates/kebab-tui/src/inspect.rs b/crates/kebab-tui/src/inspect.rs index 3b88214..fab91dc 100644 --- a/crates/kebab-tui/src/inspect.rs +++ b/crates/kebab-tui/src/inspect.rs @@ -43,7 +43,9 @@ pub fn render_inspect(f: &mut Frame, area: Rect, state: &App) { return; }; if s.loading { - let block = RBlock::default().title("Inspect — loading…").borders(Borders::ALL); + let block = RBlock::default() + .title("Inspect — loading…") + .borders(Borders::ALL); f.render_widget(block, area); return; } @@ -59,9 +61,7 @@ pub fn render_inspect(f: &mut Frame, area: Rect, state: &App) { render_chunk(f, area, s, chunk, &state.theme); } _ => { - let block = RBlock::default() - .title("Inspect") - .borders(Borders::ALL); + let block = RBlock::default().title("Inspect").borders(Borders::ALL); let hint = Paragraph::new(Span::styled( "(no target — return to Library and press Enter on a doc, \ or to Search and press `i` on a hit)", @@ -83,10 +83,7 @@ fn render_doc( ) { let lines = build_doc_lines(s, doc, theme, threshold_days); let block = RBlock::default() - .title(format!( - "Inspect Doc — {}", - short_id(&doc.doc_id.0) - )) + .title(format!("Inspect Doc — {}", short_id(&doc.doc_id.0))) .borders(Borders::ALL); let para = Paragraph::new(lines) .wrap(Wrap { trim: false }) 
@@ -94,13 +91,16 @@ fn render_doc( f.render_widget(para.block(block), area); } -fn render_chunk(f: &mut Frame, area: Rect, s: &InspectState, chunk: &Chunk, theme: &crate::theme::Theme) { +fn render_chunk( + f: &mut Frame, + area: Rect, + s: &InspectState, + chunk: &Chunk, + theme: &crate::theme::Theme, +) { let lines = build_chunk_lines(s, chunk, theme); let block = RBlock::default() - .title(format!( - "Inspect Chunk — {}", - short_id(&chunk.chunk_id.0) - )) + .title(format!("Inspect Chunk — {}", short_id(&chunk.chunk_id.0))) .borders(Borders::ALL); let para = Paragraph::new(lines) .wrap(Wrap { trim: false }) @@ -158,9 +158,7 @@ pub(crate) fn build_doc_lines<'a>( lines.push(kv("updated_at", &fmt_dt(&doc.metadata.updated_at), theme)); // user metadata pretty-printed JSON if let Ok(pretty) = - serde_json::to_string_pretty(&serde_json::Value::Object( - doc.metadata.user.clone(), - )) + serde_json::to_string_pretty(&serde_json::Value::Object(doc.metadata.user.clone())) { for line in pretty.lines() { lines.push(Line::from(format!(" {line}"))); @@ -197,20 +195,11 @@ pub(crate) fn build_doc_lines<'a>( // blocks — section header carries the count inline so a // collapsed view still reports "how many" without leaking // body lines (R1 review: count must collapse with the rest). 
- push_section_header_with_count( - &mut lines, - SECTION_BLOCKS, - s, - Some(doc.blocks.len()), - theme, - ); + push_section_header_with_count(&mut lines, SECTION_BLOCKS, s, Some(doc.blocks.len()), theme); if !s.collapsed.contains(SECTION_BLOCKS) { let preview_n = 16.min(doc.blocks.len()); for (i, b) in doc.blocks.iter().take(preview_n).enumerate() { - lines.push(Line::from(format!( - " [{i}] {}", - describe_block(b) - ))); + lines.push(Line::from(format!(" [{i}] {}", describe_block(b)))); } if doc.blocks.len() > preview_n { lines.push(Line::from(Span::styled( @@ -240,7 +229,11 @@ pub(crate) fn build_chunk_lines<'a>( }, theme, )); - lines.push(header_kv("chunker_version", &chunk.chunker_version.0, theme)); + lines.push(header_kv( + "chunker_version", + &chunk.chunker_version.0, + theme, + )); lines.push(header_kv("policy_hash", &chunk.policy_hash, theme)); lines.push(header_kv( "token_estimate", @@ -337,10 +330,7 @@ fn header_kv_with_stale( fn kv(k: &str, v: &str, theme: &crate::theme::Theme) -> Line<'static> { Line::from(vec![ - Span::styled( - format!(" {k}: "), - theme.style(crate::theme::Role::Hint), - ), + Span::styled(format!(" {k}: "), theme.style(crate::theme::Role::Hint)), Span::raw(v.to_string()), ]) } @@ -418,11 +408,7 @@ fn describe_block(b: &Block) -> String { c.lang.as_deref().unwrap_or("?"), c.code.len() ), - Block::Table(t) => format!( - "Table: {} cols × {} rows", - t.headers.len(), - t.rows.len() - ), + Block::Table(t) => format!("Table: {} cols × {} rows", t.headers.len(), t.rows.len()), Block::ImageRef(i) => format!( "ImageRef: src={} alt={:?} ocr={}", i.src, diff --git a/crates/kebab-tui/src/lib.rs b/crates/kebab-tui/src/lib.rs index 1457c1e..186ece1 100644 --- a/crates/kebab-tui/src/lib.rs +++ b/crates/kebab-tui/src/lib.rs @@ -29,19 +29,19 @@ mod terminal; mod theme; pub mod trace_popup; -pub use input::{InputBuffer, display_width, place_cursor_x, truncate_to_display_width}; -pub use theme::{Palette, Role, Theme}; pub use app::{ - App, 
AskState, IngestState, InspectState, InspectTarget, KeyOutcome, LibraryState, Mode, - Pane, SearchState, SearchWorkerMessage, TERMINAL_LINE_HOLD_SECS, + App, AskState, IngestState, InspectState, InspectTarget, KeyOutcome, LibraryState, Mode, Pane, + SearchState, SearchWorkerMessage, TERMINAL_LINE_HOLD_SECS, }; pub use ask::{handle_key_ask, render_ask}; pub use error_popup::{ErrorOverlay, render_error_overlay}; pub use ingest_progress::{ cancel_running_ingest, drain_progress, ready_to_clear, start_ingest, status_line, }; +pub use input::{InputBuffer, display_width, place_cursor_x, truncate_to_display_width}; pub use inspect::{enter_inspect, handle_key_inspect, render_inspect}; pub use library::{handle_key_library, render_library}; +pub use theme::{Palette, Role, Theme}; // `editor::with_external_program` and `search::jump_to_citation` // stay `pub(crate)` — they take the internal `TuiTerminal` handle, // which is intentionally module-private (its `Drop` lifecycle is the @@ -53,8 +53,8 @@ pub use search::{build_jump_command, handle_key_search, render_search}; // without spawning the real thread (they inject a // `SearchWorkerMessage` directly via a channel they construct in // the test) and can pin the in-flight-skip invariant of debounce. -pub use search::poll_worker as poll_search_worker; pub use search::debounce_due as search_debounce_due; +pub use search::poll_worker as poll_search_worker; // p9-fb-12: expose the global mode-toggle intercept so integration // tests can pin the i/Esc behavior without standing up the full // run loop. 
diff --git a/crates/kebab-tui/src/library.rs b/crates/kebab-tui/src/library.rs index 2bf2e30..2a8bb8b 100644 --- a/crates/kebab-tui/src/library.rs +++ b/crates/kebab-tui/src/library.rs @@ -92,7 +92,11 @@ impl FilterEdit { if let Some(lang) = filter.lang.as_ref() { lang_buf.push_str(&lang.0); } - Self { field: FilterField::Tags, tags_buf, lang_buf } + Self { + field: FilterField::Tags, + tags_buf, + lang_buf, + } } pub fn commit_into(&self, filter: &mut DocFilter) { @@ -145,7 +149,12 @@ fn filter_overlay_height(state: &App) -> u16 { const LABEL_TAGS: &str = "tags_any (csv): "; const LABEL_LANG: &str = "lang: "; -fn render_filter_overlay(f: &mut Frame, area: Rect, edit: &FilterEdit, theme: &crate::theme::Theme) { +fn render_filter_overlay( + f: &mut Frame, + area: Rect, + edit: &FilterEdit, + theme: &crate::theme::Theme, +) { let block = Block::default() .title("Filter (Tab=cycle field, Enter=apply, Esc=cancel)") .borders(Borders::ALL); @@ -153,8 +162,18 @@ fn render_filter_overlay(f: &mut Frame, area: Rect, edit: &FilterEdit, theme: &c f.render_widget(block, area); let lines = vec![ - line_with_focus(LABEL_TAGS, edit.tags_buf.as_str(), edit.field == FilterField::Tags, theme), - line_with_focus(LABEL_LANG, edit.lang_buf.as_str(), edit.field == FilterField::Lang, theme), + line_with_focus( + LABEL_TAGS, + edit.tags_buf.as_str(), + edit.field == FilterField::Tags, + theme, + ), + line_with_focus( + LABEL_LANG, + edit.lang_buf.as_str(), + edit.field == FilterField::Lang, + theme, + ), ]; let para = Paragraph::new(lines); f.render_widget(para, inner); @@ -171,7 +190,8 @@ fn render_filter_overlay(f: &mut Frame, area: Rect, edit: &FilterEdit, theme: &c FilterField::Lang => (LABEL_LANG, &edit.lang_buf, 1u16), }; let label_w = display_width(label); - let cursor_x = crate::input::place_cursor_x(inner.x, inner.width, label_w, focused_buf.cursor_col()); + let cursor_x = + crate::input::place_cursor_x(inner.x, inner.width, label_w, focused_buf.cursor_col()); 
f.set_cursor_position((cursor_x, inner.y + row_offset)); } @@ -365,8 +385,7 @@ pub fn handle_key_library(state: &mut App, key: KeyEvent) -> KeyOutcome { // (e.g. "ingest already running") surface via the error // overlay. if let Err(e) = crate::ingest_progress::start_ingest(state) { - state.error_overlay = - Some(crate::ErrorOverlay::from_anyhow(&e)); + state.error_overlay = Some(crate::ErrorOverlay::from_anyhow(&e)); } KeyOutcome::Continue } @@ -472,10 +491,8 @@ fn set_selection(inner: &mut LibraryStateInner, idx: usize) { /// because the run loop owns the call site. pub(crate) fn refresh_docs(state: &mut App) -> anyhow::Result<()> { state.library.inner.loading = true; - let result = kebab_app::list_docs_with_config( - state.config.clone(), - state.library.inner.filter.clone(), - ); + let result = + kebab_app::list_docs_with_config(state.config.clone(), state.library.inner.filter.clone()); state.library.inner.loading = false; match result { Ok(docs) => { @@ -560,11 +577,7 @@ mod tests { fn format_doc_header_aligns_with_format_doc_row() { let title_w = 30; let header = format_doc_header(title_w); - let header_text: String = header - .spans - .iter() - .map(|sp| sp.content.as_ref()) - .collect(); + let header_text: String = header.spans.iter().map(|sp| sp.content.as_ref()).collect(); assert!(header_text.contains("TITLE"), "header has TITLE label"); assert!(header_text.contains("TAGS"), "header has TAGS label"); assert!(header_text.contains("UPDATED"), "header has UPDATED label"); diff --git a/crates/kebab-tui/src/markdown.rs b/crates/kebab-tui/src/markdown.rs index b8e65a6..36969fa 100644 --- a/crates/kebab-tui/src/markdown.rs +++ b/crates/kebab-tui/src/markdown.rs @@ -203,19 +203,13 @@ pub fn render(text: &str, theme: &Theme) -> Vec> { // Render raw HTML as text — terminal can't display // tags. Use Hint role so it visually distinguishes // from user-written prose. 
- current.push(Span::styled( - h.into_string(), - theme.style(Role::Hint), - )); + current.push(Span::styled(h.into_string(), theme.style(Role::Hint))); } Event::InlineMath(s) | Event::DisplayMath(s) => { // No LaTeX rendering in a terminal v1, but preserve // the source so the answer's math still reaches the // user as readable text instead of vanishing. - current.push(Span::styled( - s.into_string(), - theme.style(Role::Hint), - )); + current.push(Span::styled(s.into_string(), theme.style(Role::Hint))); } Event::FootnoteReference(label) => { // Render as `[^label]` so the footnote anchor is @@ -411,7 +405,10 @@ mod tests { .flat_map(|l| l.spans.iter()) .filter(|s| s.style.add_modifier.contains(Modifier::ITALIC)) .collect(); - assert!(!italic_spans.is_empty(), "expected at least one ITALIC span"); + assert!( + !italic_spans.is_empty(), + "expected at least one ITALIC span" + ); let combined: String = italic_spans.iter().map(|s| s.content.as_ref()).collect(); assert_eq!(combined, "hi"); } @@ -604,7 +601,10 @@ mod tests { let lines = render(md, &theme()); let texts: Vec = lines.iter().map(line_text).collect(); let heading_idx = texts.iter().position(|t| t.contains("Goal")).unwrap(); - let para_idx = texts.iter().position(|t| t.contains("Description")).unwrap(); + let para_idx = texts + .iter() + .position(|t| t.contains("Description")) + .unwrap(); let alpha_idx = texts.iter().position(|t| t.contains("alpha")).unwrap(); let code_idx = texts.iter().position(|t| t.contains("let x = 1;")).unwrap(); assert!(heading_idx < para_idx); diff --git a/crates/kebab-tui/src/run.rs b/crates/kebab-tui/src/run.rs index f7a156c..2ec42a8 100644 --- a/crates/kebab-tui/src/run.rs +++ b/crates/kebab-tui/src/run.rs @@ -14,9 +14,7 @@ use crate::ask::{drain_stream, handle_key_ask, poll_worker, render_ask}; use crate::error_popup::{ErrorOverlay, render_error_overlay}; use crate::inspect::{handle_key_inspect, refresh_inspect, render_inspect}; use crate::library::{handle_key_library, 
refresh_docs, render_library}; -use crate::search::{ - debounce_due, fire_search, handle_key_search, refresh_preview, render_search, -}; +use crate::search::{debounce_due, fire_search, handle_key_search, refresh_preview, render_search}; use crate::terminal::TuiTerminal; /// Poll interval for crossterm's `event::poll`. Short enough that a @@ -69,10 +67,7 @@ pub(crate) fn run_loop(app: &mut App) -> Result<()> { // current generation's result populates `hits` // / clears `searching` here. crate::search::poll_worker(app); - let due = app - .search - .as_ref() - .is_some_and(debounce_due); + let due = app.search.as_ref().is_some_and(debounce_due); if due { if let Err(e) = fire_search(app) { app.error_overlay = Some(ErrorOverlay::from_anyhow(&e)); @@ -97,10 +92,7 @@ pub(crate) fn run_loop(app: &mut App) -> Result<()> { poll_worker(app); } Pane::Inspect => { - let due = app - .inspect - .as_ref() - .is_some_and(|s| s.needs_fetch); + let due = app.inspect.as_ref().is_some_and(|s| s.needs_fetch); if due { if let Err(e) = refresh_inspect(app) { app.error_overlay = Some(ErrorOverlay::from_anyhow(&e)); @@ -227,10 +219,7 @@ pub(crate) fn run_loop(app: &mut App) -> Result<()> { /// Stub key handler for panes whose authoring task has not landed /// yet. `q` / `Esc` returns to Library; everything else is a no-op. 
-fn handle_key_unimplemented_pane( - app: &mut App, - key: crossterm::event::KeyEvent, -) -> KeyOutcome { +fn handle_key_unimplemented_pane(app: &mut App, key: crossterm::event::KeyEvent) -> KeyOutcome { use crossterm::event::KeyCode; if app.error_overlay.is_some() { app.error_overlay = None; @@ -250,10 +239,10 @@ fn render_root(f: &mut Frame, app: &App) { let outer = Layout::default() .direction(Direction::Vertical) .constraints([ - Constraint::Length(1), // top header - Constraint::Min(1), // pane content - Constraint::Length(1), // status bar - Constraint::Length(1), // key hint bar + Constraint::Length(1), // top header + Constraint::Min(1), // pane content + Constraint::Length(1), // status bar + Constraint::Length(1), // key hint bar ]) .split(f.area()); render_header(f, outer[0], app); @@ -282,7 +271,11 @@ fn render_root(f: &mut Frame, app: &App) { /// p9-fb-37: centered sub-rect helper for the trace popup. Returns /// a rect of `percent_x` × `percent_y` percent of `r`, centered. -fn centered_rect(percent_x: u16, percent_y: u16, r: ratatui::layout::Rect) -> ratatui::layout::Rect { +fn centered_rect( + percent_x: u16, + percent_y: u16, + r: ratatui::layout::Rect, +) -> ratatui::layout::Rect { use ratatui::layout::{Constraint, Direction, Layout}; let popup_layout = Layout::default() .direction(Direction::Vertical) @@ -456,7 +449,7 @@ fn render_key_hints(f: &mut Frame, area: Rect, app: &App) { /// - **Order**: most-frequent verb first; last fragment is always /// the way back out (`Esc`/`q`). pub fn footer_hints(focus: Pane, mode: crate::app::Mode, filter_open: bool) -> &'static str { - use crate::app::Mode::{Normal, Insert}; + use crate::app::Mode::{Insert, Normal}; // p9-fb-21: every hint starts with `F1 도움말` so the cheatsheet // is always one keystroke away — dogfooding feedback was that // the F1 binding itself was undiscoverable. 
@@ -465,22 +458,34 @@ pub fn footer_hints(focus: Pane, mode: crate::app::Mode, filter_open: bool) -> & // captures every key, mode label irrelevant). (Pane::Library, _, true) => "F1 도움말 Tab 필드전환 Enter 적용 Esc 취소", // Library Normal: full navigation surface. - (Pane::Library, Normal, false) => "F1 도움말 ↑/k 위로 ↓/j 아래로 gg 맨위 G 맨아래 f 필터 / 검색 ? 질문 Enter 자세히 r 인덱싱 q 종료", + (Pane::Library, Normal, false) => { + "F1 도움말 ↑/k 위로 ↓/j 아래로 gg 맨위 G 맨아래 f 필터 / 검색 ? 질문 Enter 자세히 r 인덱싱 q 종료" + } // Library Insert: degenerate — nothing types in Library. (Pane::Library, Insert, false) => "F1 도움말 Esc 로 NORMAL 모드", // Search Insert: typing the query is the dominant action. // `i` becomes a typed char here (intercept only fires in // Normal mode); `o` is the chunk-inspect command exposed // via Esc → o (was `i` pre-fb-21). - (Pane::Search, Insert, _) => "F1 도움말 타이핑 검색어 Tab 모드전환 Enter 검색 Esc 로 NORMAL 모드 (j/k 이동 o 인스펙트 g 에디터 i 입력모드)", + (Pane::Search, Insert, _) => { + "F1 도움말 타이핑 검색어 Tab 모드전환 Enter 검색 Esc 로 NORMAL 모드 (j/k 이동 o 인스펙트 g 에디터 i 입력모드)" + } // Search Normal: navigation + commands. - (Pane::Search, Normal, _) => "F1 도움말 ↑/k 위로 ↓/j 아래로 Tab 모드전환 Enter 검색 o 인스펙트 g 에디터 i 입력모드 Esc 뒤로", + (Pane::Search, Normal, _) => { + "F1 도움말 ↑/k 위로 ↓/j 아래로 Tab 모드전환 Enter 검색 o 인스펙트 g 에디터 i 입력모드 Esc 뒤로" + } // Ask Insert: typing the question. - (Pane::Ask, Insert, _) => "F1 도움말 타이핑 질문 Enter 전송 Esc 로 NORMAL 모드 (e 상세 j/k 스크롤 i 입력모드)", + (Pane::Ask, Insert, _) => { + "F1 도움말 타이핑 질문 Enter 전송 Esc 로 NORMAL 모드 (e 상세 j/k 스크롤 i 입력모드)" + } // Ask Normal: scroll + toggle. - (Pane::Ask, Normal, _) => "F1 도움말 e 상세설명 ↑/k 위로 ↓/j 아래로 Enter 전송 Ctrl-L 새대화 i 입력모드 Esc 뒤로", + (Pane::Ask, Normal, _) => { + "F1 도움말 e 상세설명 ↑/k 위로 ↓/j 아래로 Enter 전송 Ctrl-L 새대화 i 입력모드 Esc 뒤로" + } // Inspect Normal (default): scroll + collapse. 
- (Pane::Inspect, Normal, _) => "F1 도움말 ↑/k 위로 ↓/j 아래로 PgUp/PgDn 페이지 c 섹션접기 Esc/q 뒤로", + (Pane::Inspect, Normal, _) => { + "F1 도움말 ↑/k 위로 ↓/j 아래로 PgUp/PgDn 페이지 c 섹션접기 Esc/q 뒤로" + } // Inspect Insert: degenerate. (Pane::Inspect, Insert, _) => "F1 도움말 Esc 로 NORMAL 모드", // Jobs pane: placeholder. @@ -513,8 +518,8 @@ pub fn footer_hints(focus: Pane, mode: crate::app::Mode, filter_open: bool) -> & /// intercept paths by constructing KeyEvents directly without /// standing up the full run loop. pub fn mode_intercept(app: &mut crate::app::App, key: crossterm::event::KeyEvent) -> bool { - use crossterm::event::{KeyCode, KeyModifiers}; use crate::app::Mode; + use crossterm::event::{KeyCode, KeyModifiers}; // Modifier-bearing keys (Ctrl-Esc etc.) are not the toggle. if !key.modifiers.is_empty() && key.modifiers != KeyModifiers::SHIFT { @@ -558,8 +563,7 @@ pub fn mode_intercept(app: &mut crate::app::App, key: crossterm::event::KeyEvent /// full run loop. pub fn cheatsheet_intercept(app: &mut crate::app::App, key: crossterm::event::KeyEvent) -> bool { use crossterm::event::{KeyCode, KeyModifiers}; - let plain_or_shift = - key.modifiers.is_empty() || key.modifiers == KeyModifiers::SHIFT; + let plain_or_shift = key.modifiers.is_empty() || key.modifiers == KeyModifiers::SHIFT; if !plain_or_shift { return false; } @@ -623,7 +627,10 @@ mod footer_hints_tests { // p9-fb-21: every hint now leads with `F1 도움말`. The // "typing verb" (`타이핑 검색어`) follows immediately so it's // still the dominant action visually. 
- assert!(h.starts_with("F1 도움말"), "should lead with F1 도움말: {h}"); + assert!( + h.starts_with("F1 도움말"), + "should lead with F1 도움말: {h}" + ); assert!(h.contains("타이핑 검색어"), "expected 타이핑 검색어: {h}"); assert!(h.contains("Tab 모드전환"), "expected Tab 모드전환: {h}"); assert!(h.contains("Enter 검색"), "expected Enter 검색: {h}"); @@ -634,7 +641,10 @@ mod footer_hints_tests { fn ask_insert_hint_leads_with_typing_verb() { let h = footer_hints(Pane::Ask, Mode::Insert, false); // p9-fb-21: F1 prefix now leads; typing verb is second. - assert!(h.starts_with("F1 도움말"), "should lead with F1 도움말: {h}"); + assert!( + h.starts_with("F1 도움말"), + "should lead with F1 도움말: {h}" + ); assert!(h.contains("타이핑 질문"), "expected 타이핑 질문: {h}"); assert!(h.contains("Enter 전송"), "expected Enter 전송: {h}"); } @@ -667,7 +677,13 @@ mod footer_hints_tests { /// invisible until the user already knew about it. #[test] fn every_hint_starts_with_f1_help_prefix() { - for pane in [Pane::Library, Pane::Search, Pane::Ask, Pane::Inspect, Pane::Jobs] { + for pane in [ + Pane::Library, + Pane::Search, + Pane::Ask, + Pane::Inspect, + Pane::Jobs, + ] { for mode in [Mode::Normal, Mode::Insert] { for filter_open in [false, true] { let h = footer_hints(pane, mode, filter_open); @@ -699,11 +715,20 @@ mod footer_hints_tests { /// covers every arm. 
#[test] fn every_pane_mode_combo_returns_non_empty_hint() { - for pane in [Pane::Library, Pane::Search, Pane::Ask, Pane::Inspect, Pane::Jobs] { + for pane in [ + Pane::Library, + Pane::Search, + Pane::Ask, + Pane::Inspect, + Pane::Jobs, + ] { for mode in [Mode::Normal, Mode::Insert] { for filter_open in [false, true] { let h = footer_hints(pane, mode, filter_open); - assert!(!h.is_empty(), "{pane:?}/{mode:?}/filter={filter_open} empty"); + assert!( + !h.is_empty(), + "{pane:?}/{mode:?}/filter={filter_open} empty" + ); } } } diff --git a/crates/kebab-tui/src/search.rs b/crates/kebab-tui/src/search.rs index 782252c..85618c7 100644 --- a/crates/kebab-tui/src/search.rs +++ b/crates/kebab-tui/src/search.rs @@ -43,10 +43,7 @@ pub fn render_search(f: &mut Frame, area: Rect, state: &App) { let Some(s) = state.search.as_ref() else { // Pane has no state yet — should not happen because the run // loop lazy-inits before render. Defensive empty block. - f.render_widget( - Block::default().title("Search").borders(Borders::ALL), - area, - ); + f.render_widget(Block::default().title("Search").borders(Borders::ALL), area); return; }; @@ -93,7 +90,8 @@ fn render_input_bar(f: &mut Frame, area: Rect, s: &SearchState, theme: &crate::t // unhides the caret for the Search input column. // place_cursor_x sums in usize (avoiding u16 wrap) and clamps to // the right edge of the inner area. 
- let cursor_x = crate::input::place_cursor_x(inner.x, inner.width, prompt_w, s.input.cursor_col()); + let cursor_x = + crate::input::place_cursor_x(inner.x, inner.width, prompt_w, s.input.cursor_col()); f.set_cursor_position((cursor_x, inner.y)); } @@ -169,7 +167,10 @@ fn format_hit_lines(h: &SearchHit, theme: &crate::theme::Theme) -> Vec KeyOutcome { trace: true, ..Default::default() }; - if let Ok(resp) = kebab_app::search_with_opts_with_config(state.config.clone(), q, opts) { + if let Ok(resp) = kebab_app::search_with_opts_with_config(state.config.clone(), q, opts) + { if let Some(t) = resp.trace { state.trace_popup = Some(crate::trace_popup::TracePopupState::new(t)); } @@ -305,8 +307,7 @@ pub fn handle_key_search(state: &mut App, key: KeyEvent) -> KeyOutcome { // `~/...` / `${XDG_…}` expansion via `kebab-config::expand_path` // — same helper used by the markdown / image / PDF ingest // paths (HOTFIXES 2026-05-02 P9-4 follow-up). - let workspace_root = - kebab_config::expand_path(&state.config.workspace.root, ""); + let workspace_root = kebab_config::expand_path(&state.config.workspace.root, ""); state.pending_editor = Some(crate::app::EditorRequest { citation: citation.unwrap(), editor_env: editor, @@ -478,7 +479,8 @@ pub fn build_jump_command( let mut args = leading_args; let editor_basename = std::path::Path::new(&program) - .file_name().map_or_else(|| program.clone(), |s| s.to_string_lossy().into_owned()); + .file_name() + .map_or_else(|| program.clone(), |s| s.to_string_lossy().into_owned()); match citation { Citation::Line { start, .. } => { @@ -557,7 +559,9 @@ fn parse_editor_env(env: &str) -> (String, Vec) { /// the spawn is redundant — wait for the result. /// - dedupe against `last_query` (was already there pre-fb-08, kept). 
pub fn debounce_due(s: &SearchState) -> bool { - let Some(at) = s.input_dirty_at else { return false }; + let Some(at) = s.input_dirty_at else { + return false; + }; let elapsed = (time::OffsetDateTime::now_utc() - at) .try_into() .unwrap_or(Duration::ZERO); @@ -634,10 +638,7 @@ pub(crate) fn fire_search(state: &mut App) -> anyhow::Result<()> { filters: kebab_core::SearchFilters::default(), }; let result = kebab_app::search_with_config(cfg, query); - let _ = tx.send(crate::app::SearchWorkerMessage::Done { - generation, - result, - }); + let _ = tx.send(crate::app::SearchWorkerMessage::Done { generation, result }); }) .map_err(|e| anyhow::anyhow!("spawn search worker: {e}"))?; @@ -651,8 +652,12 @@ pub(crate) fn fire_search(state: &mut App) -> anyhow::Result<()> { /// dropped per the generation-counter contract. `pub` so integration /// tests can drive the stale-result paths by injecting a channel. pub fn poll_worker(state: &mut App) { - let Some(s) = state.search.as_mut() else { return }; - let Some(rx) = s.worker_rx.as_ref() else { return }; + let Some(s) = state.search.as_mut() else { + return; + }; + let Some(rx) = s.worker_rx.as_ref() else { + return; + }; let msg = match rx.try_recv() { Ok(m) => m, Err(std::sync::mpsc::TryRecvError::Empty) => return, @@ -694,10 +699,8 @@ pub fn poll_worker(state: &mut App) { // the user submitted for *this* result set. If // input has drifted since spawn, the gen-check // already returned early. 
- let q_text = - s.last_query.as_ref().map_or("", |(t, _)| t.as_str()); - s.short_query_hint = - kebab_app::short_query_hint(q_text, hits.is_empty()); + let q_text = s.last_query.as_ref().map_or("", |(t, _)| t.as_str()); + s.short_query_hint = kebab_app::short_query_hint(q_text, hits.is_empty()); s.hits = hits; s.selected_hit = 0; s.preview = None; @@ -706,8 +709,7 @@ pub fn poll_worker(state: &mut App) { s.hits.clear(); s.selected_hit = 0; s.short_query_hint = None; - state.error_overlay = - Some(crate::error_popup::ErrorOverlay::from_anyhow(&e)); + state.error_overlay = Some(crate::error_popup::ErrorOverlay::from_anyhow(&e)); } } } @@ -732,4 +734,3 @@ pub(crate) fn refresh_preview(state: &mut App) -> anyhow::Result<()> { s.preview = Some(chunk.text); Ok(()) } - diff --git a/crates/kebab-tui/tests/ask.rs b/crates/kebab-tui/tests/ask.rs index b40c222..487c8a3 100644 --- a/crates/kebab-tui/tests/ask.rs +++ b/crates/kebab-tui/tests/ask.rs @@ -7,9 +7,8 @@ use crossterm::event::{KeyCode, KeyEvent, KeyModifiers}; use kebab_config::Config; use kebab_core::{ - Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, - PromptTemplateVersion, RefusalReason, SearchMode, TokenUsage, TraceId, Turn, - WorkspacePath, + Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, PromptTemplateVersion, + RefusalReason, SearchMode, TokenUsage, TraceId, Turn, WorkspacePath, }; use kebab_tui::{App, AskState, KeyOutcome, Pane, handle_key_ask, render_ask}; use ratatui::Terminal; @@ -90,10 +89,7 @@ fn esc_returns_to_library_and_clears_streaming() { s.streaming = true; s.partial = "partial answer…".into(); } - let outcome = handle_key_ask( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let outcome = handle_key_ask(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::SwitchPane(Pane::Library)); let s = app.ask.as_ref().unwrap(); assert!(!s.streaming); @@ -155,12 +151,20 @@ fn 
jk_scroll_in_normal_mode_type_in_insert() { &mut app, KeyEvent::new(KeyCode::Char('j'), KeyModifiers::NONE), ); - assert_eq!(app.ask.as_ref().unwrap().scroll, 1, "j scrolls down in Normal"); + assert_eq!( + app.ask.as_ref().unwrap().scroll, + 1, + "j scrolls down in Normal" + ); handle_key_ask( &mut app, KeyEvent::new(KeyCode::Char('k'), KeyModifiers::NONE), ); - assert_eq!(app.ask.as_ref().unwrap().scroll, 0, "k scrolls up in Normal"); + assert_eq!( + app.ask.as_ref().unwrap().scroll, + 0, + "k scrolls up in Normal" + ); // Now Insert — j/k type. app.mode = kebab_tui::Mode::Insert; handle_key_ask( @@ -213,10 +217,7 @@ fn e_typed_into_input_when_input_nonempty() { #[test] fn enter_with_empty_input_is_continue() { let mut app = fresh_app(); - let outcome = handle_key_ask( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = handle_key_ask(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::Continue); assert!(!app.ask.as_ref().unwrap().streaming); } @@ -229,10 +230,7 @@ fn enter_while_streaming_is_noop() { s.input.push_str("anything"); s.streaming = true; } - handle_key_ask( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + handle_key_ask(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); // streaming flag remains true (no new worker spawned) assert!(app.ask.as_ref().unwrap().streaming); // No thread spawned because enter was a no-op. 
@@ -335,7 +333,10 @@ fn render_grounded_answer_with_citation() { }) .collect::>() .join("\n"); - assert!(rendered.contains("test answer body"), "answer body rendered"); + assert!( + rendered.contains("test answer body"), + "answer body rendered" + ); assert!(rendered.contains("grounded ✓"), "grounded status visible"); assert!(rendered.contains("notes/foo.md"), "citation path rendered"); assert!(rendered.contains("[1]"), "citation marker rendered"); @@ -346,7 +347,11 @@ fn render_refusal_score_gate_shows_status_without_citation_index_panic() { let mut app = fresh_app(); { let s = app.ask.as_mut().unwrap(); - let mut ans = make_answer(false, Some(RefusalReason::ScoreGate), "insufficient grounding to answer."); + let mut ans = make_answer( + false, + Some(RefusalReason::ScoreGate), + "insufficient grounding to answer.", + ); ans.citations.clear(); // refusal often has no citations s.turns.push(Turn { question: "test refusal question".into(), @@ -374,7 +379,10 @@ fn render_refusal_score_gate_shows_status_without_citation_index_panic() { }) .collect::>() .join("\n"); - assert!(rendered.contains("insufficient grounding"), "refusal body rendered"); + assert!( + rendered.contains("insufficient grounding"), + "refusal body rendered" + ); assert!(rendered.contains("grounded ✗"), "ungrounded status visible"); assert!(rendered.contains("score_gate"), "refusal reason surfaced"); } @@ -535,10 +543,7 @@ fn enter_with_detached_prior_thread_is_blocked() { } })); } - let outcome = handle_key_ask( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = handle_key_ask(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); // Enter is a no-op while a prior thread is attached. 
assert_eq!(outcome, KeyOutcome::Continue); let s = app.ask.as_ref().unwrap(); @@ -693,7 +698,10 @@ fn render_transcript_shows_completed_turns_in_order() { assert!(q1_pos < q2_pos, "chronological order: Q1 before Q2"); assert!(rendered.contains("first question"), "first question text"); assert!(rendered.contains("second answer"), "second answer text"); - assert!(rendered.contains("transcript (2 turns)"), "title shows count"); + assert!( + rendered.contains("transcript (2 turns)"), + "title shows count" + ); } #[test] @@ -772,10 +780,16 @@ fn left_arrow_then_typing_inserts_at_cursor_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Insert; for ch in "abc".chars() { - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE), + ); } handle_key_ask(&mut app, KeyEvent::new(KeyCode::Left, KeyModifiers::NONE)); - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('X'), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('X'), KeyModifiers::NONE), + ); let s = app.ask.as_ref().unwrap(); assert_eq!(s.input.as_str(), "abXc", "X inserts before c, not at end"); assert_eq!(s.input.cursor_col(), 3, "cursor sits between X and c"); @@ -787,7 +801,10 @@ fn left_arrow_then_typing_inserts_at_cursor_in_ask() { fn right_arrow_at_end_is_noop_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Insert; - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('a'), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('a'), KeyModifiers::NONE), + ); handle_key_ask(&mut app, KeyEvent::new(KeyCode::Right, KeyModifiers::NONE)); let s = app.ask.as_ref().unwrap(); assert_eq!(s.input.cursor_col(), 1); @@ -800,7 +817,10 @@ fn home_end_jump_cursor_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Insert; for ch in "hello".chars() { - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char(ch), 
KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE), + ); } handle_key_ask(&mut app, KeyEvent::new(KeyCode::Home, KeyModifiers::NONE)); assert_eq!(app.ask.as_ref().unwrap().input.cursor_col(), 0); @@ -815,7 +835,10 @@ fn delete_key_removes_char_at_cursor_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Insert; for ch in "abc".chars() { - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE), + ); } handle_key_ask(&mut app, KeyEvent::new(KeyCode::Home, KeyModifiers::NONE)); handle_key_ask(&mut app, KeyEvent::new(KeyCode::Delete, KeyModifiers::NONE)); @@ -831,14 +854,20 @@ fn hangul_left_arrow_rewinds_by_two_cols_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Insert; for ch in "한글".chars() { - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char(ch), KeyModifiers::NONE), + ); } assert_eq!(app.ask.as_ref().unwrap().input.cursor_col(), 4); handle_key_ask(&mut app, KeyEvent::new(KeyCode::Left, KeyModifiers::NONE)); assert_eq!(app.ask.as_ref().unwrap().input.cursor_col(), 2); // Inserting at the new cursor position lands between the two // syllables, proving cursor_col is not just a display annotation. 
- handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('X'), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('X'), KeyModifiers::NONE), + ); assert_eq!(app.ask.as_ref().unwrap().input.as_str(), "한X글"); } @@ -859,7 +888,10 @@ fn ask_state_default_follow_tail_is_true() { fn k_disengages_follow_tail_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Normal; - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('k'), KeyModifiers::NONE)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('k'), KeyModifiers::NONE), + ); assert!(!app.ask.as_ref().unwrap().follow_tail); } @@ -875,7 +907,10 @@ fn shift_g_re_engages_follow_tail_in_ask() { s.follow_tail = false; s.scroll = 7; } - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('G'), KeyModifiers::SHIFT)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('G'), KeyModifiers::SHIFT), + ); let s = app.ask.as_ref().unwrap(); assert!(s.follow_tail, "Shift-G re-engages follow-tail"); assert_eq!(s.scroll, 0, "scroll cleared (renderer recomputes)"); @@ -888,7 +923,10 @@ fn ctrl_l_resets_follow_tail_in_ask() { let mut app = fresh_app(); app.mode = kebab_tui::Mode::Normal; app.ask.as_mut().unwrap().follow_tail = false; - handle_key_ask(&mut app, KeyEvent::new(KeyCode::Char('l'), KeyModifiers::CONTROL)); + handle_key_ask( + &mut app, + KeyEvent::new(KeyCode::Char('l'), KeyModifiers::CONTROL), + ); assert!(app.ask.as_ref().unwrap().follow_tail); } @@ -917,18 +955,12 @@ fn page_up_rewinds_scroll_saturating_and_freezes_follow_tail_in_ask() { app.mode = kebab_tui::Mode::Normal; app.ask.as_mut().unwrap().scroll = 25; app.ask.as_mut().unwrap().follow_tail = true; - handle_key_ask( - &mut app, - KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE), - ); + handle_key_ask(&mut app, KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE)); let s = app.ask.as_ref().unwrap(); assert_eq!(s.scroll, 15); assert!(!s.follow_tail); app.ask.as_mut().unwrap().scroll = 3; - handle_key_ask( 
- &mut app, - KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE), - ); + handle_key_ask(&mut app, KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE)); assert_eq!(app.ask.as_ref().unwrap().scroll, 0); } @@ -1173,7 +1205,6 @@ fn ask_state_multi_hop_field_default_false_and_round_trips() { assert!(!s.multi_hop, "settable back to false"); } - /// Small render helper shared with the rest of the test module's /// buffer-snapshot pattern. We define it locally here to avoid /// reaching into private internals. diff --git a/crates/kebab-tui/tests/cheatsheet.rs b/crates/kebab-tui/tests/cheatsheet.rs index 94edcf0..1d1a0bb 100644 --- a/crates/kebab-tui/tests/cheatsheet.rs +++ b/crates/kebab-tui/tests/cheatsheet.rs @@ -23,16 +23,10 @@ fn fresh_app(focus: Pane) -> App { fn f1_toggles_cheatsheet_visibility() { let mut app = fresh_app(Pane::Library); assert!(!app.cheatsheet_visible(), "starts hidden"); - let consumed = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE), - ); + let consumed = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE)); assert!(consumed, "F1 must be consumed"); assert!(app.cheatsheet_visible(), "F1 opens"); - let consumed = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE), - ); + let consumed = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE)); assert!(consumed, "second F1 also consumed"); assert!(!app.cheatsheet_visible(), "F1 closes when open"); } @@ -44,22 +38,13 @@ fn f1_toggles_cheatsheet_visibility() { fn esc_closes_cheatsheet_when_visible_otherwise_falls_through() { let mut app = fresh_app(Pane::Library); // Hidden → Esc falls through. 
- let consumed = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let consumed = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert!(!consumed, "Esc with cheatsheet hidden must fall through"); // Visible → Esc closes + consumed. - let _ = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE), - ); + let _ = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE)); assert!(app.cheatsheet_visible()); - let consumed = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let consumed = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert!(consumed, "Esc with cheatsheet visible must consume"); assert!(!app.cheatsheet_visible()); } @@ -76,10 +61,7 @@ fn modifier_keys_do_not_toggle_cheatsheet() { assert!(!consumed); assert!(!app.cheatsheet_visible()); - let consumed = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::ALT), - ); + let consumed = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::ALT)); assert!(!consumed); assert!(!app.cheatsheet_visible()); } @@ -90,10 +72,7 @@ fn modifier_keys_do_not_toggle_cheatsheet() { #[test] fn arbitrary_key_falls_through_when_cheatsheet_visible() { let mut app = fresh_app(Pane::Library); - let _ = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE), - ); + let _ = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE)); assert!(app.cheatsheet_visible()); for key in [ KeyEvent::new(KeyCode::Char('j'), KeyModifiers::NONE), @@ -116,10 +95,7 @@ fn cheatsheet_popup_contains_global_and_pane_sections() { let mut app = fresh_app(Pane::Search); app.focus = Pane::Search; // Force visible — we're testing the renderer, not the toggle. 
- let _ = cheatsheet_intercept( - &mut app, - KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE), - ); + let _ = cheatsheet_intercept(&mut app, KeyEvent::new(KeyCode::F(1), KeyModifiers::NONE)); let backend = TestBackend::new(120, 40); let mut terminal = Terminal::new(backend).unwrap(); terminal @@ -138,7 +114,10 @@ fn cheatsheet_popup_contains_global_and_pane_sections() { .collect::>() .join("\n"); assert!(rendered.contains("Global"), "Global section header present"); - assert!(rendered.contains("Library"), "Library section header present"); + assert!( + rendered.contains("Library"), + "Library section header present" + ); assert!(rendered.contains("Search"), "Search section header present"); assert!(rendered.contains("Ask"), "Ask section header present"); assert!(rendered.contains("F1"), "F1 binding listed"); @@ -149,7 +128,9 @@ fn cheatsheet_popup_contains_global_and_pane_sections() { // the Inspect assertion when the body overflows; the rest of // the section-header asserts still cover the primary contract. if !rendered.contains("Inspect") { - eprintln!("[note] Inspect section overflowed popup body — known limitation per p9-fb-21 HOTFIXES"); + eprintln!( + "[note] Inspect section overflowed popup body — known limitation per p9-fb-21 HOTFIXES" + ); } // The "currently focused: " line lives at the bottom of // the popup; it might get clipped if the popup's content @@ -158,6 +139,8 @@ fn cheatsheet_popup_contains_global_and_pane_sections() { // the primary contract. 
let has_focused = rendered.contains("focused"); if !has_focused { - eprintln!("[note] 'focused' line absent — likely body overflowed popup height; sections still pinned"); + eprintln!( + "[note] 'focused' line absent — likely body overflowed popup height; sections still pinned" + ); } } diff --git a/crates/kebab-tui/tests/inspect.rs b/crates/kebab-tui/tests/inspect.rs index 039fa53..842c512 100644 --- a/crates/kebab-tui/tests/inspect.rs +++ b/crates/kebab-tui/tests/inspect.rs @@ -8,9 +8,8 @@ use crossterm::event::{KeyCode, KeyEvent, KeyModifiers}; use kebab_config::Config; use kebab_core::{ AssetId, Block, BlockId, CanonicalDocument, Chunk, ChunkId, ChunkerVersion, CommonBlock, - DocumentId, HeadingBlock, Inline, Lang, Metadata, ParserVersion, Provenance, - ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TextBlock, TrustLevel, - WorkspacePath, + DocumentId, HeadingBlock, Inline, Lang, Metadata, ParserVersion, Provenance, ProvenanceEvent, + ProvenanceKind, SourceSpan, SourceType, TextBlock, TrustLevel, WorkspacePath, }; use kebab_tui::{ App, InspectState, InspectTarget, KeyOutcome, Pane, handle_key_inspect, render_inspect, @@ -61,7 +60,10 @@ fn make_doc() -> CanonicalDocument { }), ]; let mut user = serde_json::Map::new(); - user.insert("custom_key".into(), serde_json::Value::String("custom_val".into())); + user.insert( + "custom_key".into(), + serde_json::Value::String("custom_val".into()), + ); CanonicalDocument { doc_id, @@ -141,10 +143,7 @@ fn esc_returns_to_recorded_pane() { let s = app.inspect.as_mut().unwrap(); s.return_to = Pane::Search; } - let outcome = handle_key_inspect( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let outcome = handle_key_inspect(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::SwitchPane(Pane::Search)); } @@ -200,16 +199,10 @@ fn page_down_scrolls_by_ten_in_inspect() { fn page_up_rewinds_by_ten_saturating_in_inspect() { let mut app = fresh_app(); 
app.inspect.as_mut().unwrap().scroll = 25; - handle_key_inspect( - &mut app, - KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE), - ); + handle_key_inspect(&mut app, KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE)); assert_eq!(app.inspect.as_ref().unwrap().scroll, 15); app.inspect.as_mut().unwrap().scroll = 3; - handle_key_inspect( - &mut app, - KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE), - ); + handle_key_inspect(&mut app, KeyEvent::new(KeyCode::PageUp, KeyModifiers::NONE)); assert_eq!(app.inspect.as_ref().unwrap().scroll, 0); } @@ -273,7 +266,10 @@ fn doc_view_renders_header_and_metadata() { rendered.contains("custom_key") || rendered.contains("custom_val"), "user metadata pretty-printed" ); - assert!(rendered.contains("provenance"), "provenance section visible"); + assert!( + rendered.contains("provenance"), + "provenance section visible" + ); assert!(rendered.contains("kb-source-fs"), "agent rendered"); assert!(rendered.contains("blocks"), "blocks section visible"); assert!(rendered.contains("Heading L1"), "block describe rendered"); @@ -319,10 +315,16 @@ fn chunk_view_renders_text_and_block_ids() { s.chunk = Some(make_chunk()); } let rendered = render_to_string(&app, 100, 40); - assert!(rendered.contains("md-heading-v1"), "chunker_version rendered"); + assert!( + rendered.contains("md-heading-v1"), + "chunker_version rendered" + ); assert!(rendered.contains("Top / Sub"), "heading_path joined"); assert!(rendered.contains("Line 1-5"), "source span described"); - assert!(rendered.contains("chunk body line one"), "text body rendered"); + assert!( + rendered.contains("chunk body line one"), + "text body rendered" + ); assert!( rendered.contains("embeddings (2)"), "block_id count rendered inline on embeddings header" @@ -343,8 +345,7 @@ fn inspect_doc_header_shows_stale_badge_when_threshold_exceeded() { s.target = Some(InspectTarget::Doc(DocumentId("d".repeat(32)))); let mut doc = make_doc(); // Backdate updated_at by 60 days so 60d > 30d threshold. 
- doc.metadata.updated_at = - OffsetDateTime::now_utc() - time::Duration::days(60); + doc.metadata.updated_at = OffsetDateTime::now_utc() - time::Duration::days(60); s.doc = Some(doc); } let rendered = render_to_string(&app, 100, 40); @@ -372,8 +373,7 @@ fn inspect_doc_header_omits_stale_badge_when_fresh() { s.target = Some(InspectTarget::Doc(DocumentId("d".repeat(32)))); let mut doc = make_doc(); // 1 day old — under the 30d threshold. - doc.metadata.updated_at = - OffsetDateTime::now_utc() - time::Duration::days(1); + doc.metadata.updated_at = OffsetDateTime::now_utc() - time::Duration::days(1); s.doc = Some(doc); } let rendered = render_to_string(&app, 100, 40); @@ -393,8 +393,7 @@ fn inspect_doc_header_omits_stale_badge_when_threshold_zero() { s.target = Some(InspectTarget::Doc(DocumentId("d".repeat(32)))); let mut doc = make_doc(); // Even a year-old doc must not get [STALE] when threshold = 0. - doc.metadata.updated_at = - OffsetDateTime::now_utc() - time::Duration::days(365); + doc.metadata.updated_at = OffsetDateTime::now_utc() - time::Duration::days(365); s.doc = Some(doc); } let rendered = render_to_string(&app, 100, 40); @@ -410,10 +409,7 @@ fn no_inspect_state_returns_to_library() { config.storage.data_dir = "/tmp/kebab-tui-inspect-tests-noop".into(); let mut app = App::new(config).unwrap(); app.focus = Pane::Inspect; - let outcome = handle_key_inspect( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let outcome = handle_key_inspect(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::SwitchPane(Pane::Library)); } diff --git a/crates/kebab-tui/tests/library.rs b/crates/kebab-tui/tests/library.rs index 44f0dba..c94c09d 100644 --- a/crates/kebab-tui/tests/library.rs +++ b/crates/kebab-tui/tests/library.rs @@ -18,7 +18,12 @@ use time::OffsetDateTime; fn make_doc(path: &str, title: &str, tags: Vec<&str>) -> DocSummary { DocSummary { - doc_id: DocumentId(format!("{:0<32}", path.chars().filter(|c| 
c.is_alphanumeric()).collect::())), + doc_id: DocumentId(format!( + "{:0<32}", + path.chars() + .filter(|c| c.is_alphanumeric()) + .collect::() + )), doc_path: WorkspacePath::new(path.into()).unwrap(), title: title.into(), lang: Lang("en".into()), @@ -88,10 +93,8 @@ fn handle_key_library_q_quits() { #[test] fn handle_key_library_esc_quits_when_no_overlay() { let mut app = app_with_docs(vec![]); - let outcome = kebab_tui::handle_key_library( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let outcome = + kebab_tui::handle_key_library(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::Quit); } @@ -118,10 +121,8 @@ fn handle_key_library_question_switches_to_ask() { #[test] fn handle_key_library_enter_does_not_switch_when_empty() { let mut app = app_with_docs(vec![]); - let outcome = kebab_tui::handle_key_library( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = + kebab_tui::handle_key_library(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::Continue); } @@ -185,10 +186,8 @@ fn handle_key_library_arrow_down_moves_selection() { #[test] fn handle_key_library_enter_inspects_when_docs_present() { let mut app = app_with_docs(vec![make_doc("a.md", "A", vec![])]); - let outcome = kebab_tui::handle_key_library( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = + kebab_tui::handle_key_library(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::SwitchPane(Pane::Inspect)); } @@ -209,10 +208,8 @@ fn handle_key_library_f_opens_filter_overlay_then_enter_refreshes() { ); } // Enter commits + refreshes. 
- let o2 = kebab_tui::handle_key_library( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let o2 = + kebab_tui::handle_key_library(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(o2, KeyOutcome::Refresh); } @@ -235,10 +232,8 @@ fn filter_overlay_accepts_hangul_tags() { ); } // Enter commits. - let o2 = kebab_tui::handle_key_library( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let o2 = + kebab_tui::handle_key_library(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(o2, KeyOutcome::Refresh); // The library filter should now contain "한글" as a tag. let filter = app.library_filter_for_testing(); @@ -272,9 +267,9 @@ fn filter_overlay_render_places_cursor_on_focused_field() { // After draw, ratatui calls backend.set_cursor_position when the // frame's cursor_position is Some. The terminal's // get_cursor_position proxies to the backend. - let pos = terminal.get_cursor_position().expect( - "filter overlay must call set_cursor_position, so cursor pos must be readable", - ); + let pos = terminal + .get_cursor_position() + .expect("filter overlay must call set_cursor_position, so cursor pos must be readable"); // The Tags label ("tags_any (csv): ") has display_width 16; inner.x // is 1 (inside border). With empty input cursor_col=0, expected x=17. 
// We assert x>0 to avoid hardcoding the exact layout geometry while @@ -325,7 +320,10 @@ fn library_renders_column_header_row() { .lines() .position(|line| line.contains("TITLE")) .expect("TITLE header should be present"); - let lines_after = rendered.lines().skip(title_line_idx + 1).collect::>(); + let lines_after = rendered + .lines() + .skip(title_line_idx + 1) + .collect::>(); assert!( lines_after.iter().any(|line| line.contains("doc-")), "no data rows after header:\n{rendered}" @@ -338,7 +336,11 @@ fn library_renders_column_header_row() { #[test] fn library_renders_korean_titles_without_overflow() { let docs = vec![ - make_doc("ko/한글-노트.md", "러스트로 만드는 지식 베이스", vec!["rust", "한글"]), + make_doc( + "ko/한글-노트.md", + "러스트로 만드는 지식 베이스", + vec!["rust", "한글"], + ), make_doc("jp/漢字メモ.md", "日本語のテストドキュメント", vec!["jp"]), make_doc("mix/hello-세계.md", "Hello, 세계 mixed title", vec!["mix"]), ]; diff --git a/crates/kebab-tui/tests/mode.rs b/crates/kebab-tui/tests/mode.rs index 90fd1e8..13bbe3e 100644 --- a/crates/kebab-tui/tests/mode.rs +++ b/crates/kebab-tui/tests/mode.rs @@ -24,12 +24,13 @@ fn esc_in_insert_flips_to_normal_and_consumes() { for &pane in &[Pane::Library, Pane::Search, Pane::Ask, Pane::Inspect] { let mut app = fresh_app(pane); app.mode = Mode::Insert; - let consumed = mode_intercept( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let consumed = mode_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert!(consumed, "Esc in Insert must be consumed (pane: {pane:?})"); - assert_eq!(app.mode, Mode::Normal, "mode flipped to Normal (pane: {pane:?})"); + assert_eq!( + app.mode, + Mode::Normal, + "mode flipped to Normal (pane: {pane:?})" + ); } } @@ -40,10 +41,7 @@ fn esc_in_insert_flips_to_normal_and_consumes() { fn esc_in_normal_mode_falls_through() { let mut app = fresh_app(Pane::Library); assert_eq!(app.mode, Mode::Normal); - let consumed = mode_intercept( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); 
+ let consumed = mode_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert!(!consumed, "Esc in Normal must fall through to pane"); assert_eq!(app.mode, Mode::Normal, "mode unchanged"); } @@ -55,13 +53,21 @@ fn esc_in_normal_mode_falls_through() { fn i_in_normal_on_library_inspect_jobs_flips_to_insert() { for &pane in &[Pane::Library, Pane::Inspect, Pane::Jobs] { let mut app = fresh_app(pane); - assert_eq!(app.mode, Mode::Normal, "auto_for({pane:?}) should be Normal"); + assert_eq!( + app.mode, + Mode::Normal, + "auto_for({pane:?}) should be Normal" + ); let consumed = mode_intercept( &mut app, KeyEvent::new(KeyCode::Char('i'), KeyModifiers::NONE), ); assert!(consumed, "i in Normal on {pane:?} must be consumed"); - assert_eq!(app.mode, Mode::Insert, "mode flipped to Insert (pane: {pane:?})"); + assert_eq!( + app.mode, + Mode::Insert, + "mode flipped to Insert (pane: {pane:?})" + ); } } @@ -72,7 +78,11 @@ fn i_in_normal_on_library_inspect_jobs_flips_to_insert() { fn i_on_search_or_ask_in_insert_falls_through_to_pane() { for &pane in &[Pane::Search, Pane::Ask] { let mut app = fresh_app(pane); - assert_eq!(app.mode, Mode::Insert, "auto_for({pane:?}) should be Insert"); + assert_eq!( + app.mode, + Mode::Insert, + "auto_for({pane:?}) should be Insert" + ); let consumed = mode_intercept( &mut app, KeyEvent::new(KeyCode::Char('i'), KeyModifiers::NONE), @@ -96,7 +106,11 @@ fn i_on_search_or_ask_in_normal_flips_to_insert() { KeyEvent::new(KeyCode::Char('i'), KeyModifiers::NONE), ); assert!(consumed, "i on {pane:?}/Normal must intercept (p9-fb-21)"); - assert_eq!(app.mode, Mode::Insert, "mode flipped to Insert (pane: {pane:?})"); + assert_eq!( + app.mode, + Mode::Insert, + "mode flipped to Insert (pane: {pane:?})" + ); } } @@ -107,10 +121,7 @@ fn i_on_search_or_ask_in_normal_flips_to_insert() { fn modifier_keys_do_not_trigger_intercept() { let mut app = fresh_app(Pane::Library); app.mode = Mode::Insert; - let consumed = mode_intercept( - &mut app, - 
KeyEvent::new(KeyCode::Esc, KeyModifiers::CONTROL), - ); + let consumed = mode_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::CONTROL)); assert!(!consumed, "Ctrl+Esc must fall through"); assert_eq!(app.mode, Mode::Insert, "mode unchanged"); @@ -136,16 +147,19 @@ fn shift_modifier_passes_modifier_filter() { // 'i', so it falls through. Both are intentional.) let mut app = fresh_app(Pane::Library); app.mode = Mode::Insert; - let consumed = mode_intercept( - &mut app, - KeyEvent::new(KeyCode::Esc, KeyModifiers::SHIFT), + let consumed = mode_intercept(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::SHIFT)); + assert!( + consumed, + "Shift+Esc still toggles (modifier filter allows SHIFT)" ); - assert!(consumed, "Shift+Esc still toggles (modifier filter allows SHIFT)"); let mut app = fresh_app(Pane::Library); let consumed = mode_intercept( &mut app, KeyEvent::new(KeyCode::Char('I'), KeyModifiers::SHIFT), ); - assert!(!consumed, "Shift+I (capital) falls through — only lowercase 'i' toggles"); + assert!( + !consumed, + "Shift+I (capital) falls through — only lowercase 'i' toggles" + ); } diff --git a/crates/kebab-tui/tests/search.rs b/crates/kebab-tui/tests/search.rs index 4e7fc29..e3e8b9a 100644 --- a/crates/kebab-tui/tests/search.rs +++ b/crates/kebab-tui/tests/search.rs @@ -3,8 +3,8 @@ use crossterm::event::{KeyCode, KeyEvent, KeyModifiers}; use kebab_config::Config; use kebab_core::{ - Citation, ChunkId, ChunkerVersion, DocumentId, EmbeddingModelId, IndexVersion, - RetrievalDetail, SearchHit, SearchMode, WorkspacePath, + ChunkId, ChunkerVersion, Citation, DocumentId, EmbeddingModelId, IndexVersion, RetrievalDetail, + SearchHit, SearchMode, WorkspacePath, }; use kebab_tui::{ App, KeyOutcome, Mode, Pane, SearchState, SearchWorkerMessage, build_jump_command, @@ -73,10 +73,7 @@ fn line_citation(path: &str, line: u32) -> Citation { #[test] fn esc_returns_to_library() { let mut app = fresh_app(); - let outcome = handle_key_search( - &mut app, - 
KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE), - ); + let outcome = handle_key_search(&mut app, KeyEvent::new(KeyCode::Esc, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::SwitchPane(Pane::Library)); } @@ -134,20 +131,14 @@ fn enter_with_query_emits_refresh() { let s = app.search.as_mut().unwrap(); s.input.push_str("rust"); } - let outcome = handle_key_search( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = handle_key_search(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::Refresh); } #[test] fn enter_with_empty_query_is_continue() { let mut app = fresh_app(); - let outcome = handle_key_search( - &mut app, - KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE), - ); + let outcome = handle_key_search(&mut app, KeyEvent::new(KeyCode::Enter, KeyModifiers::NONE)); assert_eq!(outcome, KeyOutcome::Continue); } @@ -193,19 +184,23 @@ fn j_k_move_selection_within_bounds() { #[test] fn build_jump_command_line_uses_plus_n_for_vim() { let citation = line_citation("notes/foo.md", 42); - let (program, args) = - build_jump_command(&citation, "vim", Path::new("/tmp/workspace")); + let (program, args) = build_jump_command(&citation, "vim", Path::new("/tmp/workspace")); assert_eq!(program, "vim"); - assert_eq!(args, vec!["+42".to_string(), "/tmp/workspace/notes/foo.md".into()]); + assert_eq!( + args, + vec!["+42".to_string(), "/tmp/workspace/notes/foo.md".into()] + ); } #[test] fn build_jump_command_line_uses_g_flag_for_code() { let citation = line_citation("notes/foo.md", 42); - let (program, args) = - build_jump_command(&citation, "code", Path::new("/tmp/workspace")); + let (program, args) = build_jump_command(&citation, "code", Path::new("/tmp/workspace")); assert_eq!(program, "code"); - assert_eq!(args, vec!["-g".to_string(), "/tmp/workspace/notes/foo.md:42".into()]); + assert_eq!( + args, + vec!["-g".to_string(), "/tmp/workspace/notes/foo.md:42".into()] + ); } #[test] @@ -227,8 +222,18 @@ fn 
render_search_with_hits_shows_input_and_path() { s.input.push_str("rust traits"); s.mode = SearchMode::Hybrid; s.hits = vec![ - make_hit(1, "notes/rust.md", "trait dispatch\nis dynamic", line_citation("notes/rust.md", 12)), - make_hit(2, "notes/dyn.md", "dynamic dispatch\nvtable", line_citation("notes/dyn.md", 3)), + make_hit( + 1, + "notes/rust.md", + "trait dispatch\nis dynamic", + line_citation("notes/rust.md", 12), + ), + make_hit( + 2, + "notes/dyn.md", + "dynamic dispatch\nvtable", + line_citation("notes/dyn.md", 3), + ), ]; s.selected_hit = 0; } @@ -249,10 +254,19 @@ fn render_search_with_hits_shows_input_and_path() { }) .collect::>() .join("\n"); - assert!(rendered.contains("hybrid"), "mode badge rendered: {rendered}"); + assert!( + rendered.contains("hybrid"), + "mode badge rendered: {rendered}" + ); assert!(rendered.contains("rust traits"), "input text rendered"); - assert!(rendered.contains("notes/rust.md"), "first hit path rendered"); - assert!(rendered.contains("notes/dyn.md"), "second hit path rendered"); + assert!( + rendered.contains("notes/rust.md"), + "first hit path rendered" + ); + assert!( + rendered.contains("notes/dyn.md"), + "second hit path rendered" + ); } /// p9-fb-32: Search pane prefixes the rank/score header line with a @@ -454,7 +468,12 @@ fn g_key_enqueues_pending_editor_request() { app.mode = kebab_tui::Mode::Normal; { let s = app.search.as_mut().unwrap(); - s.hits = vec![make_hit(1, "notes/x.md", "snippet", line_citation("notes/x.md", 42))]; + s.hits = vec![make_hit( + 1, + "notes/x.md", + "snippet", + line_citation("notes/x.md", 42), + )]; s.selected_hit = 0; } assert!(app.pending_editor().is_none(), "queue starts empty"); @@ -570,15 +589,18 @@ fn poll_worker_noop_when_no_rx() { /// Helper for the debounce_due tests — build a state with the four /// fields the test cares about set, others default. 
#[allow(clippy::field_reassign_with_default)] -fn search_state_with(input: &str, mode: SearchMode, searching: bool, last_query: Option<(String, SearchMode)>) -> SearchState { +fn search_state_with( + input: &str, + mode: SearchMode, + searching: bool, + last_query: Option<(String, SearchMode)>, +) -> SearchState { let mut s = SearchState::default(); s.input.push_str(input); s.mode = mode; s.searching = searching; s.last_query = last_query; - s.input_dirty_at = Some( - time::OffsetDateTime::now_utc() - time::Duration::seconds(1), - ); + s.input_dirty_at = Some(time::OffsetDateTime::now_utc() - time::Duration::seconds(1)); s } @@ -683,12 +705,7 @@ fn o_in_normal_with_hits_enters_inspect() { app.focus = Pane::Search; app.mode = Mode::Normal; let s = app.search.as_mut().unwrap(); - s.hits = vec![make_hit( - 1, - "a.md", - "snippet", - line_citation("a.md", 1), - )]; + s.hits = vec![make_hit(1, "a.md", "snippet", line_citation("a.md", 1))]; s.selected_hit = 0; let outcome = kebab_tui::handle_key_search( &mut app, diff --git a/docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md b/docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md new file mode 100644 index 0000000..3809d19 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-v0.20-ingest-log-plan.md @@ -0,0 +1,616 @@ +--- +title: "v0.20.x ingest log feature — plan" +date: 2026-05-28 +status: "DRAFT (round 0)" +phase: B4 (plan drafter) +target_spec: ../specs/2026-05-28-v0.20-ingest-log-spec.md +parent_task: ../../../tasks/p10/p10-1A-5-ingest-failure-log.md +plan_for_version: 0.20.x +target_branch: feat/pdf-scanned-ocr +step_count: 6 +commit_count: 5 +estimated_loc_delta: "+650 / -25" +--- + +# v0.20.x ingest log feature — plan + +## §0 Overview + +본 plan 은 spec ACCEPT (`docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md`, 491 line) 의 6 step / 5 commit decomposition. spec §5 AC-1 ~ AC-10 의 acceptance criteria 를 step boundary 마다 verifier로 매핑. + +**핵심 deliverable**: + +1. 
`kebab-config` 의 `[logging]` section (2 field: `ingest_log_enabled` / `ingest_log_dir`). +2. `kebab-app/src/ingest_log.rs` 신규 module (`IngestLogWriter` + `LogEvent` enum, 5 kind). +3. `PdfOcrProgress::Finished` + `IngestEvent::PdfOcrFinished` 의 **4 additive field** (`image_byte_size` / `image_width` / `image_height` / `failure_reason`) → wire schema `ingest_progress.v1` additive minor cascade. +4. `kebab-app` 의 5 emit hook integration (init / flush / OCR / parse_error+skip / fatal error). +5. integration test `ingest_log_smoke.rs` (AC-9). +6. workspace test + clippy + dogfood smoke (AC-8). + +**작업 분량 estimate**: +650 LOC (300 신규 module, 250 hook+config, 100 test) / -25 LOC (PdfOcrProgress callsite refactor). branch 변경 없음, doc-only commit 0. + +**Spec-driven invariant**: + +- **wire schema** = additive minor (4 optional field 추가, `required` 변경 없음, 기존 consumer regression 0). +- **backward compat** = `#[serde(default)]` 로 pre-v0.20 config 자동 init (AC-10). +- **subagent skip** = direct in-session execution (worker protocol). 
+ +--- + +## §1 Step table + +| # | Step | Files (primary) | Commit (after step) | AC covered | +|---|------|-----------------|---------------------|------------| +| 1 | LoggingCfg + Config integration | `crates/kebab-config/src/lib.rs`, `crates/kebab-config/tests/*.rs` | `feat(config): add [logging] section (ingest_log_enabled + ingest_log_dir)` | AC-1, AC-10 | +| 2 | IngestLogWriter module + LogEvent enum | `crates/kebab-app/src/ingest_log.rs` (new), `crates/kebab-app/src/lib.rs` (mod 선언) | `feat(app): IngestLogWriter + LogEvent enum (per-ingest-run ndjson log)` | AC-3 (struct) | +| 3 | PdfOcrProgress::Finished extend + wire cascade | `crates/kebab-app/src/pdf_ocr_apply.rs`, `crates/kebab-app/src/ingest_progress.rs`, `docs/wire-schema/v1/ingest_progress.schema.json`, `integrations/claude-code/kebab/SKILL.md` | `feat(wire): PdfOcrProgress.Finished + ingest_progress.v1 additive 4 fields (image_byte_size/width/height + failure_reason)` | AC-3 (ocr fields), AC-5 (failure_reason carry) | +| 4 | 5 emit hook integration | `crates/kebab-app/src/lib.rs` (Hook 1/3/5), `crates/kebab-app/src/pdf_ocr_apply.rs` (Hook 2 metric capture), `crates/kebab-source-fs/src/connector.rs` (Hook 4 skip emit) | `feat(app): wire IngestLogWriter into 5 ingest emit hooks (Arc sync)` | AC-2, AC-4, AC-5, AC-6, AC-7 | +| 5 | Integration test (ingest_log_smoke) | `crates/kebab-app/tests/ingest_log_smoke.rs` (new) | `test(app): ingest_log_smoke integration test (AC-9)` | AC-9 | +| 6 | Final sanity (workspace test + clippy + optional dogfood) | n/a (verifier only) | no commit | AC-8 | + +5 commit 단위, 6 step 단위. Step 6 는 verifier-only (no commit), 누적 regression 확인용. + +--- + +## §2 Per-step detail + +### §2.1 Step 1 — LoggingCfg + Config integration + +**Goal**: spec §3.1 + §4.4 — `LoggingCfg` struct + Config field + backward compat. 
+ +#### §2.1.1 Files affected + +| Path | Action | Approx LOC | Notes | +|------|--------|------------|-------| +| `crates/kebab-config/src/lib.rs` | edit | +55 / -0 | Config struct (line 37+), 새 LoggingCfg struct + Default + default fns | +| `crates/kebab-config/tests/integration.rs` (또는 신규 `tests/logging_roundtrip.rs`) | edit / new | +35 | TOML roundtrip 1 test (default load + override load + pre-v0.20 backward compat) | + +기존 file `crates/kebab-config/src/lib.rs` 의 line range: + +- line 37–81: `Config` struct (현재 `pdf` field 가 line 62–66 위치). 신규 `logging` field 는 `pdf` 다음 line 67 부근 삽입. +- 신규 `LoggingCfg` struct + `default_ingest_log_*` fn 는 line 416 부근 (`PdfCfg::defaults` 다음) 또는 file 끝부근의 cfg-grouping spot 에 추가. 위치 선택은 executor 재량 — 기존 cfg struct (NliCfg/PdfOcrCfg/PdfCfg) 와 같은 visual layout 유지. + +#### §2.1.2 Action diff outline + +```rust +// crates/kebab-config/src/lib.rs (line 37+, Config struct) +pub struct Config { + // ... existing fields ... + #[serde(default = "PdfCfg::defaults")] + pub pdf: PdfCfg, + /// v0.20.x sub-item: ingest log surface. `#[serde(default)]` 라 + /// pre-v0.20 config (`[logging]` section 부재) 가 default 로 init. + #[serde(default)] + pub logging: LoggingCfg, + #[serde(skip)] + pub(crate) source_dir: Option, +} + +// 신규 struct (cfg grouping spot) +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct LoggingCfg { + /// ingest 시 structured ndjson log auto-write. default = true. + /// false 시 log file 생성 0 (AC-6). + #[serde(default = "default_ingest_log_enabled")] + pub ingest_log_enabled: bool, + + /// per-ingest-run log file directory. default = `{state_dir}/logs`. + /// `{state_dir}` placeholder = XDG state dir (e.g. `~/.local/state/kebab`). + /// Log file 누적 disk usage 는 user-managed (rotation policy 미제공 — spec §6 R-1). 
+ #[serde(default = "default_ingest_log_dir")] + pub ingest_log_dir: PathBuf, +} + +fn default_ingest_log_enabled() -> bool { true } +fn default_ingest_log_dir() -> PathBuf { + PathBuf::from("{state_dir}/logs") +} + +impl Default for LoggingCfg { + fn default() -> Self { + Self { + ingest_log_enabled: default_ingest_log_enabled(), + ingest_log_dir: default_ingest_log_dir(), + } + } +} +``` + +테스트 추가 (`crates/kebab-config/tests/logging_roundtrip.rs`): + +```rust +// 1. default Config 의 logging section round-trip (TOML → Config → TOML). +// 2. `[logging]\ningest_log_enabled = false\ningest_log_dir = "/tmp/x"` override 가 +// deserialize 시 정확히 반영되는지 verify. +// 3. pre-v0.20 fixture (entire config 에 [logging] 없는) 가 default LoggingCfg 로 init (AC-10). +``` + +#### §2.1.3 Acceptance + +- `cargo test -p kebab-config -j 4` 전수 pass + 새 test 1 pass. +- `cargo build -p kebab-config -j 4` clean. +- `cargo clippy -p kebab-config -- -D warnings` 0 warning. + +#### §2.1.4 Commit + +``` +feat(config): add [logging] section (ingest_log_enabled + ingest_log_dir) + +v0.20.x ingest log surface 의 config side. `LoggingCfg` struct 신설: + * ingest_log_enabled (bool, default true) + * ingest_log_dir (PathBuf, default "{state_dir}/logs") + +#[serde(default)] tag 로 pre-v0.20 config 가 [logging] section 부재 +시 LoggingCfg::default() 자동 init (AC-10 backward compat). + +`{state_dir}` placeholder 의 실제 expand 는 step 2 (IngestLogWriter) +의 expand_log_dir helper 가 담당 (kebab-config 의 expand_path_with_base +는 `{state_dir}` 미지원, spec §6 R-3). +``` + +--- + +### §2.2 Step 2 — IngestLogWriter module + LogEvent enum + +**Goal**: spec §4.1 — 새 module `crates/kebab-app/src/ingest_log.rs` 의 `IngestLogWriter` + `LogEvent` enum + `IngestSummary` struct + run_id generation + path expansion. *callsite wiring 은 step 4*. 본 step 은 self-contained 한 writer + unit test 만. 
+ +#### §2.2.1 Files affected + +| Path | Action | Approx LOC | Notes | +|------|--------|------------|-------| +| `crates/kebab-app/src/ingest_log.rs` | new | +280 | writer struct + LogEvent enum + IngestSummary + Drop impl + 5 unit test | +| `crates/kebab-app/src/lib.rs` | edit | +3 | `mod ingest_log;` 선언 (line 63 부근, `pub mod ingest_progress;` 다음). `pub use ingest_log::{IngestLogWriter, LogEvent, IngestSummary};` | + +#### §2.2.2 Action diff outline + +`crates/kebab-app/src/ingest_log.rs` (신규 module body): + +- module doc (5 line): writer 역할 + run_id 형식 + emit ordering 명문. +- imports: `std::fs::File`, `std::io::{BufWriter,Write}`, `std::path::{Path,PathBuf}`, `std::time::SystemTime`, `serde::{Serialize,Deserialize}`, `time::OffsetDateTime`, `time::format_description::well_known::Rfc3339`, `uuid::Uuid`. +- **`pub struct IngestLogWriter`** (`file: BufWriter<File>`, `path: PathBuf`, `run_id: String`, `started_at: SystemTime`). + - `pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>>` — `cfg.ingest_log_enabled == false` 시 `Ok(None)`, true 시 log_dir 생성 + file create + run_id 발급. open 실패는 `Err` 반환 (caller 가 swallow + tracing::warn). + - `pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()>` — serde_json::to_writer + writeln. + - `pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()>` — 동일 pattern. + - `pub fn flush(&mut self) -> anyhow::Result<()>`. + - getters: `run_id() / path() / started_at()`. +- **`impl Drop`** — best-effort `self.file.flush()` (spec §6 R-4 panic unwind path). +- **`fn generate_run_id() -> String`** — `OffsetDateTime::now_utc().format(time::macros::format_description!("[year][month][day]T[hour][minute][second]Z"))` 의 ISO 8601 compact prefix + `Uuid::now_v7().simple().to_string()` 의 마지막 8 hex char. `rand` 추가 0 (spec §6 R-5). +- **`fn expand_log_dir(path: &Path) -> PathBuf`** — string-replace `{state_dir}` → `kebab_config::Config::xdg_state_dir()`. tilde/env 는 `kebab_config::expand_path` 위임. 
+- **`pub(crate) fn now_ts() -> String`** — Rfc3339 formatted UTC. step 4 의 hook 들이 호출. +- **`pub enum LogEvent<'a>`** — `#[serde(tag="kind", rename_all="snake_case")]`. 4 variant: + - `Ocr { ts, doc_path, page, image_byte_size: Option, image_width: Option, image_height: Option, ms, chars, success, reason: Option<&'a str>, ocr_engine }`. + - `ParseError { ts, doc_path, reason, message }`. + - `Skip { ts, doc_path, reason, detail: Option<&'a str> }`. + - `Error { ts, code, message }`. +- **`pub struct IngestSummary`** — owned fields (`ts: String`, `run_id: String`, `scanned/new/errors/ocr_pages/ocr_failures: u32`, `ocr_p50_ms/p90_ms/max_ms: Option`, `duration_ms: u64`). `#[serde(tag = "kind", rename = "summary")]` 또는 별도 `kind: &'static str = "summary"` literal field 로 wire-shape 의 `kind: "summary"` 강제. **권장**: 별도 `IngestSummary` enum variant 대신 `tagged-struct` (serde 의 `#[serde(rename = "summary")]` + explicit kind field) — wire output 의 line 단위 JSON 이 항상 `{"kind":"summary",…}` 시작. + +**unit test (5 fn, ingest_log.rs 의 `#[cfg(test)] mod tests`)**: + +1. `generate_run_id_has_iso_prefix_and_8_hex_suffix` — `^\d{8}T\d{6}Z-[0-9a-f]{8}$` regex match. +2. `expand_log_dir_substitutes_state_dir_placeholder` — `"{state_dir}/logs"` → xdg_state_dir + "/logs". +3. `writer_disabled_returns_none` — `LoggingCfg { ingest_log_enabled: false, .. } → IngestLogWriter::open() == Ok(None)`. +4. `writer_writes_one_event_per_line_with_kind_discriminator` — temp file 에 write_event ×3 → 3 line, 각 line 의 첫 char `{`, `"kind":` substring present. +5. `drop_flushes_pending_buffer` — write_event 후 explicit flush 없이 drop, 그 후 read_to_string 으로 line count ≥ 1 verify. + +**OQ-2 (p50 / p90 계산)**: workspace 에 `quantiles` crate 없음, simple sorted Vec 으로 처리. 본 step 의 IngestSummary 는 numeric field 만 제공 — 실제 p50/p90 계산은 step 4 의 emit hook 이 ms accumulator (Vec) 유지 후 final stage 에서 sort + percentile index. 
+ +**OQ-3 (log cleanup policy 명문 위치)**: LoggingCfg::ingest_log_dir 의 doc-comment 에 한 줄 (`Log file 누적 disk usage 는 user-managed`) 으로 충분 — README/SMOKE 변경 없음. step 1 의 commit body 에 한 줄 명문. + +#### §2.2.3 Acceptance + +- `cargo test -p kebab-app --lib ingest_log -j 4` 5 passed. +- `cargo build -p kebab-app -j 4` clean. +- `cargo clippy -p kebab-app -- -D warnings` 0 warning. + +#### §2.2.4 Commit + +``` +feat(app): IngestLogWriter + LogEvent enum (per-ingest-run ndjson log) + +v0.20.x ingest log surface 의 module side. crates/kebab-app/src/ +ingest_log.rs 신규: + * IngestLogWriter — open/write_event/write_summary/flush + Drop flush + * LogEvent enum 4 variant (ocr / parse_error / skip / error) + * IngestSummary struct (kind="summary" literal + 11 stat field) + * generate_run_id (ISO 8601 prefix + uuid v7 마지막 8 hex) + * expand_log_dir ({state_dir} placeholder 의 hand-roll expand) + +uuid v7 = workspace dep (Cargo.toml line 132), rand 신규 의존 회피 +(spec §6 R-5). + +본 step 은 self-contained writer + 5 unit test. ingest pipeline 의 +emit hook 5개 wiring 은 step 4. +``` + +--- + +### §2.3 Step 3 — PdfOcrProgress::Finished extend + wire cascade + +**Goal**: spec §4.2 HIGH-1 + §3.3 ocr fields — `PdfOcrProgress::Finished` 와 `IngestEvent::PdfOcrFinished` 에 4 additive field 추가, wire schema additive minor cascade. 
+ +#### §2.3.1 Files affected + +| Path | Action | Approx LOC | Notes | +|------|--------|------------|-------| +| `crates/kebab-app/src/pdf_ocr_apply.rs` | edit | +25 / -5 | `PdfOcrProgress::Finished` variant 의 4 field 추가, 3 emit_progress callsite (line 145, 173, 247) 의 measurement + emit 갱신 | +| `crates/kebab-app/src/ingest_progress.rs` | edit | +15 / -5 | `IngestEvent::PdfOcrFinished` 의 4 field 추가, 기존 test `ingest_event_serializes_with_discriminator` 류 보존 | +| `crates/kebab-app/src/lib.rs` | edit | +10 / -3 | line 1865–1882 의 `PdfOcrProgress::Finished { … } => IngestEvent::PdfOcrFinished { … }` mapping 의 4 field carry | +| `docs/wire-schema/v1/ingest_progress.schema.json` | edit | +12 | `image_byte_size` / `image_width` / `image_height` / `failure_reason` property 추가 (모두 optional, `required` 변경 없음) | +| `integrations/claude-code/kebab/SKILL.md` | edit | +5 | wire schema description 동기 (추가 optional field 명시) | + +#### §2.3.2 Action diff outline + +`PdfOcrProgress::Finished` (pdf_ocr_apply.rs line 283): + +```rust +Finished { + page: u32, + ms: u64, + chars: u32, + skipped: bool, + // NEW (4 field, optional): + image_byte_size: Option, + image_width: Option, + image_height: Option, + failure_reason: Option, // "timeout" | "ocr_error" | "network_error" | None +}, +``` + +3 emit_progress callsite 갱신 (pdf_ocr_apply.rs line 145 / 173 / 247): + +- **line 145** (success path, OCR 정상 완료): `image_byte_size: Some()`, `image_width: Some()`, `image_height: Some()`, `failure_reason: None`. `` 는 raster image 의 measurement (Bug #11 follow-up 의 측정 spot 재사용 또는 인접 변수). +- **line 173** (engine 실패 → skip): `failure_reason: Some("ocr_error".into())` (or "timeout" if 분류 가능). image metric 은 available 시 emit, unavailable 시 None. +- **line 247** (validation/threshold skip, OCR 미수행): `failure_reason: None`, image metric 가능 시 emit. 
+ +`IngestEvent::PdfOcrFinished` (ingest_progress.rs line 96–102): + +```rust +PdfOcrFinished { + page: u32, + ms: u64, + chars: u32, + ocr_engine: String, + skipped: bool, + // NEW (4 field, optional): + image_byte_size: Option, + image_width: Option, + image_height: Option, + failure_reason: Option, +}, +``` + +`crates/kebab-app/src/lib.rs` line 1865–1882 의 mapping: + +```rust +crate::pdf_ocr_apply::PdfOcrProgress::Finished { + page, ms, chars, skipped, + image_byte_size, image_width, image_height, failure_reason, +} => { + if let Some(sender) = progress { + let _ = sender.send( + crate::ingest_progress::IngestEvent::PdfOcrFinished { + page, ms, chars, + ocr_engine: engine.engine_name().to_string(), + skipped, + image_byte_size, image_width, image_height, + failure_reason: failure_reason.clone(), + }, + ); + } + // step 4 의 Hook 2 가 이 위치에서 추가로 log writer 에 write. +} +``` + +`docs/wire-schema/v1/ingest_progress.schema.json` 의 `properties` (line 8+) 에 추가: + +```jsonc +"image_byte_size": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional): raster image byte size." }, +"image_width": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional): raster image width px." }, +"image_height": { "type": "integer", "minimum": 0, "description": "pdf_ocr_finished (optional): raster image height px." }, +"failure_reason": { "type": "string", "enum": ["timeout", "ocr_error", "network_error", "other"], "description": "pdf_ocr_finished (optional): present iff OCR failed." } +``` + +**중요**: `required` array 는 변경 없음 (현재 `["schema_version", "kind", "ts"]`). 4 field 모두 optional → **additive minor** = backward compat. + +`integrations/claude-code/kebab/SKILL.md` 갱신 — wire schema 의 `pdf_ocr_finished` 설명에 4 추가 field 한 줄 명문 (existing 1 paragraph 다음). + +#### §2.3.3 Acceptance + +- `cargo test -p kebab-app pdf_ocr_apply -j 4` 전수 pass. +- `cargo test -p kebab-app ingest_progress -j 4` 전수 pass. 
+- `cargo test -p kebab-cli wire_search wire_ask -j 4` regression check (기존 PdfOcrFinished consumer 가 4 추가 field 의 `Option::None` 으로도 deserialize 성공). +- `cargo build -p kebab-app -p kebab-cli -j 4` clean. +- wire schema validate: `jq '.properties | keys' docs/wire-schema/v1/ingest_progress.schema.json` 가 4 신규 key 포함, `.required` 변경 없음. + +#### §2.3.4 Commit + +``` +feat(wire): PdfOcrProgress.Finished + ingest_progress.v1 additive 4 fields + +v0.20.x ingest log feature 의 wire side. additive minor cascade: + + * PdfOcrProgress::Finished + IngestEvent::PdfOcrFinished 의 4 field: + - image_byte_size: Option + - image_width: Option + - image_height: Option + - failure_reason: Option + * docs/wire-schema/v1/ingest_progress.schema.json — 4 추가 property + (모두 optional, required 변경 없음 = additive minor) + * integrations/claude-code/kebab/SKILL.md — wire schema description 동기 + +기존 ingest_progress.v1 consumer (CLI wire dump, integration test +fixture, kebab-cli wire_search/wire_ask) 는 4 추가 field 의 +Option::None 으로 backward-compat. version bump 0 (additive minor = +binary-version cascade trigger 아님 per CLAUDE.md §Versioning cascade). +``` + +--- + +### §2.4 Step 4 — 5 emit hook integration (Arc<Mutex<IngestLogWriter>>) + +**Goal**: spec §4.2 — 5 hook 위치에서 IngestLogWriter 호출. ownership = `Option<Arc<Mutex<IngestLogWriter>>>` (binding 은 `ingest_with_config_opts` 에서, 5 hook 이 clone+lock+write). + +#### §2.4.1 Files affected + +| Path | Action | Approx LOC | Hook | +|------|--------|------------|------| +| `crates/kebab-app/src/lib.rs` | edit | +110 / -10 | Hook 1 (init + flush), Hook 3 (parse_error path), Hook 5 (fatal error), summary stage 의 percentile 계산 + write_summary | +| `crates/kebab-app/src/pdf_ocr_apply.rs` | edit | +35 / -3 | Hook 2: image metric capture (existing raster decode spot) + emit_progress 의 4 field carry. 
*signature 변경 없음* — `Finished` field 추가만으로 caller 가 image metric carry 가능 | +| `crates/kebab-source-fs/src/connector.rs` | edit | +25 | Hook 4: scan_with_skips 의 skip event 마다 callback (또는 `FsScanSkips` 의 `events: Vec` accumulator) — kebab-app 이 scan 후 enumerate + write | + +#### §2.4.2 Hook detail + +**Hook 1 — `ingest_with_config_opts` (lib.rs line 281)**: + +function entry 직후 `let log_writer: Option>> = IngestLogWriter::open(&config.logging) | Ok(Some(w)) → Some(Arc::new(Mutex::new(w))) | Ok(None) → None | Err(e) → tracing::warn + None`. function exit (Completed / Aborted 경로 직전) 에서 `summary 계산 + write_summary + flush`. summary 의 `ocr_p50_ms / p90_ms / max_ms` 는 success-only OCR duration accumulator `Vec` 를 `sort_unstable()` 후 `len*50/100` / `len*90/100` index 로 추출, `samples.last()` 로 max. + +**Hook 2 — `apply_ocr_to_pdf_pages` (pdf_ocr_apply.rs) → caller closure in lib.rs line 1855**: + +step 3 에서 `PdfOcrProgress::Finished` 의 4 field 추가됐으므로 본 step 은 closure 의 `Finished arm` 에 한 줄 추가: `log_writer.clone()` 캡처 + lock + `write_event(&LogEvent::Ocr { ts: now_ts(), doc_path, page, image_*, ms, chars, success: !skipped && failure_reason.is_none(), reason: failure_reason.as_deref(), ocr_engine: engine.engine_name() })`. success path 시 `ocr_ms_samples.lock().push(ms)`. + +**ownership note (MEDIUM-1)**: emit_progress 는 `F: FnMut(PdfOcrProgress)` (pdf_ocr_apply.rs line 88) → closure 가 `Arc>` clone 캡처 가능. single-threaded per-asset loop 이므로 deadlock 위험 없음. + +**Hook 3 — `parse_error` (lib.rs `ingest_one_pdf_asset` line 1770 + `ingest_one_code_asset` line 2002 의 parse Err arm)**: + +`kebab_parse_pdf::extract(...)` (또는 code parser) 의 `Err(e)` arm 마다 한 줄: `log_writer.lock().write_event(&LogEvent::ParseError { ts: now_ts(), doc_path: asset.path_str(), reason: classify_parse_error(&e), message: &format!("{e}") })`. 
`classify_parse_error` 는 `kebab_core::Error::PdfFormat → "lopdf_error"`, `Error::ImageFormat → "image_format"`, fallback `"other"` 분류 — pdf_ocr_apply.rs 또는 ingest_log.rs 의 helper. + +**Hook 4 — skip event (kebab-source-fs/src/connector.rs)**: + +current `FsSourceConnector::scan_with_skips` (line 100) 은 skip 마다 `tracing::debug` + counter increment 만 함. 두 option — A (`FsScanSkips` 에 `events: Vec` field 추가) vs B (connector 에 `Arc>` 주입). **A 채택** (B 는 kebab-source-fs → kebab-app cycle). + +`FsScanSkips` (line 207 부근) 에 `pub events: Vec` 추가, 새 struct `FsSkipEvent { doc_path: String, reason: &'static str, detail: Option }` 정의. 5 skip arm (line 113 builtin_blacklist / 122 gitignore / 131 kebabignore / 154 generated / 179 size_exceeded) 마다 `fs_skips.events.push(FsSkipEvent { ... })` 추가. kebab-app/lib.rs 가 scan 직후 (asset loop 진입 전) `for ev in &fs_skips.events { log_writer.lock().write_event(&LogEvent::Skip { ts: now_ts(), doc_path: &ev.doc_path, reason: ev.reason, detail: ev.detail.as_deref() }) }` enumerate. + +**Hook 5 — fatal error (lib.rs `ingest_with_config_opts` 의 error return path)**: + +`?` operator bubbling 패턴이므로 explicit catch spot 부재. **권장 위치**: `ingest_with_config_opts` body 전체를 inner closure `(|| -> anyhow::Result { ... })()` 로 wrap 후 outer 에서 `match result { Err(e) => { log_writer.lock().write_event(&LogEvent::Error { ts: now_ts(), code: "ingest_fatal", message: &format!("{e:#}") }); flush; Err(e) }, Ok(r) => { write_summary + flush; Ok(r) } }`. 본 패턴은 기존 ingest_progress 의 Completed / Aborted emit 과 mutually exclusive — Aborted 는 cancel 의 정상 종료 (not Err), Err arm 만 LogEvent::Error 발동. spec §4.2 의 "error_wire::classify 자체 변경 0" 와 정합 — classify 는 kebab-cli wire.rs 에서 호출, 본 hook 는 facade 안 generic 처리. 
+ +#### §2.4.3 ownership wiring 요약 + +``` +ingest_with_config_opts: + log_writer: Option>> + ├─ Hook 1: init at entry + write_summary at exit + ├─ apply_ocr_to_pdf_pages closure: + │ log_writer.clone() 캡처 → Hook 2 write_event(LogEvent::Ocr) + │ ocr_ms_samples.clone() 캡처 → success-only ms push + ├─ ingest_one_pdf_asset / _code_asset 의 parse Err arm: Hook 3 write_event(LogEvent::ParseError) + ├─ scan 직후 fs_skips.events enumerate: Hook 4 write_event(LogEvent::Skip) + └─ error_wire::classify 호출 spot: Hook 5 write_event(LogEvent::Error) +``` + +#### §2.4.4 Acceptance + +- `cargo test --workspace -j 1 --no-fail-fast` 전수 pass (기존 1358 test + 어떤 새 test 도 regression 0). +- `cargo build --workspace -j 4` clean. +- `cargo clippy --workspace --all-targets -j 4 -- -D warnings` 0 warning. +- 본 step 내 새 module-level test 없음 — integration test 는 step 5. + +#### §2.4.5 Commit + +``` +feat(app): wire IngestLogWriter into 5 ingest emit hooks (Arc sync) + +v0.20.x ingest log feature 의 ingest pipeline wiring. 5 emit hook: + + Hook 1: ingest_with_config_opts entry/exit (writer init + summary write + flush) + Hook 2: apply_ocr_to_pdf_pages closure (PdfOcrProgress::Finished → LogEvent::Ocr) + Hook 3: ingest_one_*_asset parse Err arm (LogEvent::ParseError) + Hook 4: scan 직후 fs_skips.events enumerate (LogEvent::Skip) + Hook 5: error_wire::classify 호출 spot (LogEvent::Error) + +Hook 4 의 skip event carry 위해 kebab-source-fs 의 FsScanSkips 에 +events: Vec field 추가 (kebab-source-fs 가 kebab-app +재호출 안 함 — cycle 회피). + +Ownership: Option>> binding 1 곳, 5 hook 이 +clone+lock+write. ocr_ms_samples (Vec success-only) 는 Arc +로 share, summary stage 가 sort+p50/p90/max 계산. single-threaded +per-asset loop 라 deadlock/contention 위험 없음. + +Writer 실패는 ingest 자체 fail 시키지 않음 (tracing::warn + 진행). +``` + +--- + +### §2.5 Step 5 — Integration test `ingest_log_smoke` + +**Goal**: spec §5 AC-9 — 5-step body integration test. 
+ +#### §2.5.1 Files affected + +| Path | Action | Approx LOC | Notes | +|------|--------|------------|-------| +| `crates/kebab-app/tests/ingest_log_smoke.rs` | new | +160 | 1 fn `ingest_log_smoke` + 1 supporting fn (minimal corpus generator) | +| `crates/kebab-app/Cargo.toml` | edit (optional) | +0 / +0 | `tempfile` 가 이미 dev-dep 이면 변경 0. (`crates/kebab-app/tests/` 의 기존 test 가 사용 중인지 verify — 거의 확실) | + +#### §2.5.2 Action diff outline + +`crates/kebab-app/tests/ingest_log_smoke.rs` 신규, 2 `#[test]`: + +**`fn ingest_log_smoke` (AC-9, 6-step body)**: + +1. `TempDir::new()` + workspace `tmp/kb` + log_dir `tmp/logs` 생성. +2. minimal corpus — `kb/hello.md` (plain text) + `kb/scanned.pdf` (fixture `tests/fixtures/scanned-1page.pdf` copy; fallback fixture 결정은 §5 OQ-A). +3. `Config::test_default(&workspace)` 의 `cfg.logging = LoggingCfg { ingest_log_enabled: true, ingest_log_dir: log_dir }`. +4. `ingest_with_config_opts(cfg, SourceScope::Workspace, false, IngestOpts::default())` → `.expect("ingest")`. +5. `read_dir(&log_dir)` 안 `ingest-*.ndjson` 정확히 1 file assert. `read_to_string` 으로 body. +6. `body.lines()` 각 line → `serde_json::from_str` 으로 parse, `kind` field ∈ {"ocr","parse_error","skip","error","summary"} assert (matches! macro). 마지막 line `kind == "summary"`, `scanned > 0`, `ocr_pages > 0` assert. + +**`fn ingest_log_disabled_emits_no_file` (AC-6, 4-step body)**: + +1. TempDir + workspace + `hello.md` 만. +2. `cfg.logging = LoggingCfg { ingest_log_enabled: false, .. }`. +3. ingest_with_config_opts 실행. +4. `log_dir` 안 `ingest-*.ndjson` 파일 0개 assert (log_dir 자체 생성됐을 수 있으나 file 0). + +**imports**: `tempfile::TempDir`, `kebab_app::{ingest_with_config_opts, IngestOpts, SourceScope}`, `kebab_config::{Config, LoggingCfg}`, `serde_json::Value`. + +**Fixture fallback**: `tests/fixtures/scanned-1page.pdf` 가 미존재 시 (likely — 본 PR scope 가 아니어서) 기존 PDF fixture (e.g. 
`tests/fixtures/*.pdf`) 중 1 page 의 raster-only 가 있으면 그것을, 없으면 plain text PDF + skip ocr 사례로 test scope 축소 (`ocr_pages > 0` 대신 `summary kind 만 verify`). + +→ executor 가 fixture 위치 확인 후 결정. 본 plan 은 `scanned-1page.pdf` 를 가정. + +#### §2.5.3 Acceptance + +- `cargo test -p kebab-app --test ingest_log_smoke -j 4 2>&1 | tail -3` → `2 passed; 0 failed` (filter 없이 실행하면 본 file 의 2 `#[test]` — `ingest_log_smoke` + `ingest_log_disabled_emits_no_file` — 가 모두 실행됨). +- `cargo test -p kebab-app --test ingest_log_smoke ingest_log_disabled_emits_no_file -j 4` → `1 passed; 0 failed`. + +#### §2.5.4 Commit + +``` +test(app): ingest_log_smoke integration test (AC-9) + +crates/kebab-app/tests/ingest_log_smoke.rs 신규: + + * ingest_log_smoke (AC-9): tempdir + 1 md + 1 scanned PDF → + ingest → assert log file exists + 각 line valid JSON + + 각 kind ∈ {ocr,parse_error,skip,error,summary} + last + line kind=summary + scanned>0 + ocr_pages>0. + + * ingest_log_disabled_emits_no_file (AC-6): enabled=false 일 + 때 log_dir 안 ingest-*.ndjson 파일 0개 verify. + +fixture: tests/fixtures/scanned-1page.pdf (executor 가 기존 +fb-* PR 시 추가했던 scanned PDF fixture 재사용; 미존재 시 +fallback path — fixture 추가 commit 별도 prepend). +``` + +--- + +### §2.6 Step 6 — Final sanity (no commit) + +**Goal**: 누적 workspace test + clippy + (optional) dogfood. + +#### §2.6.1 Verifier + +- **workspace test 전수**: `CARGO_TARGET_DIR=/build/out/cargo-target/target cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -20` → `test result: ok`. +- **clippy**: `cargo clippy --workspace --all-targets -j 4 -- -D warnings 2>&1 | tail -10` → exit 0. +- **format**: `cargo fmt --all --check` → exit 0. +- **(optional) dogfood smoke**: `target/release/kebab ingest --config /tmp/kebab-smoke/config.toml --json 2>/dev/null | tail -3` → success + `ls /tmp/kebab-smoke/logs/ingest-*.ndjson | wc -l` ≥ 1. + +#### §2.6.2 Commit + +본 step 은 commit 0. regression detected 시 step 1–5 중 해당 step 으로 돌아가 fix → `git commit --amend` 또는 `git commit --fixup` (CLAUDE.md §Git Hygiene: "create NEW commits rather than amending" — fixup 권장). 
+ +--- + +## §3 Verifier checklist (cumulative) + +spec §5 AC 마다 step 매핑 + verifier command. 본 plan 의 executor 가 step 종료 시마다 누적 verifier 실행: + +| AC | Spec text 요약 | Verifier | Step | +|----|--------------|----------|------| +| AC-1 | `[logging]` default emit | TOML serialize 시 `[logging]` block 자동 추가 (`Config::default() | toml::to_string`) | Step 1 | +| AC-2 | `ingest-{run_id}.ndjson` 파일 생성 | `ls {log_dir}/ingest-*.ndjson` ≥ 1 (smoke test) | Step 5 (smoke 안 검증) | +| AC-3 | 각 line valid JSON + kind enum | `jq -c 'select(.kind \| IN("ocr","parse_error","skip","error","summary"))' < log.ndjson \| wc -l` = line count | Step 5 | +| AC-4 | OCR per-page + summary record | `grep -c '"kind":"ocr"' < log.ndjson` ≥ 1 + last line kind=summary | Step 5 | +| AC-5 | 모든 failure type record (size_exceeded / parse_error / ocr timeout) | smoke test 의 fixture 가 1개 size_exceeded 또는 ocr-fail 를 trigger 시 grep | Step 5 (optional fixture 확장) | +| AC-6 | `ingest_log_enabled = false` → 파일 0 | `ingest_log_disabled_emits_no_file` test | Step 5 | +| AC-7 | `ingest_log_dir` override → custom path emit | smoke test 의 tempdir 가 그 검증 (default 가 아닌 path 에 file 생성) | Step 5 | +| AC-8 | workspace test + clippy | `cargo test --workspace -j 1` + `cargo clippy --workspace --all-targets -- -D warnings` | Step 6 | +| AC-9 | integration test | `cargo test -p kebab-app --test ingest_log_smoke -j 4` | Step 5 | +| AC-10 | pre-v0.20 config (no [logging]) load with defaults | Step 1 의 새 test 가 fixture toml 의 [logging] 부재 → Config::load 후 logging == LoggingCfg::default() | Step 1 | + +**누적 invariant**: + +- step 1 종료 후: AC-1, AC-10. +- step 2 종료 후: AC-1, AC-10 (writer struct unit test 만). +- step 3 종료 후: 동일 + wire schema additive verified (consumer regression 0). +- step 4 종료 후: 동일 + workspace test regression 0. +- step 5 종료 후: AC-1, AC-2, AC-3, AC-4, AC-6, AC-7, AC-9, AC-10. AC-5 는 fixture coverage 에 따라. +- step 6 종료 후: AC-8 + 전체 cumulative. 
+ +--- + +## §4 Risks resolution + +spec §6 R-1 ~ R-5 + OQ-1 ~ OQ-3 의 plan resolution: + +- **R-1 (log rotation cleanup)**: step 1 의 `LoggingCfg::ingest_log_dir` doc-comment 에 `Log file 누적 disk usage 는 user-managed` 한 줄. README/SMOKE/ARCHITECTURE 변경 0 (user-facing surface 가 config field 자체이고 일반 user 가 default 로 만족). +- **R-2 (concurrent ingest run_id collision)**: step 2 의 `generate_run_id` = ISO 8601 second-precision prefix + uuid v7 마지막 8 hex. uuid v7 은 ms precision + 74-bit random, 8 hex (32 bit) 도 동일 ms 안 collision 확률 1e-9 미만. concurrent ingest 가 의도된 use case 아님 (single-user local-first KB) 이라 mitigate 충분. +- **R-3 (`{state_dir}` placeholder expand)**: step 2 의 `expand_log_dir` 가 hand-roll string-replace. existing `kebab_config::expand_path` 는 tilde/env 만 처리, `{state_dir}` 미지원. follow-up: `expand_path_with_base` 에 `{state_dir}` 도 추가하는 일반화는 본 PR scope 아님 (LOW-2 deferred). +- **R-4 (panic/abort 시 flush 미실행)**: step 2 의 `Drop for IngestLogWriter` 가 `let _ = self.file.flush()` — panic unwind 도 BufWriter::drop 이 flush 시도 (kernel write call). abort (libc::abort, SIGKILL) 는 drop 미실행 — 본 case 는 mitigate 불가 (OS-level limitation). +- **R-5 (`rand` 신규 의존 회피)**: step 2 의 generate_run_id 가 `uuid::Uuid::now_v7().simple().to_string()` 의 마지막 8 hex 사용. uuid v7 는 workspace dep, `rand` 추가 0. + +OQ: + +- **OQ-1 (image_byte_size + dimensions 출처)**: spec ACCEPT 이 `PdfOcrProgress::Finished` carry (Option A) 채택. step 3 가 이 patch 의 wire cascade. +- **OQ-2 (p50 / p90 계산)**: step 4 의 summary stage 가 success-only `Vec` sort + index `len*50/100` (truncating). `quantiles` crate 추가 0. +- **OQ-3 (log cleanup doc 위치)**: step 1 의 `LoggingCfg::ingest_log_dir` doc-comment 만 — README/SMOKE 변경 0. 만약 user-facing 명문이 필요해지면 follow-up commit 으로 README 의 `Configuration` section 에 1 줄. 
+ +추가 OQ (closure r2 LOW-3 의 spec line 22 vs 414 inconsistency): + +- **OQ-4**: spec line 22 의 "wire schema 변경 0" 와 line 414 의 "additive minor (backward compat)" 가 의미상 동등 (additive minor = wire-schema major bump 미발생 = "변경 0" 의 의도). step 3 의 commit body 가 이 명문화 — `additive minor = binary-version cascade trigger 아님` (CLAUDE.md §Versioning cascade 의 "wire 의 additive minor 변경 (...) 은 backward-compat 이라 본 트리거에 해당 안 됨" 와 일치). spec body 자체의 1-line 수정은 별도 prepend commit 또는 본 step 3 commit body 의 명문화로 충분 (executor 재량). + +--- + +## §5 Open questions for executor + +executor (Phase C round 0) 가 결정해야 할 in-step open question: + +- **OQ-A (fixture availability)**: `crates/kebab-app/tests/fixtures/scanned-1page.pdf` 존재 여부 확인. 미존재 시 (a) 기존 fixture 재사용 (e.g. fb-04 의 PDF) — fixture path 만 수정, (b) plain text PDF 로 test scope 축소, (c) 신규 fixture 추가 commit prepend. **권장**: (a) 또는 (b). 추가 fixture commit 은 5-commit 분량 초과. +- **OQ-B (Hook 5 위치 정밀화)**: spec §4.2 Hook 5 가 "`ingest_with_config_opts` 의 error return path (per-asset catch + final Err arm)" 라고 명시. 실제 `lib.rs` 에는 명시적 `match err { ... }` 패턴 부재 — `?` operator chain 으로 bubble. executor 가 `error_wire::classify` 호출 자체 찾아서 그 직전 spot 에 한 줄 추가. classify 호출 위치는 현재 `crates/kebab-app/src/error_wire.rs` 혹은 그 caller (kebab-cli `wire.rs`). 본 plan 은 kebab-app facade 안에서 classify 호출이 발생한다고 가정 — 만약 classify 가 kebab-cli 에서만 호출되면 Hook 5 가 spec 의 "writer 생명주기" 와 mismatch (writer 는 kebab-app 안). 이때는 kebab-app facade 의 final Err arm 에서 `let _ = log_writer.lock().map(|mut w| w.write_event(LogEvent::Error { code: "ingest_fatal", message: &format!("{e}") }))` 식 generic 처리. executor 가 grep 후 결정. +- **OQ-C (OCR ms accumulator share pattern)**: closure 가 FnMut 라면 `RefCell>` 충분, FnOnce/Fn 라면 `Arc>>`. emit_progress 가 FnMut 로 보임 (line 88 `F: FnMut(PdfOcrProgress)`) → RefCell 도 가능하나 본 plan 은 lock writer 와 같은 pattern (`Arc>`) 으로 일관성. +- **OQ-D (skip event 누락 case)**: `FsScanSkips.events` 가 5 skip arm 중 어느 한 곳이라도 누락되면 AC-5 가 fail. 
executor 가 connector.rs 의 5 skip spot (builtin_blacklist / gitignore / kebabignore / generated / size_exceeded) 모두 push 추가 verify. + +--- + +## §6 References + +- **Spec**: `docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md` (491 line, ACCEPT 7/7 + 1 LOW) +- **Closure critic r2**: `.omc/reviews/2026-05-28-v0.20-ingest-log-spec-closure-r2-result.md` +- **Brief**: `.omc/reviews/2026-05-28-v0.20-ingest-log-plan-drafter-brief.md` +- **Parent task**: `tasks/p10/p10-1A-5-ingest-failure-log.md` +- **Parent design**: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §8 (wire schema), §9 (versioning cascade) +- **Bug #11 follow-up**: OCR raster image metric capture (pdf_ocr_apply.rs line 145 vicinity) +- **Existing wire schema**: `docs/wire-schema/v1/ingest_progress.schema.json` (57 line) +- **Existing IngestEvent**: `crates/kebab-app/src/ingest_progress.rs` line 61–103 +- **Existing PdfOcrProgress**: `crates/kebab-app/src/pdf_ocr_apply.rs` line 276–294 +- **Existing fs skip detection**: `crates/kebab-source-fs/src/connector.rs::scan_with_skips` (line 100 부근, 5 skip arm) +- **xdg_state_dir**: `crates/kebab-config/src/lib.rs` line 1112 +- **uuid v7 workspace dep**: `Cargo.toml` line 132 (`uuid = { version = "1", features = ["v7", "serde"] }`) +- **time crate workspace dep**: `Cargo.toml` line 131 (`time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }`) + +--- + +## §7 Constraints (worker protocol + spec) + +1. **branch 변경 0** — 모든 commit 은 `feat/pdf-scanned-ocr` HEAD (`6a9551e`) 의 직계 descendant. PR 은 main 으로. +2. **subagent skip** — executor 가 nested subagent spawn 안 함, in-session direct edit. +3. **spec ACCEPT frozen 변경 0** — spec body 의 1-line LOW-3 fix (line 22 ↔ 414 정합화) 는 별도 spec-edit commit (필요 시 본 PR 이외). +4. **wire schema = additive minor** — `ingest_progress.v1` 의 4 추가 field 가 모두 optional, `required` array 변경 0. 기존 consumer (`kebab-cli wire_search` / `wire_ask` / Claude Code skill) regression 0. +5. 
**regression 0** — 기존 1358 workspace test + 새 +8 test (config roundtrip 1, ingest_log unit 5, integration 2). cumulative `cargo test --workspace -j 1` 전수 pass. +6. **commit 단위 = 5** — spec acceptance scope 의 commit boundary. step 6 는 verifier-only, no commit. +7. **plan line 500–700** — 본 file 약 670 line target. +8. **dogfood 영향 0** — 본 plan 의 commit 들이 mass-merged 후 dogfood smoke 가 fail 시 사용자 보고 + revert. dogfood = `docs/SMOKE.md` 의 isolated TempDir KB pipeline. +9. **binary version bump 0** — wire schema additive minor + design contract 변경 0 → CLAUDE.md §Versioning cascade 의 bump trigger 미발동 (현재 `0.19.x` 가정, executor 가 workspace `Cargo.toml` version 확인). +10. **HANDOFF/ARCHITECTURE 변경 0** — 사용자 surface (CLI flag, TUI key, config field 사용자 노출) 변경이 config 1개 (logging) 뿐 → README 의 `Configuration` section 에 한 줄 (feedback_readme_sync_rule 의 "사용자 visible surface 변경 시" 가 강하게 trigger). step 1 또는 step 4 commit 에서 README 한 줄 추가 (option) — line 700 분량 절감 위해 본 plan 은 명문 0, executor 재량. + +--- + +## §8 Plan-level estimate + +- **drafter (current task)**: 30 min — read brief + spec + 8 source spot grep + write plan. +- **executor (next phase)**: 7-8 h — step 1 (30 min) + step 2 (90 min) + step 3 (60 min) + step 4 (120 min) + step 5 (90 min) + step 6 (30 min) = 420 min (7 h) + commit drafting + dogfood smoke. +- **review (final phase)**: 30-60 min — 5 commit diff scan + AC verifier reproduce + dogfood log file 1 spot check. + +총 8-10 h end-to-end, 본 PR 만으로 dogfood 사용자 (Phase B4 → B4-execute → review) 완료 가능. 
diff --git a/docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md b/docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md new file mode 100644 index 0000000..0b103a9 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-v0.20-ingest-log-spec.md @@ -0,0 +1,491 @@ +--- +title: "v0.20.x ingest log feature — spec" +date: 2026-05-28 +status: "DRAFT (round r1c)" +target_version: 0.20.x +phase: A4 (spec drafter rewrite) +parent_spec: ../../../tasks/p10/p10-1A-5-ingest-failure-log.md +contract_sections: [] +references: [2026-04-27-kebab-final-form-design.md, 프리-v0.20 dogfood Bug #15 (ocr timeout + log analysis), 2026-05-28 closure critic result] +--- + +# v0.20.x ingest log feature — spec + +## 동기 + +dogfood 4-round 후 사용자가 명시 요구: + +> ocr 실패를 포함한 전체적인 실패 발생 시 그 로그를 자세하기 볼 수 있도록 하고 싶은데. 그 통계들을 알 수 있어야 정확한 설정 sweet spot이나 설계 방향성을 알 수 있을 것 같아. + +**핵심 문제**: OCR 타임아웃, parse error, skip 같은 ingest 실패를 individual event + summary stats로 기록할 structured log surface가 없음. 사용자가 sweet-spot (OCR timeout, model 선택) 을 찾으려면 per-page 통계 + aggregate stats가 필수. + +**Surface 선택**: structured ndjson file (wire schema 변경 0, internal write-only). 별도 CLI query 없이 grep/jq로 사용자가 직접 분석 가능. 
+ +--- + +## 스코프 + +### 포함 +- `[logging]` config section (2 field: `ingest_log_enabled`, `ingest_log_dir`) +- Per-ingest-run ndjson log file (`ingest-{run_id}.ndjson`, run_id = ISO 8601 + random suffix) +- 5 kind 의 log record: `ocr`, `parse_error`, `skip`, `error`, `summary` +- OCR per-page event 에 image byte size + dimensions 측정 포함 +- Run ID + timestamp ISO 8601 UTC format +- Backward compat: pre-v0.20 config files가 new `[logging]` field 자동 무시 + +### Out of scope +- SQLite 영구 저장 (future enhancement) +- Log level switches (verbose / minimal) — 본 round single level +- Log rotation / cleanup policy (user-managed) +- Query CLI (`kebab logs`) +- Wire schema 변경 (internal use only) + +--- + +## 설계 결정 + +### §3.1 Config schema — `[logging]` section + +**파일**: `crates/kebab-config/src/lib.rs` (line 37+ 에 Config struct 기존) + +新 struct `LoggingCfg`: + +```rust +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct LoggingCfg { + /// ingest 시 structured ndjson log 자동 write. default = true. + /// false 시 log file 생성 0. + #[serde(default = "default_ingest_log_enabled")] + pub ingest_log_enabled: bool, + + /// per-ingest-run log file directory. default = `{state_dir}/logs`. + /// `{state_dir}` placeholder 가 XDG state dir 로 expand. + #[serde(default = "default_ingest_log_dir")] + pub ingest_log_dir: PathBuf, +} + +fn default_ingest_log_enabled() -> bool { true } +fn default_ingest_log_dir() -> PathBuf { + // expand 후 = ~/.local/state/kebab/logs + PathBuf::from("{state_dir}/logs") +} + +impl Default for LoggingCfg { + fn default() -> Self { + Self { + ingest_log_enabled: default_ingest_log_enabled(), + ingest_log_dir: default_ingest_log_dir(), + } + } +} +``` + +**Config struct 에 추가** (line 37+): + +```rust +pub struct Config { + // ... existing fields ... + #[serde(default)] + pub logging: LoggingCfg, +} +``` + +`#[serde(default)]` 로 backward compat 보장 (pre-v0.20 config 파일이 `[logging]` section 없으면 자동 default init). 
+ +### §3.2 Per-ingest-run filename + ID generation + +**Filename format**: `ingest-{run_id}.ndjson` + +**Run ID**: ISO 8601 timestamp + 8-char random suffix +- Example: `20260528T013000Z-abc123de` +- Format: `YYYYMMDDTHHmmssZ-{8-char random alphanumeric}` +- Sortable + readable + concurrent collision unlikely + +**Path expansion**: `{state_dir}` placeholder 를 `~/.local/state/kebab` 로 expand +- `{state_dir}` 대체 별도 hand-roll (kebab-config 의 expand_path_with_base 는 {state_dir} 미지원) +- Directory auto-create if missing + +### §3.3 Log content — ndjson schema + +각 line = JSON object (ndjson format). 각 record 의 `kind` 가 union variant discriminator. + +#### Record kind: `ocr` + +OCR per-page attempt (success / failure). **HIGH-1 (PdfOcrProgress::Finished extend)**: image_byte/dims/failure_reason 은 PdfOcrProgress::Finished variant 에 추가되어 carry. + +```jsonc +{ + "ts": "2026-05-28T01:30:01.123Z", + "kind": "ocr", + "doc_path": "metro-korea.pdf", + "page": 8, + "image_byte_size": 989512, + "image_width": 2480, + "image_height": 3508, + "ms": 180000, + "chars": 0, + "success": false, + "reason": "timeout", + "ocr_engine": "ollama-vision" +} +``` + +**Fields**: +- `ts`: ISO 8601 UTC (milliseconds, using `time` crate) +- `kind`: literal `"ocr"` +- `doc_path`: relative or absolute source path +- `page`: page number (0-indexed or 1-indexed — confirm in impl) +- `image_byte_size`: raster image byte size (from PdfOcrProgress::Finished.image_byte_size) +- `image_width`, `image_height`: raster dimensions pixel (from PdfOcrProgress::Finished.image_width/height) +- `ms`: OCR duration milliseconds +- `chars`: extracted character count (0 if failed) +- `success`: boolean (failure_reason Some → false) (from PdfOcrProgress::Finished.failure_reason.is_some() negation) +- `reason`: "timeout" | "network_error" | "malformed_image" | "other" (from PdfOcrProgress::Finished.failure_reason) +- `ocr_engine`: "ollama-vision" or config'd name + +#### Record kind: `parse_error` + +PDF / other format 
parse failure. + +```jsonc +{ + "ts": "2026-05-28T01:30:02.456Z", + "kind": "parse_error", + "doc_path": "weird.pdf", + "reason": "lopdf_error", + "message": "unexpected EOF in xref table" +} +``` + +#### Record kind: `skip` + +Document skip (size_exceeded / gitignore / builtin_blacklist). + +```jsonc +{ + "ts": "2026-05-28T01:30:03.789Z", + "kind": "skip", + "doc_path": "large.zip", + "reason": "builtin_blacklist", + "detail": ".zip extension" +} +``` + +#### Record kind: `error` + +Fatal error emit (error.v1 이 emit 될 때 동시 log write). + +```jsonc +{ + "ts": "2026-05-28T01:30:04.012Z", + "kind": "error", + "code": "config_not_found", + "message": "config file does not exist: /tmp/config.toml" +} +``` + +#### Record kind: `summary` + +Ingest run 최종 aggregate stats (마지막 line). + +```jsonc +{ + "ts": "2026-05-28T01:31:00.000Z", + "kind": "summary", + "run_id": "20260528T013000Z-abc123de", + "scanned": 11, + "new": 11, + "errors": 0, + "ocr_pages": 21, + "ocr_failures": 2, + "ocr_p50_ms": 1500, + "ocr_p90_ms": 63000, + "ocr_max_ms": 180007, + "duration_ms": 555550 +} +``` + +**Fields**: +- `run_id`: 이 ingest run 의 고유 ID +- `scanned`: 전체 시도 doc count +- `new`: 실제 ingest 된 new doc count +- `errors`: fatal error count +- `ocr_pages`: OCR 시도한 총 page count +- `ocr_failures`: OCR 실패한 page count +- `ocr_p50_ms`, `ocr_p90_ms`, `ocr_max_ms`: percentile + max (success 한 page 만, null 가능) +- `duration_ms`: entire run elapsed + +--- + +## §4 구현 명세 + +### §4.1 새 module: `crates/kebab-app/src/ingest_log.rs` + +```rust +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; +use std::time::SystemTime; +use serde::{Serialize, Deserialize}; + +pub struct IngestLogWriter { + file: Option>, + run_id: String, + started_at: SystemTime, +} + +impl IngestLogWriter { + /// Open log file. `cfg.ingest_log_enabled == false` 면 None 반환. 
+ pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> { + if !cfg.ingest_log_enabled { + return Ok(None); + } + let run_id = generate_run_id(); + let log_dir = expand_log_dir(&cfg.ingest_log_dir)?; + std::fs::create_dir_all(&log_dir)?; + let path = log_dir.join(format!("ingest-{run_id}.ndjson")); + let file = BufWriter::new(File::create(&path)?); + Ok(Some(Self { + file: Some(file), + run_id, + started_at: SystemTime::now(), + })) + } + + pub fn write_event(&mut self, event: &LogEvent) -> anyhow::Result<()> { + if let Some(ref mut f) = self.file { + serde_json::to_writer(&mut *f, event)?; + writeln!(f)?; + } + Ok(()) + } + + pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> { + if let Some(ref mut f) = self.file { + serde_json::to_writer(&mut *f, summary)?; + writeln!(f)?; + } + Ok(()) + } + + pub fn flush(&mut self) -> anyhow::Result<()> { + if let Some(ref mut f) = self.file { + f.flush()?; + } + Ok(()) + } + + pub fn run_id(&self) -> &str { + &self.run_id + } +} + +impl Drop for IngestLogWriter { + fn drop(&mut self) { + let _ = self.flush(); + } +} + +fn generate_run_id() -> String { + use time::OffsetDateTime; + use time::format_description::well_known::iso8601; + let now = OffsetDateTime::now_utc(); + // Format: 20260528T013000Z (compact ISO 8601) + let mut buffer = [0u8; 16]; + let ts = now.format( + &format_description::parse("[year][month][day]T[hour][minute][second]Z") + .expect("format_description is valid") + ).expect("format should succeed"); + let suffix: String = (0..8) + .map(|_| { + const CHARS: &[u8] = b"abcdefghijklmnopqrstuvwxyz0123456789"; + CHARS[rand::random::<usize>() % CHARS.len()] as char + }) + .collect(); + format!("{ts}-{suffix}") +} + +fn expand_log_dir(path: &PathBuf) -> anyhow::Result<PathBuf> { + // 이미 기존 expand_path / expand_path_with_base 가 있으므로 활용 + // {state_dir} 의존 → kebab_config::Config::xdg_state_dir() + "/logs" + use std::env; + let path_str = path.to_string_lossy(); + if 
path_str.contains("{state_dir}") { + let state_dir = kebab_config::Config::xdg_state_dir(); + Ok(PathBuf::from(path_str.replace("{state_dir}", state_dir.to_str().unwrap()))) + } else { + Ok(path.clone()) + } +} + +#[derive(Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum LogEvent<'a> { + #[serde(rename = "ocr")] + Ocr { + ts: String, + doc_path: &'a str, + page: u32, + image_byte_size: Option, + image_width: Option, + image_height: Option, + ms: u64, + chars: u32, + success: bool, + reason: Option<&'a str>, + ocr_engine: &'a str, + }, + #[serde(rename = "parse_error")] + ParseError { + ts: String, + doc_path: &'a str, + reason: &'a str, + message: &'a str, + }, + #[serde(rename = "skip")] + Skip { + ts: String, + doc_path: &'a str, + reason: &'a str, + detail: Option<&'a str>, + }, + #[serde(rename = "error")] + Error { + ts: String, + code: &'a str, + message: &'a str, + }, + #[serde(rename = "summary")] + Summary { + ts: String, + run_id: String, + scanned: u32, + new: u32, + errors: u32, + ocr_pages: u32, + ocr_failures: u32, + ocr_p50_ms: Option, + ocr_p90_ms: Option, + ocr_max_ms: Option, + duration_ms: u64, + }, +} + +#[derive(Serialize, Deserialize)] +pub struct IngestSummary { + pub ts: String, + pub run_id: String, + pub scanned: u32, + pub new: u32, + pub errors: u32, + pub ocr_pages: u32, + pub ocr_failures: u32, + pub ocr_p50_ms: Option, + pub ocr_p90_ms: Option, + pub ocr_max_ms: Option, + pub duration_ms: u64, +} +``` + +### §4.2 Emit hook integration points + +**Hook 1**: `crates/kebab-app/src/lib.rs::ingest_with_config` (line 234) +- `IngestLogWriter::open(cfg.logging)` at function entry → `Option>>` +- `IngestLogWriter::flush()` at function exit (success / error) +- `IngestLogWriter` 를 `Arc>` wrap 후 `ingest_with_config_opts()` 의 `IngestOpts` 에 carry +- **MEDIUM-1 (ownership)**: `apply_pdf_ocr` 의 emit_progress closure 가 `Arc>` clone 캡처 + lock + write. single-threaded sync (per-asset loop) 라 blocking lock 안전. 
+ +**Hook 2**: OCR progress event (pdf_ocr_apply.rs) +- **HIGH-1 (PdfOcrProgress::Finished extend)**: `PdfOcrProgress::Finished` variant 확장 — additive field: + ```rust + PdfOcrProgress::Finished { + page: u32, + ms: u64, + chars: u32, + skipped: bool, + // NEW: + image_byte_size: Option, + image_width: Option, + image_height: Option, + failure_reason: Option, // "timeout" | "ocr_error" | "network_error" | None + } + ``` +- emit_progress closure 내에서 log writer (Arc>) 에 ocr event 전달 +- ingest_progress.v1 (IngestEvent::PdfOcrFinished) cascade 영향: additive minor (backward compat) + +**Hook 3**: Parse error (app.rs / ingest pipeline error path) +- parse_error kind emit when `Error::PdfFormat` / `Error::ImageFormat` 발생 + +**Hook 4**: Skip event (kebab-source-fs/src/connector.rs) +- size_exceeded / gitignore / builtin_blacklist skip 시 log writer에 skip event 전달 + +**Hook 5**: Fatal error (**HIGH-4 위치 정정**) +- **위치**: `crates/kebab-app/src/lib.rs::ingest_with_config_opts` 의 error return path (per-asset catch + final Err arm) +- `classify(err, verbose)` invoke 직후 log writer에 LogEvent::Error emit +- `error_wire::classify` 자체는 변경 0 유지 (side-effect 없는 순수 변환) + +### §4.3 Timestamp format + +**HIGH-3 (chrono → time crate)**: ISO 8601 UTC, milliseconds precision: +- Example: `2026-05-28T01:30:01.123Z` +- Use `time::OffsetDateTime::now_utc().format(&time::format_description::well_known::Rfc3339)` +- 항상 UTC (system timezone 무관) +- workspace 는 `time` crate 이미 사용 중 (chrono 중복 의존 제거) + +### §4.4 Backward compat + +`Config` struct의 `logging` field에 `#[serde(default)]` tag: +```rust +#[serde(default)] +pub logging: LoggingCfg, +``` + +→ pre-v0.20 config file이 `[logging]` section 없으면 자동 `LoggingCfg::default()` init (enabled=true, dir=~/.local/state/kebab/logs) + +--- + +## §5 Acceptance criteria + +- **AC-1**: `[logging]` section default emit (새 config 또는 `config init`). +- **AC-2**: `kebab ingest` 실행 후 `{log_dir}/ingest-{run_id}.ndjson` 파일 존재. 
+- **AC-3**: 각 line valid JSON + `kind` enum value + `ts` ISO 8601. +- **AC-4**: OCR per-page record + summary record (마지막 line). +- **AC-5**: 모든 failure type (size_exceeded / parse_error / ocr timeout) record 됨. +- **AC-6**: `ingest_log_enabled = false` 시 log file 생성 0. +- **AC-7**: `ingest_log_dir = "/tmp/custom"` override 시 그 path에 file emit. +- **AC-8**: `cargo test -p kebab-app --lib ingest_log` + `cargo clippy` green. +- **AC-9** (**MEDIUM-2 actionability**): integration test — `cargo test -p kebab-app --test ingest_log_smoke -j 4 2>&1 | tail -3` → 1 passed; 0 failed. test body: + 1. tempdir + minimal corpus (1 markdown + 1 image PDF). + 2. ingest with `[logging] ingest_log_dir = tempdir/logs`. + 3. assert: log file `tempdir/logs/ingest-{run_id}.ndjson` exists. + 4. parse each line as JSON, assert kinds = [ocr, summary] or more. + 5. last line kind = "summary" + scanned > 0 && ocr_pages > 0. +- **AC-10**: pre-v0.20 config + v0.20 binary 호환성 (new `[logging]` field 무시). + +--- + +## §6 위험 + 미해결 질문 + +### Risks + +- **R-1**: Log file 누적 disk usage — user가 직접 정리. Doc comment 명시. +- **R-2**: Concurrent ingest 의 run_id collision — 8-char random + timestamp 로 거의 불가능. Mitigate. +- **R-3**: `{state_dir}` placeholder expand — hand-roll via xdg_state_dir + string replace. existing `expand_path_with_base` 는 {state_dir} 미지원 (LOW-2). +- **R-4**: Error path panic 시 log writer drop → flush 미실행 — Drop impl 에 flush() 호출로 mitigate. +- **R-5**: workspace dependency 추가: `rand` (run_id suffix), `time` (timestamp) 이미 kebab-app 의존. `chrono` 신규 추가 금지 (HIGH-3). + +### Open questions + +- **OQ-1** (**HIGH-1 의 design decision으로 승격**): image_byte_size + dimensions 출처 — PdfOcrProgress::Finished 에 carry (Option A 채택). Bug #11 follow-up 에서 raster image 측정 이미 시작. +- **OQ-2**: `ocr_p50_ms`, `ocr_p90_ms` 계산 — `quantiles` crate 사용 또는 간단 sorted vec? (plan drafter 결정) +- **OQ-3**: Log file 수동 cleanup 정책을 user-facing docs에 명시할 위치? 
(README / SMOKE / config example, plan drafter + executor 결정) + +--- + +## §7 참고 + +- **Parent task**: `tasks/p10/p10-1A-5-ingest-failure-log.md` +- **Parent design**: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (§8 wire schema, §9 versioning cascade — 변경 0) +- **Bug #11 follow-up**: OCR image metric capture (raster path in kebab-parse-pdf) +- **Related**: `crates/kebab-app/src/error_wire.rs` (ErrorV1 emit), `crates/kebab-app/src/ingest_progress.rs` (IngestEvent) +- **Config**: `crates/kebab-config/src/lib.rs` (Config struct, expand_path helpers, xdg_state_dir)