diff --git a/crates/kebab-app/src/app.rs b/crates/kebab-app/src/app.rs index 6a3043f..5f7a42f 100644 --- a/crates/kebab-app/src/app.rs +++ b/crates/kebab-app/src/app.rs @@ -40,11 +40,18 @@ use anyhow::{Context, Result, anyhow}; use lru::LruCache; use kebab_core::{ - Answer, DocumentStore, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, - SearchMode, SearchOpts, SearchQuery, VectorStore, + Answer, DocumentStore, Embedder, ExtractContext, Extractor, IndexVersion, LanguageModel, + MediaType, Retriever, SearchHit, SearchMode, SearchOpts, SearchQuery, VectorStore, }; use kebab_embed_local::FastembedEmbedder; use kebab_llm_local::OllamaLanguageModel; +use kebab_parse_code::{ + CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, + JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, + TypescriptAstExtractor, +}; +use kebab_parse_image::ImageExtractor; +use kebab_parse_pdf::PdfTextExtractor; use kebab_rag::{AskOpts, RagPipeline}; use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever}; use kebab_store_sqlite::SqliteStore; @@ -115,6 +122,12 @@ pub fn short_query_hint(query_text: &str, hits_empty: bool) -> Option { pub struct App { pub(crate) config: kebab_config::Config, pub(crate) sqlite: Arc, + /// post-v0.18.0 extractor-dispatch-unification: polymorphic Extractor + /// registry. App init 시 1회 등록되어 `extract_for(...)` 가 lookup + /// 한다. 현재 11 entry (ImageExtractor + PdfTextExtractor + 9 AST). + /// MarkdownExtractor 는 별 PR 에서 추가 — markdown ingest path 는 + /// 본 PR 에서 free-function 그대로 유지. + pub(crate) extractors: Vec>, /// Memoized embedder — built lazily on first `embedder()` call when /// embeddings are enabled. `OnceLock` keeps the struct `Sync` and /// the build path cold-only-once. @@ -204,6 +217,25 @@ impl App { // `None` (cache disabled — every search hits the retrievers). let search_cache = NonZeroUsize::new(config.search.cache_capacity) .map(|cap| Mutex::new(LruCache::new(cap))); + // post-v0.18.0 extractor-dispatch-unification: build the 11-entry + // Extractor registry. All entries are state-less unit structs with + // zero-cost `new()`, so init cost is effectively 0 and side effects + // are 0 — `pipeline_verifier` fallible `?` below may bail but the + // already-constructed `extractors` Vec drops without cost. Markdown + // is NOT registered (see field doc). + let extractors: Vec> = vec![ + Box::new(ImageExtractor::new()), + Box::new(PdfTextExtractor::new()), + Box::new(RustAstExtractor::new()), + Box::new(PythonAstExtractor::new()), + Box::new(TypescriptAstExtractor::new()), + Box::new(JavascriptAstExtractor::new()), + Box::new(GoAstExtractor::new()), + Box::new(JavaAstExtractor::new()), + Box::new(KotlinAstExtractor::new()), + Box::new(CAstExtractor::new()), + Box::new(CppAstExtractor::new()), + ]; // p9-fb-41 PR-9c-2: build the NLI verifier when the gate is // enabled. App carries it on `RagPipeline` via // `with_verifier` so the rag crate doesn't have to know about @@ -222,6 +254,7 @@ impl App { Ok(Self { config, sqlite: Arc::new(sqlite), + extractors, embedder: OnceLock::new(), vector: OnceLock::new(), llm: OnceLock::new(), @@ -230,6 +263,30 @@ impl App { }) } + /// Polymorphic dispatcher for the [`Extractor`] trait. Looks up the + /// first Extractor whose `supports(media)` returns true and invokes + /// `extract(ctx, bytes)` on it. + /// + /// Errors with `anyhow!("no Extractor for media_type {media:?}")` + /// when no matching Extractor is registered. Callers in + /// `ingest_one_*_asset` reach this only after the outer 4-arm + /// dispatch (`MediaType::Markdown` / `Image` / `Pdf` / `Code(lang)`) + /// has matched, so a miss is a programming error — NOT a user- + /// facing skip. + pub(crate) fn extract_for( + &self, + media: &MediaType, + ctx: &ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let extractor = self + .extractors + .iter() + .find(|e| e.supports(media)) + .ok_or_else(|| anyhow!("no Extractor for media_type {media:?}"))?; + extractor.extract(ctx, bytes) + } + /// Run a [`SearchQuery`] through the configured retriever stack and /// return the top-k hits. p9-fb-19: result is served from the /// in-process LRU cache when the same `(query_norm, mode, k, @@ -1157,3 +1214,128 @@ mod tests_trace { assert!(resp.trace.is_some(), "trace populated when opts.trace=true"); } } + +/// post-v0.18.0 extractor-dispatch-unification: in-crate unit tests for +/// the `App.extractors` registry + `App::extract_for` polymorphic +/// dispatch. In-crate (not `tests/`) because `extractors` + `extract_for` +/// are `pub(crate)` — integration tests cannot reach them. +/// +/// Spec §5.1 + plan §2 Step 10 — 3 test class: +/// 1. registry length = 11 (image + pdf + 9 AST). +/// 2. mutually-exclusive `supports()` grid over 16 sample MediaTypes. +/// 3. `extract_for` returns `Err("no Extractor ...")` for registry-NOT-cover +/// MediaType (Audio). +#[cfg(test)] +mod tests_extractor_dispatch { + use super::*; + use kebab_core::{AudioType, ExtractConfig, ImageType}; + + /// helper: tempdir-isolated App for tests (mirrors `tests_trace`'s + /// `open_app_with_temp_dir` pattern). + fn open_app_with_temp_dir() -> (tempfile::TempDir, App) { + let dir = tempfile::tempdir().unwrap(); + let mut cfg = kebab_config::Config::defaults(); + cfg.storage.data_dir = dir.path().to_string_lossy().into_owned(); + // Bring up migrations. + let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + drop(store); + let app = App::open_with_config(cfg).unwrap(); + (dir, app) + } + + /// Registry length invariant: 11 Extractor (image + pdf + 9 AST). + /// Markdown is NOT registered (free-function path — defer to a + /// separate PR per spec §3.4). + #[test] + fn registry_has_eleven_extractors() { + let (_dir, app) = open_app_with_temp_dir(); + assert_eq!( + app.extractors.len(), + 11, + "registry must hold 11 Extractors (image + pdf + 9 AST). \ + markdown 은 별 PR." + ); + } + + /// 11 Extractor 의 `supports()` 가 16 sample MediaType 에 대해 + /// mutually exclusive — 어떤 두 Extractor 도 동일 MediaType 에 + /// 대해 true 반환 안 됨. + #[test] + fn supports_grid_is_mutually_exclusive() { + let (_dir, app) = open_app_with_temp_dir(); + let samples = vec![ + MediaType::Markdown, + MediaType::Pdf, + MediaType::Image(ImageType::Png), + MediaType::Image(ImageType::Jpeg), + MediaType::Code("rust".into()), + MediaType::Code("python".into()), + MediaType::Code("typescript".into()), + MediaType::Code("javascript".into()), + MediaType::Code("go".into()), + MediaType::Code("java".into()), + MediaType::Code("kotlin".into()), + MediaType::Code("c".into()), + MediaType::Code("cpp".into()), + MediaType::Code("yaml".into()), // registry NOT cover + MediaType::Code("shell".into()), // registry NOT cover + MediaType::Audio(AudioType::Wav), // registry NOT cover + ]; + for sample in &samples { + let hits: Vec<_> = app + .extractors + .iter() + .filter(|e| e.supports(sample)) + .collect(); + assert!( + hits.len() <= 1, + "mutually exclusive violated for {sample:?}: {} hits", + hits.len() + ); + } + } + + /// `extract_for` 가 registry NOT cover MediaType (Audio) 에 대해 + /// `Err("no Extractor for media_type ...")` 반환. Audio MediaType + /// 사용으로 RawAsset 의 actual content 의존 회피 — registry NOT + /// cover → 즉시 Err. + #[test] + fn extract_for_unsupported_media_errors() { + let (_dir, app) = open_app_with_temp_dir(); + + // Minimal RawAsset. Actual content never read — Audio MediaType + // 는 registry NOT cover → `extract_for` 가 dispatch loop 안에서 + // 바로 Err 반환. RawAsset field set 은 `crates/kebab-core/src/ + // asset.rs:62-73` 와 정합 (8 field). + let asset = kebab_core::RawAsset { + asset_id: kebab_core::AssetId("00".repeat(16)), + source_uri: kebab_core::SourceUri::File("/tmp/dummy.wav".into()), + workspace_path: kebab_core::WorkspacePath("dummy.wav".to_string()), + media_type: MediaType::Audio(AudioType::Wav), + byte_len: 0, + checksum: kebab_core::Checksum("00".repeat(32)), + discovered_at: time::OffsetDateTime::now_utc(), + // AssetStorage::Inline 미존재 — actual variant `Copied { path }` + // 사용 (kebab-core/src/asset.rs:55-60). + stored: kebab_core::AssetStorage::Copied { + path: std::path::PathBuf::from("/tmp/dummy.wav"), + }, + }; + + let workspace_root: std::path::PathBuf = std::path::PathBuf::from("/tmp"); + let cfg = ExtractConfig::default(); + let ctx = ExtractContext { + asset: &asset, + workspace_root: &workspace_root, + config: &cfg, + }; + let result = app.extract_for(&MediaType::Audio(AudioType::Wav), &ctx, &[]); + assert!(result.is_err(), "Audio 는 registry 미포함 → Err 기대"); + let err_msg = format!("{:#}", result.unwrap_err()); + assert!( + err_msg.contains("no Extractor"), + "unexpected err: {err_msg}" + ); + } +} diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 2378e2e..96951e5 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -43,14 +43,12 @@ use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, Co use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, - EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType, + EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore, }; use kebab_llm_local::OllamaLanguageModel; -use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::{CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; -use kebab_parse_pdf::PdfTextExtractor; +use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr}; use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -353,9 +351,7 @@ pub fn ingest_with_config_opts( } else { None }; - let image_extractor = ImageExtractor::new(); let image_pipeline = ImagePipeline { - extractor: &image_extractor, ocr_engine: ocr_engine.as_ref(), caption_llm: caption_llm.as_deref(), }; @@ -758,7 +754,6 @@ type SqliteStoreAlias = kebab_store_sqlite::SqliteStore; /// once per ingest invocation. Threaded through `ingest_one_asset` so /// the dispatch does not need ten separate parameters. struct ImagePipeline<'a> { - extractor: &'a ImageExtractor, ocr_engine: Option<&'a OllamaVisionOcr>, caption_llm: Option<&'a dyn LanguageModel>, } @@ -1232,7 +1227,6 @@ fn ingest_one_image_asset( image_pipeline: &ImagePipeline<'_>, force_reingest: bool, ) -> anyhow::Result { - let image_extractor = image_pipeline.extractor; let ocr_engine = image_pipeline.ocr_engine; let caption_llm = image_pipeline.caption_llm; let path = match &asset.source_uri { @@ -1291,9 +1285,9 @@ fn ingest_one_image_asset( workspace_root: &workspace_root, config: &extract_config, }; - let mut canonical = image_extractor - .extract(&ctx, &bytes) - .context("kb-parse-image::ImageExtractor::extract")?; + let mut canonical = app + .extract_for(&asset.media_type, &ctx, &bytes) + .context("kb-app::extract_for (image)")?; // 2 + 3. Apply OCR / caption when their adapters exist. Both are // Lenient — failure is captured into Provenance Warning, @@ -1780,9 +1774,9 @@ fn ingest_one_pdf_asset( workspace_root: &workspace_root, config: &extract_config, }; - let mut canonical = PdfTextExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-pdf::PdfTextExtractor::extract")?; + let mut canonical = app + .extract_for(&asset.media_type, &ctx, &bytes) + .context("kb-app::extract_for (pdf)")?; // Per-medium chunker selection: PDF docs always use pdf-page-v1 // regardless of `config.chunking.chunker_version`. The chunker @@ -2007,43 +2001,24 @@ fn ingest_one_code_asset( config: &extract_config, }; - // p10-1b Task D/G/J/L: extractor per-lang. + // post-v0.18.0 extractor-dispatch-unification: + // 9 AST lang 의 dispatch 가 polymorphic — App.extractors registry 의 + // `*AstExtractor` entry 가 lang string 으로 disjoint `supports()` 비교 + // 후 단일 hit. Tier 2 (manifest) + Tier 3 (shell) 은 free-function + // `synthesize_tier2_document` 유지 (Extractor impl 아님 — 별 PR). // p10-3: capture Result so Tier 1 extractor errors can fall back to Tier 3. let canonical_result: anyhow::Result = match code_lang { - "rust" => RustAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::RustAstExtractor::extract (code:rust)"), - "python" => PythonAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::PythonAstExtractor::extract (code:python)"), - "typescript" => TypescriptAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)"), - "javascript" => JavascriptAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)"), - "go" => GoAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::GoAstExtractor::extract (code:go)"), - "java" => JavaAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::JavaAstExtractor::extract (code:java)"), - "kotlin" => KotlinAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)"), + // 9 AST lang: rust / python / typescript / javascript / go / java / kotlin / c / cpp + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" | "c" + | "cpp" => app + .extract_for(&asset.media_type, &ctx, &bytes) + .with_context(|| format!("kb-app::extract_for (code:{code_lang})")), // p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes. "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { synthesize_tier2_document(asset, &bytes, code_lang, &parser_version) } // p10-3: shell reuses the same synthesizer. "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version), - // p10-1D: C + C++ AST extractors. - "c" => CAstExtractor::new() - .extract(&ctx, &bytes) - .context("kebab-parse-code::CAstExtractor::extract (code:c)"), - "cpp" => CppAstExtractor::new() - .extract(&ctx, &bytes) - .context("kebab-parse-code::CppAstExtractor::extract (code:cpp)"), other => anyhow::bail!("unreachable (extract): {other}"), };