Merge pull request 'refactor(app): extract dispatch polymorphism — App.extract_for(...) + 11 Extractor registry' (#187) from refactor/extractor-dispatch-unification into main

Reviewed-on: #187
This commit was merged in pull request #187.
This commit is contained in:
2026-05-26 21:07:20 +00:00
2 changed files with 202 additions and 45 deletions

View File

@@ -40,11 +40,18 @@ use anyhow::{Context, Result, anyhow};
use lru::LruCache;
use kebab_core::{
Answer, DocumentStore, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit,
SearchMode, SearchOpts, SearchQuery, VectorStore,
Answer, DocumentStore, Embedder, ExtractContext, Extractor, IndexVersion, LanguageModel,
MediaType, Retriever, SearchHit, SearchMode, SearchOpts, SearchQuery, VectorStore,
};
use kebab_embed_local::FastembedEmbedder;
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_code::{
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor,
JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor,
TypescriptAstExtractor,
};
use kebab_parse_image::ImageExtractor;
use kebab_parse_pdf::PdfTextExtractor;
use kebab_rag::{AskOpts, RagPipeline};
use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
use kebab_store_sqlite::SqliteStore;
@@ -115,6 +122,12 @@ pub fn short_query_hint(query_text: &str, hits_empty: bool) -> Option<String> {
pub struct App {
pub(crate) config: kebab_config::Config,
pub(crate) sqlite: Arc<SqliteStore>,
/// post-v0.18.0 extractor-dispatch-unification: polymorphic Extractor
/// registry. App init 시 1회 등록되어 `extract_for(...)` 가 lookup
/// 한다. 현재 11 entry (ImageExtractor + PdfTextExtractor + 9 AST).
/// MarkdownExtractor 는 별 PR 에서 추가 — markdown ingest path 는
/// 본 PR 에서 free-function 그대로 유지.
pub(crate) extractors: Vec<Box<dyn Extractor + Send + Sync>>,
/// Memoized embedder — built lazily on first `embedder()` call when
/// embeddings are enabled. `OnceLock` keeps the struct `Sync` and
/// the build path cold-only-once.
@@ -204,6 +217,25 @@ impl App {
// `None` (cache disabled — every search hits the retrievers).
let search_cache = NonZeroUsize::new(config.search.cache_capacity)
.map(|cap| Mutex::new(LruCache::new(cap)));
// post-v0.18.0 extractor-dispatch-unification: build the 11-entry
// Extractor registry. All entries are state-less unit structs with
// zero-cost `new()`, so init cost is effectively 0 and side effects
// are 0 — `pipeline_verifier` fallible `?` below may bail but the
// already-constructed `extractors` Vec drops without cost. Markdown
// is NOT registered (see field doc).
let extractors: Vec<Box<dyn Extractor + Send + Sync>> = vec![
Box::new(ImageExtractor::new()),
Box::new(PdfTextExtractor::new()),
Box::new(RustAstExtractor::new()),
Box::new(PythonAstExtractor::new()),
Box::new(TypescriptAstExtractor::new()),
Box::new(JavascriptAstExtractor::new()),
Box::new(GoAstExtractor::new()),
Box::new(JavaAstExtractor::new()),
Box::new(KotlinAstExtractor::new()),
Box::new(CAstExtractor::new()),
Box::new(CppAstExtractor::new()),
];
// p9-fb-41 PR-9c-2: build the NLI verifier when the gate is
// enabled. App carries it on `RagPipeline` via
// `with_verifier` so the rag crate doesn't have to know about
@@ -222,6 +254,7 @@ impl App {
Ok(Self {
config,
sqlite: Arc::new(sqlite),
extractors,
embedder: OnceLock::new(),
vector: OnceLock::new(),
llm: OnceLock::new(),
@@ -230,6 +263,30 @@ impl App {
})
}
/// Polymorphic dispatcher for the [`Extractor`] trait. Looks up the
/// first Extractor whose `supports(media)` returns true and invokes
/// `extract(ctx, bytes)` on it.
///
/// Errors with `anyhow!("no Extractor for media_type {media:?}")`
/// when no matching Extractor is registered. Callers in
/// `ingest_one_*_asset` reach this only after the outer 4-arm
/// dispatch (`MediaType::Markdown` / `Image` / `Pdf` / `Code(lang)`)
/// has matched, so a miss is a programming error — NOT a user-
/// facing skip.
pub(crate) fn extract_for(
&self,
media: &MediaType,
ctx: &ExtractContext<'_>,
bytes: &[u8],
) -> Result<kebab_core::CanonicalDocument> {
let extractor = self
.extractors
.iter()
.find(|e| e.supports(media))
.ok_or_else(|| anyhow!("no Extractor for media_type {media:?}"))?;
extractor.extract(ctx, bytes)
}
/// Run a [`SearchQuery`] through the configured retriever stack and
/// return the top-k hits. p9-fb-19: result is served from the
/// in-process LRU cache when the same `(query_norm, mode, k,
@@ -1157,3 +1214,128 @@ mod tests_trace {
assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
}
}
/// post-v0.18.0 extractor-dispatch-unification: in-crate unit tests for
/// the `App.extractors` registry + `App::extract_for` polymorphic
/// dispatch. In-crate (not `tests/`) because `extractors` + `extract_for`
/// are `pub(crate)` — integration tests cannot reach them.
///
/// Spec §5.1 + plan §2 Step 10 — 3 test class:
/// 1. registry length = 11 (image + pdf + 9 AST).
/// 2. mutually-exclusive `supports()` grid over 16 sample MediaTypes.
/// 3. `extract_for` returns `Err("no Extractor ...")` for registry-NOT-cover
/// MediaType (Audio).
#[cfg(test)]
mod tests_extractor_dispatch {
use super::*;
use kebab_core::{AudioType, ExtractConfig, ImageType};
/// helper: tempdir-isolated App for tests (mirrors `tests_trace`'s
/// `open_app_with_temp_dir` pattern).
fn open_app_with_temp_dir() -> (tempfile::TempDir, App) {
let dir = tempfile::tempdir().unwrap();
let mut cfg = kebab_config::Config::defaults();
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
// Bring up migrations.
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
store.run_migrations().unwrap();
drop(store);
let app = App::open_with_config(cfg).unwrap();
(dir, app)
}
/// Registry length invariant: 11 Extractor (image + pdf + 9 AST).
/// Markdown is NOT registered (free-function path — defer to a
/// separate PR per spec §3.4).
#[test]
fn registry_has_eleven_extractors() {
let (_dir, app) = open_app_with_temp_dir();
assert_eq!(
app.extractors.len(),
11,
"registry must hold 11 Extractors (image + pdf + 9 AST). \
markdown 은 별 PR."
);
}
/// 11 Extractor 의 `supports()` 가 16 sample MediaType 에 대해
/// mutually exclusive — 어떤 두 Extractor 도 동일 MediaType 에
/// 대해 true 반환 안 됨.
#[test]
fn supports_grid_is_mutually_exclusive() {
let (_dir, app) = open_app_with_temp_dir();
let samples = vec![
MediaType::Markdown,
MediaType::Pdf,
MediaType::Image(ImageType::Png),
MediaType::Image(ImageType::Jpeg),
MediaType::Code("rust".into()),
MediaType::Code("python".into()),
MediaType::Code("typescript".into()),
MediaType::Code("javascript".into()),
MediaType::Code("go".into()),
MediaType::Code("java".into()),
MediaType::Code("kotlin".into()),
MediaType::Code("c".into()),
MediaType::Code("cpp".into()),
MediaType::Code("yaml".into()), // registry NOT cover
MediaType::Code("shell".into()), // registry NOT cover
MediaType::Audio(AudioType::Wav), // registry NOT cover
];
for sample in &samples {
let hits: Vec<_> = app
.extractors
.iter()
.filter(|e| e.supports(sample))
.collect();
assert!(
hits.len() <= 1,
"mutually exclusive violated for {sample:?}: {} hits",
hits.len()
);
}
}
/// `extract_for` 가 registry NOT cover MediaType (Audio) 에 대해
/// `Err("no Extractor for media_type ...")` 반환. Audio MediaType
/// 사용으로 RawAsset 의 actual content 의존 회피 — registry NOT
/// cover → 즉시 Err.
#[test]
fn extract_for_unsupported_media_errors() {
let (_dir, app) = open_app_with_temp_dir();
// Minimal RawAsset. Actual content never read — Audio MediaType
// 는 registry NOT cover → `extract_for` 가 dispatch loop 안에서
// 바로 Err 반환. RawAsset field set 은 `crates/kebab-core/src/
// asset.rs:62-73` 와 정합 (8 field).
let asset = kebab_core::RawAsset {
asset_id: kebab_core::AssetId("00".repeat(16)),
source_uri: kebab_core::SourceUri::File("/tmp/dummy.wav".into()),
workspace_path: kebab_core::WorkspacePath("dummy.wav".to_string()),
media_type: MediaType::Audio(AudioType::Wav),
byte_len: 0,
checksum: kebab_core::Checksum("00".repeat(32)),
discovered_at: time::OffsetDateTime::now_utc(),
// AssetStorage::Inline 미존재 — actual variant `Copied { path }`
// 사용 (kebab-core/src/asset.rs:55-60).
stored: kebab_core::AssetStorage::Copied {
path: std::path::PathBuf::from("/tmp/dummy.wav"),
},
};
let workspace_root: std::path::PathBuf = std::path::PathBuf::from("/tmp");
let cfg = ExtractConfig::default();
let ctx = ExtractContext {
asset: &asset,
workspace_root: &workspace_root,
config: &cfg,
};
let result = app.extract_for(&MediaType::Audio(AudioType::Wav), &ctx, &[]);
assert!(result.is_err(), "Audio 는 registry 미포함 → Err 기대");
let err_msg = format!("{:#}", result.unwrap_err());
assert!(
err_msg.contains("no Extractor"),
"unexpected err: {err_msg}"
);
}
}

View File

@@ -43,14 +43,12 @@ use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, Co
use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType,
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_code::{CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor};
use kebab_parse_pdf::PdfTextExtractor;
use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -353,9 +351,7 @@ pub fn ingest_with_config_opts(
} else {
None
};
let image_extractor = ImageExtractor::new();
let image_pipeline = ImagePipeline {
extractor: &image_extractor,
ocr_engine: ocr_engine.as_ref(),
caption_llm: caption_llm.as_deref(),
};
@@ -758,7 +754,6 @@ type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
struct ImagePipeline<'a> {
extractor: &'a ImageExtractor,
ocr_engine: Option<&'a OllamaVisionOcr>,
caption_llm: Option<&'a dyn LanguageModel>,
}
@@ -1232,7 +1227,6 @@ fn ingest_one_image_asset(
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
) -> anyhow::Result<kebab_core::IngestItem> {
let image_extractor = image_pipeline.extractor;
let ocr_engine = image_pipeline.ocr_engine;
let caption_llm = image_pipeline.caption_llm;
let path = match &asset.source_uri {
@@ -1291,9 +1285,9 @@ fn ingest_one_image_asset(
workspace_root: &workspace_root,
config: &extract_config,
};
let mut canonical = image_extractor
.extract(&ctx, &bytes)
.context("kb-parse-image::ImageExtractor::extract")?;
let mut canonical = app
.extract_for(&asset.media_type, &ctx, &bytes)
.context("kb-app::extract_for (image)")?;
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
// Lenient — failure is captured into Provenance Warning,
@@ -1780,9 +1774,9 @@ fn ingest_one_pdf_asset(
workspace_root: &workspace_root,
config: &extract_config,
};
let mut canonical = PdfTextExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-pdf::PdfTextExtractor::extract")?;
let mut canonical = app
.extract_for(&asset.media_type, &ctx, &bytes)
.context("kb-app::extract_for (pdf)")?;
// Per-medium chunker selection: PDF docs always use pdf-page-v1
// regardless of `config.chunking.chunker_version`. The chunker
@@ -2007,43 +2001,24 @@ fn ingest_one_code_asset(
config: &extract_config,
};
// p10-1b Task D/G/J/L: extractor per-lang.
// post-v0.18.0 extractor-dispatch-unification:
// 9 AST lang 의 dispatch 가 polymorphic — App.extractors registry 의
// `*AstExtractor` entry 가 lang string 으로 disjoint `supports()` 비교
// 후 단일 hit. Tier 2 (manifest) + Tier 3 (shell) 은 free-function
// `synthesize_tier2_document` 유지 (Extractor impl 아님 — 별 PR).
// p10-3: capture Result so Tier 1 extractor errors can fall back to Tier 3.
let canonical_result: anyhow::Result<kebab_core::CanonicalDocument> = match code_lang {
"rust" => RustAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::RustAstExtractor::extract (code:rust)"),
"python" => PythonAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::PythonAstExtractor::extract (code:python)"),
"typescript" => TypescriptAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)"),
"javascript" => JavascriptAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)"),
"go" => GoAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::GoAstExtractor::extract (code:go)"),
"java" => JavaAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::JavaAstExtractor::extract (code:java)"),
"kotlin" => KotlinAstExtractor::new()
.extract(&ctx, &bytes)
.context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)"),
// 9 AST lang: rust / python / typescript / javascript / go / java / kotlin / c / cpp
"rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" | "c"
| "cpp" => app
.extract_for(&asset.media_type, &ctx, &bytes)
.with_context(|| format!("kb-app::extract_for (code:{code_lang})")),
// p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes.
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => {
synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)
}
// p10-3: shell reuses the same synthesizer.
"shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version),
// p10-1D: C + C++ AST extractors.
"c" => CAstExtractor::new()
.extract(&ctx, &bytes)
.context("kebab-parse-code::CAstExtractor::extract (code:c)"),
"cpp" => CppAstExtractor::new()
.extract(&ctx, &bytes)
.context("kebab-parse-code::CppAstExtractor::extract (code:cpp)"),
other => anyhow::bail!("unreachable (extract): {other}"),
};