refactor(config): v3 경로 call-site sweep (kebab-app/kebab-eval/kebab-parse-image)
부모 경로에 .ingest 삽입(leaf 구조체 불변). src + 테스트 call-site 전부. kebab-cli 테스트의 v2 TOML fixture 는 from_file 자동변환(T6) 경로 검증용으로 유지.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -924,7 +924,7 @@ impl App {
|
||||
k: u32::try_from(query.k).unwrap_or(u32::MAX),
|
||||
snippet_chars: u32::try_from(self.config.search.snippet_chars).unwrap_or(u32::MAX),
|
||||
embedding_version,
|
||||
chunker_version: self.config.chunking.chunker_version.clone(),
|
||||
chunker_version: self.config.ingest.chunking.chunker_version.clone(),
|
||||
corpus_revision: self.sqlite.corpus_revision(),
|
||||
})
|
||||
}
|
||||
@@ -1025,7 +1025,7 @@ impl App {
|
||||
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!(
|
||||
"lex:{}:fts5-v009-korean-morphological",
|
||||
config.chunking.chunker_version
|
||||
config.ingest.chunking.chunker_version
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -360,12 +360,12 @@ pub fn ingest_with_config_opts(
|
||||
// loop is correct and cheap. Construction failure (e.g. invalid
|
||||
// endpoint) aborts ingest fail-fast — better than silently disabling
|
||||
// OCR/caption mid-run.
|
||||
let ocr_engine: Option<Box<dyn OcrEngine>> = if app.config.image.ocr.enabled {
|
||||
let ocr_engine: Option<Box<dyn OcrEngine>> = if app.config.ingest.image.ocr.enabled {
|
||||
Some(build_image_ocr_engine(&app.config).context("kb-app::ingest: build image OCR engine")?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
|
||||
let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.ingest.image.caption.enabled {
|
||||
Some(Box::new(OllamaLanguageModel::new(&app.config).context(
|
||||
"kb-app::ingest: build OllamaLanguageModel for caption",
|
||||
)?))
|
||||
@@ -380,7 +380,7 @@ pub fn ingest_with_config_opts(
|
||||
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
|
||||
// image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast.
|
||||
let pdf_ocr_engine: Option<Box<dyn OcrEngine>> =
|
||||
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
|
||||
if app.config.ingest.pdf.ocr.enabled || app.config.ingest.pdf.ocr.always_on {
|
||||
Some(
|
||||
build_pdf_ocr_engine(&app.config)
|
||||
.context("kb-app::ingest: build pdf OCR engine")?,
|
||||
@@ -825,7 +825,7 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
|
||||
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// v0.27.0 (T8): build the image OCR engine selected by
|
||||
/// `config.image.ocr.engine`. Returns a boxed trait object so the ingest
|
||||
/// `config.ingest.image.ocr.engine`. Returns a boxed trait object so the ingest
|
||||
/// pipeline is engine-agnostic. Construction is fail-fast (model load /
|
||||
/// hash / endpoint validation) — mirrors the prior concrete-type behaviour.
|
||||
///
|
||||
@@ -835,7 +835,7 @@ type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
|
||||
fn build_image_ocr_engine(
|
||||
config: &kebab_config::Config,
|
||||
) -> anyhow::Result<Box<dyn OcrEngine>> {
|
||||
match config.image.ocr.engine.as_str() {
|
||||
match config.ingest.image.ocr.engine.as_str() {
|
||||
OLLAMA_VISION_ENGINE => Ok(Box::new(
|
||||
OllamaVisionOcr::new(config).context("build OllamaVisionOcr")?,
|
||||
)),
|
||||
@@ -850,7 +850,7 @@ fn build_image_ocr_engine(
|
||||
}
|
||||
|
||||
/// v0.27.0 (T8): build the PDF OCR engine selected by
|
||||
/// `config.pdf.ocr.engine`. The ollama-vision arm uses the PDF-specific
|
||||
/// `config.ingest.pdf.ocr.engine`. The ollama-vision arm uses the PDF-specific
|
||||
/// `model` / `languages` / `max_pixels` / `request_timeout_secs` knobs (and
|
||||
/// endpoint fallback to `models.llm.endpoint`). The paddle-onnx arm shares
|
||||
/// the same bundled ONNX models as image OCR (resolved from `image.ocr`
|
||||
@@ -869,9 +869,9 @@ fn build_image_ocr_engine(
|
||||
fn build_pdf_ocr_engine(
|
||||
config: &kebab_config::Config,
|
||||
) -> anyhow::Result<Box<dyn OcrEngine>> {
|
||||
match config.pdf.ocr.engine.as_str() {
|
||||
match config.ingest.pdf.ocr.engine.as_str() {
|
||||
OLLAMA_VISION_ENGINE => {
|
||||
let cfg = &config.pdf.ocr;
|
||||
let cfg = &config.ingest.pdf.ocr;
|
||||
let endpoint = match cfg.endpoint.as_deref() {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => config.models.llm.endpoint.clone(),
|
||||
@@ -2144,7 +2144,7 @@ fn sweep_deleted_files(
|
||||
/// asset rollback on embed-fail is a P+ task).
|
||||
///
|
||||
/// `chunker_version` is hard-coded to `pdf-page-v1` (HOTFIXES entry —
|
||||
/// `config.chunking.chunker_version` is single-valued today and serves
|
||||
/// `config.ingest.chunking.chunker_version` is single-valued today and serves
|
||||
/// the markdown path; per-medium config split is a P+ chunker registry
|
||||
/// task).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -2229,15 +2229,15 @@ fn ingest_one_pdf_asset(
|
||||
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
|
||||
// dispatch invariant 보존 — extract_for 가 normal entry).
|
||||
let (pdf_ocr_pages, pdf_ocr_ms_total): (Option<u32>, Option<u64>) =
|
||||
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
|
||||
if app.config.ingest.pdf.ocr.enabled || app.config.ingest.pdf.ocr.always_on {
|
||||
match pdf_ocr_engine {
|
||||
Some(engine) => {
|
||||
let ocr_opts = crate::pdf_ocr_apply::PdfOcrOpts {
|
||||
enabled: app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on,
|
||||
always_on: app.config.pdf.ocr.always_on,
|
||||
valid_ratio_threshold: app.config.pdf.ocr.valid_ratio_threshold,
|
||||
min_char_count: app.config.pdf.ocr.min_char_count,
|
||||
lang_hint: app.config.pdf.ocr.lang_hint.clone().map(kebab_core::Lang),
|
||||
enabled: app.config.ingest.pdf.ocr.enabled || app.config.ingest.pdf.ocr.always_on,
|
||||
always_on: app.config.ingest.pdf.ocr.always_on,
|
||||
valid_ratio_threshold: app.config.ingest.pdf.ocr.valid_ratio_threshold,
|
||||
min_char_count: app.config.ingest.pdf.ocr.min_char_count,
|
||||
lang_hint: app.config.ingest.pdf.ocr.lang_hint.clone().map(kebab_core::Lang),
|
||||
cancel: cancel.cloned(),
|
||||
};
|
||||
// v0.20.x Hook 2: pre-clone Arcs for capture by OCR closure.
|
||||
@@ -2356,7 +2356,7 @@ fn ingest_one_pdf_asset(
|
||||
};
|
||||
|
||||
// Per-medium chunker selection: PDF docs always use pdf-page-v1
|
||||
// regardless of `config.chunking.chunker_version`. The chunker
|
||||
// regardless of `config.ingest.chunking.chunker_version`. The chunker
|
||||
// validates every block carries `SourceSpan::Page`; failure here
|
||||
// means the parser drifted from its contract.
|
||||
let chunker = PdfPageV1Chunker;
|
||||
@@ -3056,10 +3056,10 @@ fn build_body_hints(asset: &RawAsset) -> BodyHints {
|
||||
/// Build a `ChunkPolicy` from the active config.
|
||||
fn chunk_policy_from_config(config: &kebab_config::Config) -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: config.chunking.target_tokens,
|
||||
overlap_tokens: config.chunking.overlap_tokens,
|
||||
respect_markdown_headings: config.chunking.respect_markdown_headings,
|
||||
chunker_version: ChunkerVersion(config.chunking.chunker_version.clone()),
|
||||
target_tokens: config.ingest.chunking.target_tokens,
|
||||
overlap_tokens: config.ingest.chunking.overlap_tokens,
|
||||
respect_markdown_headings: config.ingest.chunking.respect_markdown_headings,
|
||||
chunker_version: ChunkerVersion(config.ingest.chunking.chunker_version.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3099,7 +3099,7 @@ fn ocr_engine_version_for_sig(config: &kebab_config::Config, engine: &str, model
|
||||
// stable per-model revision, so engine/model is the identity.
|
||||
return format!("ollama/{model}");
|
||||
}
|
||||
let ocr = &config.image.ocr;
|
||||
let ocr = &config.ingest.image.ocr;
|
||||
let key = format!(
|
||||
"{}|{}|{}",
|
||||
ocr.det_model.as_deref().unwrap_or("<bundled>"),
|
||||
@@ -3130,7 +3130,7 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
// Common (every media type): chunking parameters that move chunk
|
||||
// boundaries. `target_tokens` / `overlap_tokens` change re-chunking for
|
||||
// markdown / image / pdf / code alike, so a change re-indexes all types.
|
||||
let c = &config.chunking;
|
||||
let c = &config.ingest.chunking;
|
||||
let mut sig = format!(
|
||||
"chunk:{}:{}:{}:{}",
|
||||
c.target_tokens, c.overlap_tokens, c.respect_markdown_headings, c.chunker_version
|
||||
@@ -3140,7 +3140,7 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
// OCR / caption only affect output when their `enabled` flag is
|
||||
// on; the model / prompt version matters only then. Off ↔ off is
|
||||
// a stable empty token so re-running the same config skips.
|
||||
let ocr = &config.image.ocr;
|
||||
let ocr = &config.ingest.image.ocr;
|
||||
if ocr.enabled {
|
||||
// v0.27.0 (T9): engine + engine_version so switching engine
|
||||
// (ollama-vision ↔ paddle-onnx) OR changing the model/assets
|
||||
@@ -3153,7 +3153,7 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
} else {
|
||||
sig.push_str("|ocr:0");
|
||||
}
|
||||
let cap = &config.image.caption;
|
||||
let cap = &config.ingest.image.caption;
|
||||
if cap.enabled {
|
||||
sig.push_str(&format!("|cap:1:{}", cap.prompt_template_version));
|
||||
} else {
|
||||
@@ -3163,7 +3163,7 @@ fn ingest_config_signature(config: &kebab_config::Config, media: &MediaType) ->
|
||||
MediaType::Pdf => {
|
||||
// PDF OCR is active when EITHER `enabled` or `always_on` is set
|
||||
// (mirrors the ingest gate). `model` only matters when active.
|
||||
let ocr = &config.pdf.ocr;
|
||||
let ocr = &config.ingest.pdf.ocr;
|
||||
if ocr.enabled || ocr.always_on {
|
||||
// v0.27.0 (T9): engine + engine_version (same cascade rule as
|
||||
// image OCR above) alongside the enabled/always_on gate.
|
||||
|
||||
@@ -205,7 +205,7 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
chunker_version: cfg.ingest.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
|
||||
@@ -62,8 +62,8 @@ impl TestEnv {
|
||||
// Drop in a small chunk policy so the fixture's small files
|
||||
// emit at least a couple of chunks even with overlap_tokens
|
||||
// honored.
|
||||
config.chunking.target_tokens = 80;
|
||||
config.chunking.overlap_tokens = 20;
|
||||
config.ingest.chunking.target_tokens = 80;
|
||||
config.ingest.chunking.overlap_tokens = 20;
|
||||
|
||||
Self {
|
||||
temp,
|
||||
|
||||
@@ -63,7 +63,7 @@ fn chunking_change_reindexes_all_types() {
|
||||
let scanned = first.scanned;
|
||||
|
||||
// Bump target_tokens — folds into every type's signature.
|
||||
env.config.chunking.target_tokens += 100;
|
||||
env.config.ingest.chunking.target_tokens += 100;
|
||||
|
||||
let second = reingest(&env);
|
||||
assert_eq!(second.scanned, scanned);
|
||||
|
||||
@@ -34,11 +34,11 @@ fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = true;
|
||||
cfg.image.ocr.endpoint = Some(mock_endpoint.to_string());
|
||||
cfg.image.ocr.model = "vision-mock:1b".to_string();
|
||||
cfg.image.ocr.max_pixels = 512;
|
||||
cfg.image.caption.enabled = false; // tested separately below
|
||||
cfg.ingest.image.ocr.enabled = true;
|
||||
cfg.ingest.image.ocr.endpoint = Some(mock_endpoint.to_string());
|
||||
cfg.ingest.image.ocr.model = "vision-mock:1b".to_string();
|
||||
cfg.ingest.image.ocr.max_pixels = 512;
|
||||
cfg.ingest.image.caption.enabled = false; // tested separately below
|
||||
cfg.models.llm.endpoint = mock_endpoint.to_string();
|
||||
cfg.models.llm.model = "vision-mock:1b".to_string();
|
||||
cfg
|
||||
@@ -161,8 +161,8 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
let env = TestEnv::lexical_only();
|
||||
write_red_png(&env.workspace_root, "diagram.png");
|
||||
let mut cfg = cfg_with_image_pipeline(&env, &server.uri());
|
||||
cfg.image.caption.enabled = true;
|
||||
cfg.image.caption.max_pixels = 384;
|
||||
cfg.ingest.image.caption.enabled = true;
|
||||
cfg.ingest.image.caption.max_pixels = 384;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
@@ -270,8 +270,8 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
@@ -334,8 +334,8 @@ async fn garbage_png_increments_errors_counter_exactly_once() {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
|
||||
@@ -23,8 +23,8 @@ fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Con
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg.ingest.chunking.target_tokens = 80;
|
||||
cfg.ingest.chunking.overlap_tokens = 20;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
|
||||
@@ -22,8 +22,8 @@ fn ollama_endpoint() -> String {
|
||||
|
||||
fn make_ocr_env_real() -> TestEnv {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.ingest.pdf.ocr.enabled = true;
|
||||
env.config.ingest.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.models.embedding.provider = "none".to_string();
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
@@ -92,8 +92,8 @@ fn ocr_text_indexed_and_searchable() {
|
||||
#[test]
|
||||
fn ingest_with_cancel_aborts_mid_pdf() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some("http://127.0.0.1:1".to_string());
|
||||
env.config.ingest.pdf.ocr.enabled = true;
|
||||
env.config.ingest.pdf.ocr.endpoint = Some("http://127.0.0.1:1".to_string());
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
|
||||
@@ -196,9 +196,9 @@ fn pdf_ocr_progress_emits_started_finished_events() {
|
||||
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
config.pdf.ocr.enabled = true;
|
||||
config.ingest.pdf.ocr.enabled = true;
|
||||
if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
|
||||
config.pdf.ocr.endpoint = Some(endpoint);
|
||||
config.ingest.pdf.ocr.endpoint = Some(endpoint);
|
||||
}
|
||||
|
||||
let scope = kebab_core::SourceScope {
|
||||
|
||||
@@ -49,9 +49,9 @@ async fn ingest_dual_write_doc_id_matches_ndjson() {
|
||||
let result = spawn_blocking(move || {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
// Enable PDF OCR + set up mock endpoint
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(mock_url.clone());
|
||||
env.config.pdf.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
env.config.ingest.pdf.ocr.enabled = true;
|
||||
env.config.ingest.pdf.ocr.endpoint = Some(mock_url.clone());
|
||||
env.config.ingest.pdf.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
// Enable ingest log
|
||||
let log_dir = env.temp.path().join("logs");
|
||||
std::fs::create_dir_all(&log_dir).unwrap();
|
||||
|
||||
@@ -121,8 +121,8 @@ fn cfg_with_pdf(env: &TestEnv) -> Config {
|
||||
// PDF ingest does not need OCR / caption / LM — leave defaults
|
||||
// (ocr.enabled=false, caption.enabled=false). The image pipeline
|
||||
// construction step skips both adapters.
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
cfg
|
||||
}
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg.ingest.chunking.target_tokens = 80;
|
||||
cfg.ingest.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
|
||||
@@ -14,8 +14,8 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
config.chunking.target_tokens = 80;
|
||||
config.chunking.overlap_tokens = 20;
|
||||
config.ingest.chunking.target_tokens = 80;
|
||||
config.ingest.chunking.overlap_tokens = 20;
|
||||
config
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user