feat(kebab-app): P6-4 image ingest wiring — kebab ingest 가 PNG/JPEG 처리 #36

Merged
altair823 merged 3 commits from feat/p6-4-image-ingest-wiring into main 2026-05-02 08:22:27 +00:00
7 changed files with 917 additions and 33 deletions

4
Cargo.lock generated
View File

@@ -3397,6 +3397,7 @@ dependencies = [
"anyhow",
"blake3",
"dirs 5.0.1",
"image",
"kebab-chunk",
"kebab-config",
"kebab-core",
@@ -3405,6 +3406,7 @@ dependencies = [
"kebab-llm",
"kebab-llm-local",
"kebab-normalize",
"kebab-parse-image",
"kebab-parse-md",
"kebab-parse-types",
"kebab-rag",
@@ -3417,10 +3419,12 @@ dependencies = [
"serde_json",
"tempfile",
"time",
"tokio",
"toml",
"tracing",
"tracing-appender",
"tracing-subscriber",
"wiremock",
]
[[package]]

View File

@@ -23,6 +23,11 @@ kebab-embed-local = { path = "../kebab-embed-local" }
kebab-llm = { path = "../kebab-llm" }
kebab-llm-local = { path = "../kebab-llm-local" }
kebab-rag = { path = "../kebab-rag" }
# P6-4: image extractor + OCR + caption adapters live here. App
# threads them into the per-asset dispatch (see `ingest_one_asset`
# image branch). Trait-only consumption — no `kebab-parse-image`
# internals leak into kb-app code.
kebab-parse-image = { path = "../kebab-parse-image" }
anyhow = { workspace = true }
blake3 = { workspace = true }
serde = { workspace = true }
@@ -37,3 +42,9 @@ dirs = "5"
[dev-dependencies]
rusqlite = { workspace = true }
tempfile = { workspace = true }
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
# / caption HTTP calls. Async runtime to host the mock server only;
# the kb-app code under test stays sync.
wiremock = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread"] }
image = { version = "0.25", default-features = false, features = ["png"] }

View File

@@ -41,12 +41,15 @@ use serde::{Deserialize, Serialize};
use kebab_chunk::MdHeadingV1Chunker;
use kebab_core::{
Answer, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, IngestReport, ParserVersion, RawAsset, SearchHit, SearchQuery,
SourceConnector, SourceScope, SourceUri, VectorRecord, VectorStore,
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceConnector, SourceScope,
SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_normalize::build_canonical_document;
use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -190,6 +193,35 @@ pub fn ingest_with_config(
let parser_version = ParserVersion(KEBAB_PARSE_MD_VERSION.to_string());
let chunk_policy = chunk_policy_from_config(&app.config);
// P6-4: build OCR / caption adapters once per ingest invocation,
// gated on their respective `enabled` flags. `reqwest::blocking::Client`
// is internally Arc-shared so reusing one instance across the asset
// loop is correct and cheap. Construction failure (e.g. invalid
// endpoint) aborts ingest fail-fast — better than silently disabling
// OCR/caption mid-run.
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
Some(
OllamaVisionOcr::new(&app.config)
.context("kb-app::ingest: build OllamaVisionOcr")?,
)
} else {
None
};
let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
Some(Box::new(
OllamaLanguageModel::new(&app.config)
.context("kb-app::ingest: build OllamaLanguageModel for caption")?,
))
} else {
None
};
let image_extractor = ImageExtractor::new();
let image_pipeline = ImagePipeline {
extractor: &image_extractor,
ocr_engine: ocr_engine.as_ref(),
caption_llm: caption_llm.as_deref(),
};
// Pre-load every existing doc_id so we can label `IngestItem.kind`
// as `New` vs `Updated` correctly. `list_documents` returns one
// row per `(workspace_path, asset_id)` — index by the deterministic
@@ -230,6 +262,7 @@ pub fn ingest_with_config(
embedder.as_ref(),
vector_store.as_ref(),
&existing_doc_ids,
&image_pipeline,
);
let item = match item {
@@ -241,7 +274,12 @@ pub fn ingest_with_config(
error = %e,
"kb-app::ingest: per-file fatal"
);
error_count = error_count.saturating_add(1);
// Note: `error_count += 1` happens below in the

(칭찬) 사용자가 요청한 "다중 이미지 + 손상 PNG" 수동 검증 중 발견된 IngestReport.errors double-count 결함을 같은 PR 안에서 fix + 회귀 테스트화 했습니다. P6-4 본 task scope 외였지만 image dispatch 가 Err 분기를 처음으로 자주 trigger 시키면서 표면화한 기존 결함이라 같은 PR 에서 처리하는 게 정직합니다. 회귀 테스트의 scanned == new + updated + skipped + errors invariant 검증은 향후 카운터 컨벤션이 다시 깨지는 걸 방지합니다.

(칭찬) 사용자가 요청한 "다중 이미지 + 손상 PNG" 수동 검증 중 발견된 `IngestReport.errors` double-count 결함을 같은 PR 안에서 fix + 회귀 테스트화 했습니다. P6-4 본 task scope 외였지만 image dispatch 가 `Err` 분기를 처음으로 자주 trigger 시키면서 표면화한 기존 결함이라 같은 PR 에서 처리하는 게 정직합니다. 회귀 테스트의 `scanned == new + updated + skipped + errors` invariant 검증은 향후 카운터 컨벤션이 다시 깨지는 걸 방지합니다.
// `match item.kind { Error => ... }` arm — incrementing
// here too would double-count (a regression first
// surfaced by P6-4 image dispatch where Err returns
// are common; markdown rarely propagated Err so the
// bug went unnoticed).
kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Error,
doc_id: None,
@@ -434,10 +472,20 @@ fn mint_ingest_run_id(scope_json: &str, at: time::OffsetDateTime) -> String {
/// `<… as JobRepo>` to be explicit.
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;
/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.

doc-comment 가 잘못 합쳐졌습니다. line 470-473 의 4 줄 ("Process a single asset: read bytes, parse, normalize, chunk, persist, embed...") 는 원래 fn ingest_one_asset 의 doc 인데, 그 사이에 새 ImagePipeline struct 를 끼워 넣으면서 빈 줄 없이 line 474-476 의 struct doc 과 이어졌습니다. 결과적으로 rustdoc 은 두 doc-comment 를 합쳐서 ImagePipeline 의 문서로 인식하고, ingest_one_asset 자체는 doc 이 없는 상태가 됩니다.

수정:

type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;

/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
struct ImagePipeline<'a> { ... }

/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
#[allow(clippy::too_many_arguments)]
fn ingest_one_asset(...)

원래 doc 을 fn 위치로 되돌리고 struct 의 P6-4 doc 만 struct 위에 두는 형태.

doc-comment 가 잘못 합쳐졌습니다. line 470-473 의 4 줄 ("Process a single asset: read bytes, parse, normalize, chunk, persist, embed...") 는 원래 `fn ingest_one_asset` 의 doc 인데, 그 사이에 새 `ImagePipeline` struct 를 끼워 넣으면서 빈 줄 없이 line 474-476 의 struct doc 과 이어졌습니다. 결과적으로 rustdoc 은 두 doc-comment 를 합쳐서 `ImagePipeline` 의 문서로 인식하고, `ingest_one_asset` 자체는 doc 이 없는 상태가 됩니다.

수정:

```rust
type SqliteStoreAlias = kebab_store_sqlite::SqliteStore;

/// P6-4: borrowed bundle of the three image-pipeline components built
/// once per ingest invocation. Threaded through `ingest_one_asset` so
/// the dispatch does not need ten separate parameters.
struct ImagePipeline<'a> { ... }

/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
#[allow(clippy::too_many_arguments)]
fn ingest_one_asset(...)
```

원래 doc 을 fn 위치로 되돌리고 struct 의 P6-4 doc 만 struct 위에 두는 형태.
struct ImagePipeline<'a> {
/// Image extractor borrowed from the ingest invocation; the image
/// branch calls `extract(&ctx, &bytes)` on it to build the canonical
/// document.
extractor: &'a ImageExtractor,
/// OCR adapter. `Some` only when `image.ocr.enabled` is set in the
/// config; when `None` the OCR step is skipped for every image asset.
ocr_engine: Option<&'a OllamaVisionOcr>,
/// Caption language model. `Some` only when `image.caption.enabled`
/// is set in the config; when `None` captioning is skipped.
caption_llm: Option<&'a dyn LanguageModel>,
}
/// Process a single asset: read bytes, parse, normalize, chunk,
/// persist, embed. Per-asset failures bubble up to the caller for
/// labelling as `IngestItemKind::Error` — they do NOT abort the
/// whole run.
#[allow(clippy::too_many_arguments)]
fn ingest_one_asset(
app: &App,
asset: &RawAsset,
@@ -446,27 +494,47 @@ fn ingest_one_asset(
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
image_pipeline: &ImagePipeline<'_>,
) -> anyhow::Result<kebab_core::IngestItem> {
tracing::debug!(
target: "kebab-app::ingest",
path = %asset.workspace_path.0,
media_type = ?asset.media_type,
"processing asset"
);
// Only handle Markdown for now; other media types are P6+ work.
if asset.media_type != kebab_core::MediaType::Markdown {
return Ok(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Skipped,
doc_id: None,
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: None,
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: Vec::new(),
error: None,
});
// P6-4: dispatch on media_type. Markdown takes the existing
// parse-md / normalize path; image takes the new
// ImageExtractor + (optional) OCR + (optional) caption path.
// Anything else (PDF, audio, unknown) is skipped — the
// respective phases (P7 / P8) wire them in later.
match &asset.media_type {
MediaType::Markdown => { /* fall through to markdown path */ }
MediaType::Image(_) => {
return ingest_one_image_asset(
app,
asset,
chunk_policy,
embedder,
vector_store,
existing_doc_ids,
image_pipeline,
);
}
_ => {
return Ok(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Skipped,
doc_id: None,
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: None,
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: Vec::new(),
error: None,
});
}
}
let path = match &asset.source_uri {
@@ -612,6 +680,277 @@ fn ingest_one_asset(
})
}
/// P6-4: process one `MediaType::Image(_)` asset end-to-end.
///
/// Pipeline: read bytes → `ImageExtractor::extract` → optional
/// `apply_ocr` → optional `apply_caption` → existing chunker / embedder
/// / store path (the same one markdown uses, which already handles
/// `Block::ImageRef` per P1-5).
///
/// Failure semantics (per P6-4 spec):
/// - `ImageExtractor::extract` Err → propagate (caller increments
/// `errors`).
/// - OCR / caption Err → log + `Provenance::Warning` event, continue.
/// `block.ocr` / `block.caption` stay `None`. `errors` NOT incremented.
#[allow(clippy::too_many_arguments)]
fn ingest_one_image_asset(
app: &App,
asset: &RawAsset,
chunk_policy: &ChunkPolicy,
embedder: Option<&Arc<dyn Embedder + Send + Sync>>,
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
image_pipeline: &ImagePipeline<'_>,
) -> anyhow::Result<kebab_core::IngestItem> {
let image_extractor = image_pipeline.extractor;
let ocr_engine = image_pipeline.ocr_engine;
let caption_llm = image_pipeline.caption_llm;
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
SourceUri::Kb(_) => {
return Ok(kebab_core::IngestItem {
kind: kebab_core::IngestItemKind::Skipped,
doc_id: None,
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: None,
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: vec![
"kb:// source URIs are not supported by the fs ingester".into(),
],
error: None,
});
}
};
let bytes = std::fs::read(&path)
.with_context(|| format!("read image asset bytes from {}", path.display()))?;
// 1. Decode + EXIF + dimensions. ExtractContext.config carries
// nothing the image extractor reads today; we pass a default
// instance per the trait shape.
let extract_config = kebab_core::ExtractConfig::default();
let workspace_root = std::path::PathBuf::from(&app.config.workspace.root);
let ctx = ExtractContext {
asset,
workspace_root: &workspace_root,
config: &extract_config,
};
let mut canonical = image_extractor
.extract(&ctx, &bytes)
.context("kb-parse-image::ImageExtractor::extract")?;
// 2 + 3. Apply OCR / caption when their adapters exist. Both are
// Lenient — failure is captured into Provenance Warning,
// `block.ocr` / `block.caption` stay `None`. P6-4 spec
// explicitly: such partial failures do NOT increment the
// `errors` counter.
//

if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut() 가 None 이거나 다른 Block variant 면 silent skip — Provenance event 도 warning 도 안 남기고 OCR/caption 단계가 통째로 무력화됩니다.

P6-1 contract 가 "image document 는 항상 단일 ImageRef block" 이라 현재 production 에선 unreachable 이지만, 미래 task 가 multi-block image document 를 도입하면 이 guard 가 silent failure 로 바뀝니다. 최소한 else 분기에서 tracing::warn! 또는 panic-with-context 를 권장합니다:

match canonical.blocks.first_mut() {
    Some(Block::ImageRef(block)) => { /* OCR + caption */ }
    other => {
        tracing::warn!(
            target: "kebab-app",
            path = %asset.workspace_path.0,
            blocks = canonical.blocks.len(),
            first_kind = ?other.map(|b| std::mem::discriminant(b)),
            "image document missing ImageRef block — OCR/caption skipped"
        );
    }
}

Provenance Warning 까지 추가하면 운영에서 잡기 더 쉽습니다.

`if let Some(Block::ImageRef(block)) = canonical.blocks.first_mut()` 가 None 이거나 다른 `Block` variant 면 silent skip — Provenance event 도 warning 도 안 남기고 OCR/caption 단계가 통째로 무력화됩니다.

P6-1 contract 가 "image document 는 항상 단일 ImageRef block" 이라 현재 production 에선 unreachable 이지만, 미래 task 가 multi-block image document 를 도입하면 이 guard 가 silent failure 로 바뀝니다. 최소한 `else` 분기에서 `tracing::warn!` 또는 panic-with-context 를 권장합니다:

```rust
match canonical.blocks.first_mut() {
    Some(Block::ImageRef(block)) => { /* OCR + caption */ }
    other => {
        tracing::warn!(
            target: "kebab-app",
            path = %asset.workspace_path.0,
            blocks = canonical.blocks.len(),
            first_kind = ?other.map(|b| std::mem::discriminant(b)),
            "image document missing ImageRef block — OCR/caption skipped"
        );
    }
}
```

Provenance Warning 까지 추가하면 운영에서 잡기 더 쉽습니다.
// Determinism stress (per spec Risks): the per-document
// Provenance timestamps for any analysis-stage Warning
// events share a single `now_utc()` reading taken once
// here, mirroring `kb-normalize::build_canonical_document`.
let lang_hint = lang_hint_from_doc(&canonical);
let now = time::OffsetDateTime::now_utc();
let mut warning_notes: Vec<String> = Vec::new();
match canonical.blocks.first_mut() {
Some(Block::ImageRef(block)) => {
if let Some(engine) = ocr_engine
&& let Err(e) = apply_ocr(
engine,
&bytes,
block,
lang_hint.as_ref(),

OCR 실패 분기와 caption 실패 분기 (line 754-773 + 776-799) 가 거의 동일 boilerplate 입니다 — variable 이름 (ocr_failed: vs caption_failed:) 만 다르고, 나머지 (tracing::warn / push ProvenanceEvent / push warning_notes) 는 동일.

중복 ~25줄을 헬퍼로 뽑으면 두 호출이 한 줄로 줄고, 미래에 "Warning event 에 다른 필드 추가" 같은 변경이 한 곳만 손보면 끝납니다:

fn record_image_analysis_failure(
    asset: &RawAsset,
    canonical: &mut CanonicalDocument,
    warning_notes: &mut Vec<String>,
    stage: &str,                        // "ocr_failed" / "caption_failed"
    err: anyhow::Error,
) {
    let note = format!("{stage}: {err:#}");
    tracing::warn!(
        target: "kebab-app",
        path = %asset.workspace_path.0,
        "{}",
        note
    );
    canonical.provenance.events.push(ProvenanceEvent {
        at: OffsetDateTime::now_utc(),
        agent: "kb-app".to_string(),
        kind: ProvenanceKind::Warning,
        note: Some(note.clone()),
    });
    warning_notes.push(note);
}

호출부:

if let Err(e) = apply_ocr(...) {
    record_image_analysis_failure(asset, &mut canonical, &mut warning_notes, "ocr_failed", e);
}
OCR 실패 분기와 caption 실패 분기 (line 754-773 + 776-799) 가 거의 동일 boilerplate 입니다 — variable 이름 (`ocr_failed:` vs `caption_failed:`) 만 다르고, 나머지 (tracing::warn / push ProvenanceEvent / push warning_notes) 는 동일. 중복 ~25줄을 헬퍼로 뽑으면 두 호출이 한 줄로 줄고, 미래에 "Warning event 에 다른 필드 추가" 같은 변경이 한 곳만 손보면 끝납니다: ```rust fn record_image_analysis_failure( asset: &RawAsset, canonical: &mut CanonicalDocument, warning_notes: &mut Vec<String>, stage: &str, // "ocr_failed" / "caption_failed" err: anyhow::Error, ) { let note = format!("{stage}: {err:#}"); tracing::warn!( target: "kebab-app", path = %asset.workspace_path.0, "{}", note ); canonical.provenance.events.push(ProvenanceEvent { at: OffsetDateTime::now_utc(), agent: "kb-app".to_string(), kind: ProvenanceKind::Warning, note: Some(note.clone()), }); warning_notes.push(note); } ``` 호출부: ```rust if let Err(e) = apply_ocr(...) { record_image_analysis_failure(asset, &mut canonical, &mut warning_notes, "ocr_failed", e); } ```
&mut canonical.provenance.events,
)

OCR 실패 분기와 caption 실패 분기가 각각 자기 time::OffsetDateTime::now_utc() 를 호출합니다 (line 767, 792). spec p6-4 의 Risks/notes 섹션 "Determinism stress" 항목이 명시한 대로, P6-1 의 ImageExtractor::extract 가 이미 Discovered + Parsed 두 이벤트에 단일 now() 를 공유하므로, OCR/caption Warning 이벤트도 같은 처리를 해야 "한 자산 안의 Provenance timestamp 들이 자연스럽게 같이 묶임" 이 됩니다.

fn 진입 시 한 번 let now = OffsetDateTime::now_utc(); 캐시 후 두 분기에서 재사용 권장 (위에서 제안한 헬퍼 함수와 결합하면 자연스럽게 한 곳).

spec 본문이 명시적으로 "this task's wiring must not introduce a second now() call between extract and apply_ocr/caption" 라고 했는데 두 번 호출되고 있어 spec 약속과 어긋납니다.

OCR 실패 분기와 caption 실패 분기가 각각 자기 `time::OffsetDateTime::now_utc()` 를 호출합니다 (line 767, 792). spec p6-4 의 Risks/notes 섹션 "Determinism stress" 항목이 명시한 대로, P6-1 의 `ImageExtractor::extract` 가 이미 `Discovered` + `Parsed` 두 이벤트에 단일 now() 를 공유하므로, OCR/caption Warning 이벤트도 같은 처리를 해야 "한 자산 안의 Provenance timestamp 들이 자연스럽게 같이 묶임" 이 됩니다. fn 진입 시 한 번 `let now = OffsetDateTime::now_utc();` 캐시 후 두 분기에서 재사용 권장 (위에서 제안한 헬퍼 함수와 결합하면 자연스럽게 한 곳). spec 본문이 명시적으로 "this task's wiring must not introduce a second `now()` call between extract and apply_ocr/caption" 라고 했는데 두 번 호출되고 있어 spec 약속과 어긋납니다.
{
record_image_analysis_failure(
asset,
&mut canonical.provenance.events,
&mut warning_notes,

이미지 경로의 IngestItem.warnings 형식 ("ocr_failed: <err>", "caption_failed: <err>") 이 markdown 경로의 형식 (format!("{:?}: {}", w.kind, w.note) — 예: "MalformedFrontmatter: missing closing fence") 과 다릅니다. 같은 wire 필드를 두 갈래의 다른 컨벤션이 채우면 downstream consumer (예: kebab inspect doc --json 의 reader) 가 "warnings 안의 prefix 가 무엇을 뜻하는지" 조건문을 두 번 작성해야 합니다.

두 가지 정리 방향:

  1. (선호) image 측을 markdown 컨벤션에 맞춤 — format!("{:?}: {}", warning_kind, note) 같은 pseudo-WarningKind 변형, 또는 이미지용 WarningKind 변형을 kebab-parse-types::WarningKind 에 추가해 정식 사유 코드화.
  2. markdown 컨벤션을 image-style 로 단순화 — format!("{prefix}: {note}"). 다만 이 쪽은 기존 frozen contract 변경이라 위험.

1번이 자연스러우며, WarningKind::OcrFailed / WarningKind::CaptionFailed 두 enum 추가는 P6-4 hotfix 또는 본 PR scope 안에서도 가능. 어느 쪽이든 "같은 필드는 같은 형식" invariant 회복 권장.

이미지 경로의 `IngestItem.warnings` 형식 (`"ocr_failed: <err>"`, `"caption_failed: <err>"`) 이 markdown 경로의 형식 (`format!("{:?}: {}", w.kind, w.note)` — 예: `"MalformedFrontmatter: missing closing fence"`) 과 다릅니다. 같은 wire 필드를 두 갈래의 다른 컨벤션이 채우면 downstream consumer (예: `kebab inspect doc --json` 의 reader) 가 "warnings 안의 prefix 가 무엇을 뜻하는지" 조건문을 두 번 작성해야 합니다. 두 가지 정리 방향: 1. (선호) image 측을 markdown 컨벤션에 맞춤 — `format!("{:?}: {}", warning_kind, note)` 같은 pseudo-WarningKind 변형, 또는 이미지용 `WarningKind` 변형을 `kebab-parse-types::WarningKind` 에 추가해 정식 사유 코드화. 2. markdown 컨벤션을 image-style 로 단순화 — `format!("{prefix}: {note}")`. 다만 이 쪽은 기존 frozen contract 변경이라 위험. 1번이 자연스러우며, `WarningKind::OcrFailed` / `WarningKind::CaptionFailed` 두 enum 추가는 P6-4 hotfix 또는 본 PR scope 안에서도 가능. 어느 쪽이든 "같은 필드는 같은 형식" invariant 회복 권장.
"OcrFailed",
e,
now,
);
}
if let Some(llm) = caption_llm
&& let Err(e) = apply_caption(
llm,
&bytes,
block,
lang_hint.as_ref(),
&app.config,
&mut canonical.provenance.events,
)
{
record_image_analysis_failure(
asset,
&mut canonical.provenance.events,
&mut warning_notes,
"CaptionFailed",
e,
now,
);
}
}
// P6-1 contract: image documents always have exactly one
// `Block::ImageRef`. If a future task introduces multi-block
// image documents the silent-skip would mask a real bug, so
// this arm surfaces the divergence loudly.
other => {
tracing::warn!(
target: "kebab-app",
path = %asset.workspace_path.0,
blocks = canonical.blocks.len(),
"image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
other.map(|b| std::mem::discriminant(b))
);
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
at: now,
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(
"image document missing leading ImageRef block — OCR/caption skipped"
.to_string(),
),
});
warning_notes
.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
}
}
// 4. Chunk via the same `MdHeadingV1Chunker` markdown uses — its
// `Block::ImageRef` arm already produces a single chunk per
// image (P1-5). The chunk text now follows the (β) plain-concat
// contract per the kebab-chunk render_block_text update.
let chunks = MdHeadingV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::MdHeadingV1Chunker::chunk (image)")?;
// 5. Persist + embed — identical sequence to markdown.
app.sqlite
.put_asset_with_bytes(asset, &bytes)
.context("DocumentStore::put_asset_with_bytes (image)")?;
app.sqlite
.put_document(&canonical)
.context("DocumentStore::put_document (image)")?;
app.sqlite
.put_blocks(&canonical.doc_id, &canonical.blocks)
.context("DocumentStore::put_blocks (image)")?;
app.sqlite
.put_chunks(&canonical.doc_id, &chunks)
.context("DocumentStore::put_chunks (image)")?;
if let (Some(emb), Some(vec_store)) = (embedder, vector_store)
&& !chunks.is_empty()
{
let inputs: Vec<EmbeddingInput<'_>> = chunks
.iter()
.map(|c| EmbeddingInput {
text: c.text.as_str(),
kind: EmbeddingKind::Document,
})
.collect();
let vectors = emb
.embed(&inputs)
.context("Embedder::embed (image chunks)")?;
let model_id = emb.model_id();
let model_version = emb.model_version();
let dimensions = emb.dimensions();
let records: Vec<VectorRecord> = chunks
.iter()
.zip(vectors)
.map(|(c, v)| VectorRecord {
embedding_id: kebab_core::id_for_embedding(
&c.chunk_id,
&model_id,
&model_version,
dimensions,
),
chunk_id: c.chunk_id.clone(),
vector: v,
doc_id: canonical.doc_id.clone(),
text: c.text.clone(),
heading_path: c.heading_path.clone(),
model_id: model_id.clone(),
model_version: model_version.clone(),
dimensions,
})
.collect();
vec_store
.upsert(&records)
.context("VectorStore::upsert (image)")?;
}
let kind = if existing_doc_ids.contains(&canonical.doc_id.0) {
kebab_core::IngestItemKind::Updated
} else {
kebab_core::IngestItemKind::New
};
Ok(kebab_core::IngestItem {
kind,
doc_id: Some(canonical.doc_id.clone()),
doc_path: asset.workspace_path.clone(),
asset_id: Some(asset.asset_id.clone()),
byte_len: Some(asset.byte_len),
block_count: u32::try_from(canonical.blocks.len()).ok(),
chunk_count: u32::try_from(chunks.len()).ok(),
parser_version: Some(canonical.parser_version.clone()),
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
warnings: warning_notes,
error: None,
})
}
/// Single sink for a failed image-analysis stage (OCR or caption).
///
/// One failure is reported through three channels:
/// 1. a `tracing::warn!` carrying the asset's workspace path,
/// 2. a `ProvenanceKind::Warning` event appended to `events`, stamped
///    with the caller-supplied `now` so every warning recorded for one
///    document shares a single timestamp,
/// 3. a `"<kind_label>: <error>"` string appended to `warning_notes`,
///    which the caller returns as `IngestItem.warnings`.
///
/// The error is rendered with the alternate formatter (`{:#}`), which
/// for `anyhow::Error` includes the context chain. The
/// `<label>: <detail>` shape is intended to match the
/// `WarningKind`-prefixed notes produced by the markdown path.
fn record_image_analysis_failure(
    asset: &RawAsset,
    events: &mut Vec<kebab_core::ProvenanceEvent>,
    warning_notes: &mut Vec<String>,
    kind_label: &str,
    err: anyhow::Error,
    now: time::OffsetDateTime,
) {
    let cause = format!("{err:#}");

    tracing::warn!(
        target: "kebab-app",
        path = %asset.workspace_path.0,
        "image analysis stage {} failed: {}",
        kind_label,
        cause
    );

    // The same labelled text goes into both the provenance event and the
    // per-item warning list.
    let labelled = format!("{kind_label}: {cause}");
    let warning_event = kebab_core::ProvenanceEvent {
        at: now,
        agent: "kb-app".to_string(),
        kind: kebab_core::ProvenanceKind::Warning,
        note: Some(labelled.clone()),
    };
    events.push(warning_event);
    warning_notes.push(labelled);
}
/// Derive the optional language hint passed to the OCR / caption calls.
///
/// Returns `None` when the document's language tag is empty or the
/// undetermined tag `"und"`, and a clone of `doc.lang` otherwise. Per
/// the original note, the extractor stamps `"und"` by default, so the
/// hint is deliberately withheld in that case.
fn lang_hint_from_doc(doc: &CanonicalDocument) -> Option<Lang> {
    match doc.lang.0.as_str() {
        "" | "und" => None,
        _ => Some(doc.lang.clone()),
    }
}
/// Convenience: end byte of the frontmatter region (or 0 when absent).
fn fm_span_end(span: Option<kebab_parse_md::FrontmatterSpan>) -> usize {
span.map(|s| s.end).unwrap_or(0)

View File

@@ -0,0 +1,421 @@
//! P6-4 image ingest wiring — end-to-end integration.
//!
//! Each test spins up a `TempDir` workspace + writes one PNG fixture +
//! routes OCR / caption HTTP calls through a `wiremock` server that
//! impersonates Ollama's `/api/generate` endpoint. The kb-app code
//! under test is sync; the wiremock server is async, so test bodies
//! drive blocking work via `tokio::task::spawn_blocking`.
mod common;
use std::path::Path;
use common::TestEnv;
use kebab_config::Config;
use serde_json::json;
use tokio::task::spawn_blocking;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};
// ── Fixture helpers ──────────────────────────────────────────────────────
/// Write a 100×50 solid-red PNG to `<root>/<name>` and return its path.
///
/// The fixture is deliberately tiny — presumably below the default
/// downscale threshold (TODO confirm against the extractor config) —
/// while still having non-trivial dimensions to check in stored rows.
///
/// Panics if the file cannot be written.
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
    use image::{ImageBuffer, Rgb};

    let target = root.join(name);
    // Every pixel is the same colour, so a uniform fill is sufficient.
    let canvas: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_pixel(100, 50, Rgb([255, 0, 0]));
    canvas.save(&target).expect("write PNG fixture");
    target
}
/// Clone the test environment's config and point the image pipeline at
/// the mock server.
///
/// - adds `**/*.png` to the workspace include globs so image assets are
///   scanned,
/// - enables OCR against `mock_endpoint` with a 512 `max_pixels` cap,
/// - leaves captioning disabled (tests that need it flip the flag),
/// - routes the LLM endpoint / model to the same mock so a later
///   caption opt-in talks to the stub as well.
fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config {
    const MOCK_MODEL: &str = "vision-mock:1b";

    let mut config = env.config.clone();
    config.workspace.include.push(String::from("**/*.png"));

    {
        let ocr = &mut config.image.ocr;
        ocr.enabled = true;
        ocr.endpoint = Some(mock_endpoint.to_owned());
        ocr.model = MOCK_MODEL.to_owned();
        ocr.max_pixels = 512;
    }

    // Caption stays off here; it is exercised by a dedicated test.
    config.image.caption.enabled = false;

    {
        let llm = &mut config.models.llm;
        llm.endpoint = mock_endpoint.to_owned();
        llm.model = MOCK_MODEL.to_owned();
    }

    config
}
// ── 1. Happy path: OCR-only ingest ───────────────────────────────────────
/// One PNG asset with OCR enabled (caption off): ingest must report the
/// image as a `New` document with exactly one chunk, and the persisted
/// `Block::ImageRef` must carry the mocked OCR transcription.
///
/// NOTE: despite the test name, the chunk text itself is not read back
/// here — the assertions cover the chunk count and the stored block
/// fields (`ocr`, `caption`) only.
#[tokio::test]
async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
    // Stub Ollama's `/api/generate`: every POST gets the same transcription.
    let server = MockServer::start().await;
    Mock::given(method("POST"))
        .and(path("/api/generate"))
        .respond_with(ResponseTemplate::new(200).set_body_json(json!({
            "model": "vision-mock:1b",
            "response": "Hello World 2026",
            "done": true,
            "done_reason": "stop"
        })))
        .mount(&server)
        .await;

    let env = TestEnv::lexical_only();
    write_red_png(&env.workspace_root, "diagram.png");
    let cfg = cfg_with_image_pipeline(&env, &server.uri());

    // The ingest entry point is synchronous, so run it off the async
    // test thread.
    let cfg_clone = cfg.clone();
    let env_scope = env.scope();
    let report = spawn_blocking(move || {
        kebab_app::ingest_with_config(cfg_clone, env_scope, false)
            .expect("image ingest must succeed")
    })
    .await
    .expect("blocking task panicked");

    // Counters: scanned must include the PNG (other fixtures in the
    // workspace tree may also count, hence `>=`).
    assert!(
        report.scanned >= 1,
        "scanned={}, items={:?}",
        report.scanned,
        report.items
    );
    assert_eq!(report.errors, 0, "no errors on lenient OCR path");

    // Locate the image doc in the report items.
    let items = report.items.expect("items present (summary_only=false)");
    let img_item = items
        .iter()
        .find(|i| i.doc_path.0.ends_with("diagram.png"))
        .expect("image doc item must be present");
    assert_eq!(
        img_item.kind,
        kebab_core::IngestItemKind::New,
        "image asset must be classified New on first ingest"
    );
    assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");

    // Read the persisted document back and check the analysis fields on
    // its leading ImageRef block.
    let doc_id = img_item.doc_id.clone().expect("image doc id");
    let doc = kebab_app::inspect_doc_with_config(cfg.clone(), &doc_id)
        .expect("inspect_doc returns the image document");
    let block = match doc.blocks.first() {
        Some(kebab_core::Block::ImageRef(b)) => b,
        other => panic!("expected ImageRef, got {other:?}"),
    };
    let ocr = block
        .ocr
        .as_ref()
        .expect("block.ocr populated by apply_ocr");
    assert_eq!(ocr.joined, "Hello World 2026", "OCR text from mock");
    assert!(
        block.caption.is_none(),
        "caption disabled in cfg → block.caption stays None"
    );

    // Sanity: the document is also visible through the listing facade.
    let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
        .expect("list_docs");
    assert!(
        summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
        "image doc must appear in list_docs"
    );

    // Explicitly release the test environment only after every
    // store-backed assertion has run.
    drop(env);
}
// ── 2. OCR + caption together ────────────────────────────────────────────
/// OCR and caption both enabled.
///
/// A single stub answers every `POST /api/generate`, so the OCR call and
/// the caption call receive the same JSON body; the test does not try
/// to tell them apart. It asserts that ingest reports no errors and
/// that both `block.ocr` and `block.caption` are populated on the
/// stored `ImageRef` block.
#[tokio::test]
async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
    let server = MockServer::start().await;
    let stub_body = json!({
        "response": "shared mock body",
        "done": true,
        "done_reason": "stop"
    });
    Mock::given(method("POST"))
        .and(path("/api/generate"))
        .respond_with(ResponseTemplate::new(200).set_body_json(stub_body))
        .mount(&server)
        .await;

    let env = TestEnv::lexical_only();
    write_red_png(&env.workspace_root, "diagram.png");

    // Start from the OCR-only helper config and opt in to captioning.
    let cfg = {
        let mut base = cfg_with_image_pipeline(&env, &server.uri());
        base.image.caption.enabled = true;
        base.image.caption.max_pixels = 384;
        base
    };

    let ingest_cfg = cfg.clone();
    let scope = env.scope();
    let run_ingest = move || {
        kebab_app::ingest_with_config(ingest_cfg, scope, false)
            .expect("ingest must succeed with both OCR+caption")
    };
    let report = spawn_blocking(run_ingest).await.expect("task");
    assert_eq!(report.errors, 0);

    let items = report.items.as_ref().unwrap();
    let img_item = items
        .iter()
        .find(|i| i.doc_path.0.ends_with("diagram.png"))
        .unwrap();
    let doc_id = img_item.doc_id.as_ref().unwrap();
    let doc = kebab_app::inspect_doc_with_config(cfg, doc_id).unwrap();

    let kebab_core::Block::ImageRef(block) = &doc.blocks[0] else {
        unreachable!()
    };
    assert!(block.ocr.is_some(), "OCR populated");
    assert!(block.caption.is_some(), "caption populated");

    drop(env);
}
// ── 3. Lenient failure: OCR Ollama 503 → asset still indexed ─────────────
/// Lenient-failure contract: the OCR endpoint answers 503.
///
/// Expected outcome: the image is still indexed as a `New` document
/// with one chunk, `block.ocr` stays `None`, the `IngestItem` carries at
/// least one warning, the document's provenance contains a `Warning`
/// event attributed to `kb-app` whose note mentions `OcrFailed`, and the
/// report's `errors` counter is left at zero.
#[tokio::test]
async fn ocr_failure_indexes_asset_with_warning_no_error_counter() {
    let server = MockServer::start().await;
    let always_unavailable = ResponseTemplate::new(503);
    Mock::given(method("POST"))
        .and(path("/api/generate"))
        .respond_with(always_unavailable)
        .mount(&server)
        .await;

    let env = TestEnv::lexical_only();
    write_red_png(&env.workspace_root, "broken.png");
    let cfg = cfg_with_image_pipeline(&env, &server.uri());

    let scope = env.scope();
    let ingest_cfg = cfg.clone();
    let run_ingest = move || {
        kebab_app::ingest_with_config(ingest_cfg, scope, false)
            .expect("ingest does not abort on lenient OCR failure")
    };
    let report = spawn_blocking(run_ingest).await.expect("task");

    assert_eq!(
        report.errors, 0,
        "lenient OCR failure must NOT increment errors counter (spec)"
    );

    // The asset must still show up as a freshly indexed document.
    let items = report.items.as_ref().unwrap();
    let img_item = items
        .iter()
        .find(|i| i.doc_path.0.ends_with("broken.png"))
        .expect("asset still indexed despite OCR failure");
    assert_eq!(img_item.kind, kebab_core::IngestItemKind::New);
    assert_eq!(img_item.chunk_count, Some(1));
    assert!(
        !img_item.warnings.is_empty(),
        "lenient OCR failure must surface a warning on the IngestItem"
    );

    // Stored block: OCR result absent.
    let doc_id = img_item.doc_id.clone().unwrap();
    let doc = kebab_app::inspect_doc_with_config(cfg, &doc_id).unwrap();
    let kebab_core::Block::ImageRef(block) = &doc.blocks[0] else {
        unreachable!()
    };
    assert!(block.ocr.is_none(), "block.ocr stays None on OCR failure");

    // Provenance: a kb-app Warning whose note carries the OcrFailed label.
    let warning = doc
        .provenance
        .events
        .iter()
        .find(|e| e.kind == kebab_core::ProvenanceKind::Warning && e.agent == "kb-app")
        .expect("Provenance Warning attributed to kb-app");
    let note = warning.note.as_deref().unwrap_or("");
    assert!(
        note.contains("OcrFailed"),
        "warning note must describe OCR failure with OcrFailed prefix \
         (markdown-style WarningKind format): {note}"
    );
}
// ── 4. Both image.ocr.enabled and image.caption.enabled = false ──────────
/// With `image.ocr.enabled` and `image.caption.enabled` both set to
/// `false`, a PNG in the workspace is still ingested without errors and
/// emits exactly one chunk. The stored `ImageRef` block has neither OCR
/// nor caption, while the `dimensions` metadata object is still present
/// (`w = 100`, `h = 50` for the fixture).
#[tokio::test]
async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
    // No mock server needed — neither HTTP path is touched.
    let env = TestEnv::lexical_only();
    write_red_png(&env.workspace_root, "raw.png");

    // Start from the environment's config, opt PNGs in, and switch both
    // image adapters off.
    let mut cfg = env.config.clone();
    cfg.workspace.include.push("**/*.png".to_string());
    cfg.image.ocr.enabled = false;
    cfg.image.caption.enabled = false;

    let ingest_cfg = cfg.clone();
    let ingest_scope = env.scope();
    let ingest_task = spawn_blocking(move || {
        kebab_app::ingest_with_config(ingest_cfg, ingest_scope, false)
            .expect("ingest with no OCR/caption")
    });
    let report = ingest_task.await.expect("task");
    assert_eq!(report.errors, 0);

    let items = report.items.as_ref().unwrap();
    let img_item = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("raw.png"))
        .unwrap();
    assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");

    let doc_id = img_item.doc_id.as_ref().unwrap();
    let doc = kebab_app::inspect_doc_with_config(cfg, doc_id).unwrap();
    let kebab_core::Block::ImageRef(block) = &doc.blocks[0] else {
        unreachable!()
    };
    assert!(block.ocr.is_none() && block.caption.is_none());

    // Dimensions are recorded in user metadata even with both adapters off.
    let dims = doc
        .metadata
        .user
        .get("dimensions")
        .and_then(serde_json::Value::as_object)
        .expect("dimensions object present");
    let width = dims.get("w").and_then(serde_json::Value::as_u64);
    let height = dims.get("h").and_then(serde_json::Value::as_u64);
    assert_eq!(width, Some(100));
    assert_eq!(height, Some(50));
}

(칭찬) garbage_png_increments_errors_counter_exactly_once 가 단순 "errors == 1" 검증을 넘어 IngestReport 의 산술 invariant (scanned = new + updated + skipped + errors) 까지 박은 점이 좋습니다. 이 invariant 는 카운터 종류가 늘어나도 (예: 미래에 image_ocr_failed 카운터 추가 시) 자연스럽게 회귀를 잡아 줍니다.

(칭찬) `garbage_png_increments_errors_counter_exactly_once` 가 단순 "errors == 1" 검증을 넘어 IngestReport 의 산술 invariant (`scanned = new + updated + skipped + errors`) 까지 박은 점이 좋습니다. 이 invariant 는 카운터 종류가 늘어나도 (예: 미래에 `image_ocr_failed` 카운터 추가 시) 자연스럽게 회귀를 잡아 줍니다.
// ── 5. Garbage bytes (not an image) → errors counter exactly 1 ──────────
/// Regression test for the double-count bug surfaced during P6-4 manual
/// smoke. A file with a `.png` extension whose bytes are not an image
/// must be reported as a single `IngestItemKind::Error` item carrying an
/// error string, with `errors == 1` and the report invariant
/// `scanned == new + updated + skipped + errors` intact. Ingest as a
/// whole must still return `Ok`.
#[tokio::test]
async fn garbage_png_increments_errors_counter_exactly_once() {
    // No mock server needed — extract fails before any HTTP call.
    let env = TestEnv::lexical_only();

    // Single non-image asset with .png extension.
    let garbage_path = env.workspace_root.join("garbage.png");
    std::fs::write(garbage_path, b"this is not an image at all")
        .expect("write garbage fixture");

    let mut cfg = env.config.clone();
    cfg.workspace.include.push("**/*.png".to_string());
    cfg.image.ocr.enabled = false;
    cfg.image.caption.enabled = false;

    let ingest_cfg = cfg.clone();
    let ingest_scope = env.scope();
    let ingest_task = spawn_blocking(move || {
        kebab_app::ingest_with_config(ingest_cfg, ingest_scope, false)
            .expect("ingest does not abort on per-asset failure")
    });
    let report = ingest_task.await.expect("task");

    // Exactly-once: the asset is scanned, counted as one error, and the
    // per-kind counters still add up to `scanned`.
    assert_eq!(
        report.errors, 1,
        "garbage PNG must increment errors exactly once, not twice (double-count regression)"
    );
    let accounted = report.new + report.updated + report.skipped + report.errors;
    assert_eq!(
        report.scanned, accounted,
        "counter sum must equal scanned — invariant of the IngestReport contract"
    );

    // The single Error item carries the propagated extract error.
    let items = report.items.expect("items present");
    let err_item = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("garbage.png"))
        .expect("garbage item present");
    assert_eq!(err_item.kind, kebab_core::IngestItemKind::Error);
    assert!(err_item.error.is_some(), "Error item carries error string");
}
// ── 6. Determinism: re-ingest produces identical doc_id / chunk_id ───────
/// Idempotency contract: ingesting the same workspace twice against a
/// stub that always returns the same response must mark the image as
/// `Updated` on the second run and report the same `doc_id` as the
/// first run.
#[tokio::test]
async fn re_ingest_image_produces_updated_with_same_doc_id() {
    // Stub server returning a fixed, successful generate response.
    let ollama = MockServer::start().await;
    let stable_reply = ResponseTemplate::new(200).set_body_json(json!({
        "response": "stable",
        "done": true,
        "done_reason": "stop"
    }));
    Mock::given(method("POST"))
        .and(path("/api/generate"))
        .respond_with(stable_reply)
        .mount(&ollama)
        .await;

    let env = TestEnv::lexical_only();
    write_red_png(&env.workspace_root, "diagram.png");
    let cfg = cfg_with_image_pipeline(&env, &ollama.uri());
    let scope = env.scope();

    // Each run gets its own clones of config and scope and is hosted on
    // the blocking pool; the runs are awaited strictly one after another.
    let run_ingest = || {
        let run_cfg = cfg.clone();
        let run_scope = scope.clone();
        spawn_blocking(move || {
            kebab_app::ingest_with_config(run_cfg, run_scope, false).unwrap()
        })
    };
    let first = run_ingest().await.unwrap();
    let second = run_ingest().await.unwrap();

    let first_items = first.items.as_ref().unwrap();
    let first_id = first_items
        .iter()
        .find(|item| item.doc_path.0.ends_with("diagram.png"))
        .unwrap()
        .doc_id
        .clone()
        .unwrap();

    let second_items = second.items.as_ref().unwrap();
    let second_img = second_items
        .iter()
        .find(|item| item.doc_path.0.ends_with("diagram.png"))
        .unwrap();

    assert_eq!(second_img.kind, kebab_core::IngestItemKind::Updated);
    assert_eq!(second_img.doc_id.as_ref().unwrap(), &first_id);
}

View File

@@ -381,17 +381,41 @@ fn render_block_text(b: &Block) -> String {
}
s
}
// ImageRef text portion = alt (per task spec). Fall back to
// model caption text if alt is empty.
// ImageRef text portion follows the P6-4 (β) plain-concat
// contract — `[alt, ocr.joined, caption.text]` joined by
// `\n\n`, dropping empty parts. Filename fallback for empty
// alt keeps lexical search hits on filenames working even when
// P6-1's filename auto-fill is bypassed.
Block::ImageRef(i) => {
if !i.alt.is_empty() {
let alt = if !i.alt.is_empty() {
i.alt.clone()
} else {
i.caption
.as_ref()
.map(|c| c.text.clone())
.unwrap_or_default()
}
// P6-1 falls back to filename so this branch is
// defensive — keep it lest a future test fixture or
// synthetic block path skip the auto-fill.
i.src
.rsplit('/')
.next()
.filter(|s| !s.is_empty())
.unwrap_or("[image]")
.to_string()
};
let ocr = i
.ocr
.as_ref()
.map(|o| o.joined.as_str())
.unwrap_or("");
let cap = i
.caption
.as_ref()
.map(|c| c.text.as_str())
.unwrap_or("");
[alt.as_str(), ocr, cap]
.iter()
.filter(|s| !s.is_empty())
.copied()
.collect::<Vec<_>>()
.join("\n\n")
}
// AudioRef has no caption preview yet (transcript joins land
// in P8). Empty string per task spec.
@@ -700,6 +724,63 @@ mod tests {
}
}
/// P6-4 (β) plain concatenation for `Block::ImageRef`: the rendered
/// text is alt, `ocr.joined`, and `caption.text` joined by `\n\n`, with
/// empty parts dropped. Covers alt-only, alt+ocr, alt+caption, and
/// alt+ocr+caption, plus the empty-alt fallback to the basename of
/// `src`.
#[test]
fn image_ref_p6_4_plain_concat_drops_empty_parts() {
    use kebab_core::{ModelCaption, OcrText};

    // Builds an ImageRef block for `img.png` with the given alt text and
    // optional OCR / caption payloads.
    let mk = |alt: &str, ocr: Option<&str>, cap: Option<&str>| {
        let common = common_for("imageref", &[], 0, span(1, 1));
        let ocr = ocr.map(|joined| OcrText {
            joined: joined.into(),
            regions: Vec::new(),
            engine: "test".into(),
            engine_version: "v1".into(),
        });
        let caption = cap.map(|text| ModelCaption {
            text: text.into(),
            model: "m".into(),
            model_version: "v".into(),
        });
        Block::ImageRef(ImageRefBlock {
            common,
            asset_id: None,
            src: "img.png".into(),
            alt: alt.into(),
            ocr,
            caption,
        })
    };

    // (ocr, caption, expected) for alt = "photo.png": absent parts add no
    // separators, present parts are joined by exactly one `\n\n`.
    let cases: [(Option<&str>, Option<&str>, &str); 4] = [
        (None, None, "photo.png"),
        (Some("Hello"), None, "photo.png\n\nHello"),
        (None, Some("a red square"), "photo.png\n\na red square"),
        (
            Some("Hello"),
            Some("a red square"),
            "photo.png\n\nHello\n\na red square",
        ),
    ];
    for (ocr, cap, expected) in cases {
        assert_eq!(render_block_text(&mk("photo.png", ocr, cap)), expected);
    }

    // Empty alt — falls back to the filename derived from `src`.
    let no_alt = mk("", Some("text from image"), None);
    assert_eq!(
        render_block_text(&no_alt),
        "img.png\n\ntext from image",
        "empty alt must fall back to the basename of `src`"
    );
}
/// ImageRef → own chunk, token_estimate=0.
#[test]
fn image_ref_emits_own_chunk_zero_tokens() {

View File

@@ -118,16 +118,41 @@ max_context_tokens = 6000
KB() { ./target/debug/kebab --config /tmp/kebab-smoke/config.toml "$@"; }
KB doctor # 1. health check
KB ingest # 2. 워크스페이스 색인
KB list docs # 3. 색인 결과 목록
KB ingest # 2. 워크스페이스 색인 (markdown + image)
KB list docs # 3. 색인 결과 목록 (markdown + image 모두 표시)
KB search --mode lexical "코루틴" --k 3 # 4. lexical 검색
KB search --mode vector "memory safety" --k 3 # 5. vector 검색
KB search --mode hybrid "Cargo workspace" --k 3 # 6. hybrid 검색
KB inspect chunk <chunk_id> # 7. raw chunk 보기
KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 8. RAG 답변 (Ollama 필요)
KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검증
KB search --mode lexical "Hello World" --k 3 # 7. image OCR 텍스트 검색 (P6-4)
KB inspect chunk <chunk_id> # 8. raw chunk 보기
KB ask "이 KB 안에서 ..." --mode hybrid --k 5 # 9. RAG 답변 (Ollama 필요)
KB --json ask "..." --mode hybrid # 10. 기계 친화 출력 검증
```
## P6-4 이미지 ingestion 옵션
`config.toml` 에 다음 절을 추가하면 `kebab ingest` 가 `**/*.png` / `**/*.jpg` 등 이미지 자산도 함께 색인합니다 (텍스트만 색인하려면 생략):
```toml
[workspace]
include = ["**/*.md", "**/*.png", "**/*.jpg"]
[image.ocr]
enabled = true # vision LM 으로 이미지 안 텍스트 전사
engine = "ollama-vision"
model = "gemma4:e4b" # 사용자 환경의 비전 모델
endpoint = "http://192.168.0.47:11434" # 비우면 models.llm.endpoint fallback
languages = ["eng", "kor"]
max_pixels = 1600 # long-edge cap
[image.caption]
enabled = true # vision LM 으로 한 문장 객관 설명 생성
max_pixels = 768
prompt_template_version = "caption-v1"
```
이미지 자산 한 장당 OCR 1 호출 + Caption 1 호출 → ~3-6초 (`gemma4:e4b` 기준). 다이어그램 / 카메라 사진 / 스크린샷 위주 워크스페이스에 권장. 책 / 스캔본은 P7 PDF 라인으로 (P7 머지 후).
각 명령은 0 종료 코드면 정상. `kebab ask` 는 거절 시 종료 코드 1 (`RefusalSignal`) — 의도된 동작.
## 검증 체크리스트
@@ -138,6 +163,8 @@ KB --json ask "..." --mode hybrid # 9. 기계 친화 출력 검
- `kebab search --mode hybrid` 의 `fusion_score` 가 `[0, 1]` 범위 (top-1 종종 1.0 — 두 retriever 모두 rank 1 일 때).
- `kebab ask` JSON 응답에 `model.id` 가 config 의 모델 (`gemma4:26b` 등) 과 일치, `embedding.id = multilingual-e5-small`, `citations[].marker` 는 `[1]` / `[2]` 형식 (square-bracketed bare index).
- 코퍼스에 없는 주제로 `kebab ask` → `refusal_reason: "llm_self_judge"` (또는 `no_chunks` / `score_gate`) + `grounded: false`.
- (P6-4) `image.ocr.enabled = true` 로 PNG 자산을 ingest 하면 `kebab list docs` 가 markdown 옆에 image doc 도 출력 (`workspace_path` 가 `*.png`). `kebab inspect doc <image_doc_id>` 의 `block.ocr.joined` 가 vision LM 의 OCR 결과 (예: 스크린샷 안의 텍스트). `kebab search --mode lexical "<OCR text>"` 가 그 image chunk 를 반환하면 wiring 정상.
- OCR / caption 부분 실패는 `errors` 카운터 미증가 — `kebab inspect doc <id>` 의 Provenance Warning 이벤트 또는 `--debug` 로그에서만 확인.
## 정리
@@ -154,5 +181,6 @@ rm -rf /tmp/kebab-smoke # 통째로 정리
- `kebab ask` 응답 시간 = LLM 토큰 throughput 에 종속. M4 Pro 48GB + gemma4:26b 기준 답변 50~100 토큰에 20~55초.
- `--config` path 가 존재하지 않거나 malformed 면 `kebab doctor` 가 hard fail (defaults 가 silently mask 하지 않게 하는 hotfix 동작).
- 매 CLI invocation 마다 fastembed 모델 init 비용 (~4초) — process-level 캐시 부재 때문. P9 TUI 진입 시 `App` 이 `OnceLock` 으로 세션 동안 한 번만 init.
- (P6-4) `image.ocr.enabled = true` + `image.caption.enabled = true` 인 워크스페이스에 PNG 가 N장 있으면 ingest 시간 ≈ markdown_time + N × (OCR + Caption latency). `gemma4:e4b` + 192.168.0.47 로 자산당 ~5-10초. 다수의 책 페이지를 이미지로 넣지 말 것 — 책은 P7 PDF 라인 사용 권장 (P7 머지 후).
자세한 history 와 발견된 버그는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 참조.

View File

@@ -3,7 +3,7 @@ phase: P6
component: kebab-app (image ingest dispatch + chunking)
task_id: p6-4
title: "Wire ImageExtractor + OCR + caption into kebab-app::ingest end-to-end"
status: planned
status: completed
depends_on: [p6-1, p6-2, p6-3, p1-6, p3-5]
unblocks: []
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md