P7-1 (`PdfTextExtractor`) + P7-2 (`PdfPageV1Chunker`) 의 라이브러리를 `kebab-app::ingest_with_config` 에 와이어링. `kebab-source-fs` 가 이미 `*.pdf` 를 `MediaType::Pdf` 로 분류하던 자산이 이제 검색 가능한 doc 으로 색인됨. P6-4 image wiring 패턴과 평행 — `ingest_one_asset` 에 `MediaType::Pdf` arm 추가, 새 private fn `ingest_one_pdf_asset` 로 분기.

핵심 동작:
- per-medium chunker 선택: PDF 자산은 `PdfPageV1Chunker` 하드코딩 (compile-time match 기반). `config.chunking.chunker_version` 은 markdown 만 represent — PDF 는 항상 `pdf-page-v1`. HOTFIXES entry `2026-05-02 P7-3` 에 deviation 기록.
- encrypted PDF / corrupt PDF → `errors+=1` + P7-1 의 `qpdf --decrypt` hint 를 `IngestItem.error` 에 verbatim 보존.
- 빈/scanned candidate 페이지 → 0 chunk, P7-1 의 `Provenance::Warning` 그대로 통과. v1 에서는 검색 불가, P+ scanned-PDF OCR fallback 대기.
- determinism stress: extract → chunk 사이 `now()` 추가 호출 없음 (P6-4 invariant 계승). PDF doc/chunk_id 모두 결정적.

통합 테스트 (`tests/pdf_pipeline.rs`, 8 passed + 1 ignored):
- 3-page text PDF → 1 doc + 3 chunk + Page span 검증
- identical re-ingest → Updated, doc_id 동일
- encrypted PDF → Error + `qpdf` hint 보존
- corrupt header PDF → Error + 미저장
- mixed page (page 2 빈) → 2 chunk + Warning 1개
- IngestReport 산술 invariant
- 50-page 긴 PDF → ≥50 chunk
- inspect doc → SourceSpan::Page round-trip
- (ignored) edited bytes re-ingest → storage UNIQUE bug 노출, P+ fix 대기

추가 발견 (HOTFIXES `2026-05-02 P7-3`): `assets.workspace_path` 의 UNIQUE 제약과 `upsert_asset_row` 의 `ON CONFLICT(asset_id)` 만 처리하는 부분 사이에 gap 존재. byte 변경 시 새 asset_id → 같은 workspace_path 충돌. md / image / pdf 모두 영향. P7-3 통합 테스트가 처음 노출. 본 PR 은 fix 안 함 — P+ storage task.

`docs/SMOKE.md` 에 PDF 섹션 + 검증 체크리스트 + 알려진 동작 4건 추가.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
60 lines
2.7 KiB
TOML
60 lines
2.7 KiB
TOML
# Manifest for the `kebab-app` facade crate. Everything except the name
# and description is inherited from the workspace root.
[package]
name = "kebab-app"
version = { workspace = true }
edition = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
rust-version = { workspace = true }
description = "Facade — orchestrates components for kb-cli/tui/desktop"

[dependencies]
# Sibling crates, referenced by relative path.
kebab-core = { path = "../kebab-core" }
kebab-config = { path = "../kebab-config" }
kebab-source-fs = { path = "../kebab-source-fs" }
kebab-parse-md = { path = "../kebab-parse-md" }
kebab-parse-types = { path = "../kebab-parse-types" }
kebab-normalize = { path = "../kebab-normalize" }
kebab-chunk = { path = "../kebab-chunk" }
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
kebab-store-vector = { path = "../kebab-store-vector" }
kebab-search = { path = "../kebab-search" }
kebab-embed = { path = "../kebab-embed" }
kebab-embed-local = { path = "../kebab-embed-local" }
kebab-llm = { path = "../kebab-llm" }
kebab-llm-local = { path = "../kebab-llm-local" }
kebab-rag = { path = "../kebab-rag" }
# P6-4: image extractor + OCR + caption adapters live here. App
# threads them into the per-asset dispatch (see `ingest_one_asset`
# image branch). Trait-only consumption — no `kebab-parse-image`
# internals leak into kebab-app code.
kebab-parse-image = { path = "../kebab-parse-image" }
# P7-3: PDF text extractor lives here. App threads it into the
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
# Versions and features inherited from the workspace root.
anyhow = { workspace = true }
blake3 = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
# Versions declared in this manifest rather than inherited.
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
tracing-appender = "0.2"
toml = "0.8"
dirs = "5"

[dev-dependencies]
rusqlite = { workspace = true }
tempfile = { workspace = true }
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
# / caption HTTP calls. The async runtime exists only to host the mock
# server; the kebab-app code under test stays sync.
wiremock = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread"] }
image = { version = "0.25", default-features = false, features = ["png"] }
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses. Kept on
# the same 0.32.x series (Cargo treats `0.32` as `>=0.32.0, <0.33.0`)
# so byte output is identical between the two fixture surfaces.
lopdf = "0.32"