Merge pull request 'feat(pdf): scanned PDF OCR via qwen2.5vl:3b vision LLM (v0.20.0 sub-item 1)' (#189) from feat/pdf-scanned-ocr into main
Reviewed-on: #189
This commit was merged in pull request #189.
This commit is contained in:
47
Cargo.lock
generated
47
Cargo.lock
generated
@@ -4127,7 +4127,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-app"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -4166,12 +4166,13 @@ dependencies = [
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
"unicode-normalization",
|
||||
"uuid",
|
||||
"wiremock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-chunk"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4187,7 +4188,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-cli"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -4208,7 +4209,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-config"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs 5.0.1",
|
||||
@@ -4223,7 +4224,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-core"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4237,7 +4238,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4251,7 +4252,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-embed-local"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"fastembed",
|
||||
@@ -4264,7 +4265,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-eval"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4283,7 +4284,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4292,7 +4293,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-llm-local"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-config",
|
||||
@@ -4309,7 +4310,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-mcp"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-app",
|
||||
@@ -4327,7 +4328,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-nli"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"hf-hub",
|
||||
@@ -4342,7 +4343,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-code"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"gix",
|
||||
@@ -4365,7 +4366,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-image"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"ab_glyph",
|
||||
"anyhow",
|
||||
@@ -4389,7 +4390,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-md"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kebab-core",
|
||||
@@ -4406,20 +4407,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-parse-pdf"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"kebab-core",
|
||||
"kebab-parse-image",
|
||||
"lopdf",
|
||||
"serde_json",
|
||||
"strsim",
|
||||
"time",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kebab-rag"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4441,7 +4444,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-search"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"globset",
|
||||
@@ -4460,7 +4463,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-source-fs"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4478,7 +4481,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-sqlite"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
@@ -4498,7 +4501,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-store-vector"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrow",
|
||||
@@ -4522,7 +4525,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "kebab-tui"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossterm",
|
||||
|
||||
@@ -30,7 +30,7 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.19.0"
|
||||
version = "0.20.0" # v0.20.0 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b) — CLAUDE.md §Release 사용자 도그푸딩 트리거
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
@@ -141,6 +141,7 @@ proptest = "1"
|
||||
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
|
||||
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
|
||||
lru = "0.12"
|
||||
lopdf = "0.32"
|
||||
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
|
||||
# in its default set (which also pulls `hf-hub` for first-run model
|
||||
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
|
||||
|
||||
@@ -17,7 +17,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체
|
||||
| **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
|
||||
| **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
|
||||
| **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
|
||||
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
|
||||
| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
|
||||
| **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
|
||||
| **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
|
||||
| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** |
|
||||
@@ -32,6 +32,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
|
||||
|
||||
- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
|
||||
- **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
|
||||
- **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 의 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
|
||||
- **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).
|
||||
|
||||
25
README.md
25
README.md
@@ -192,7 +192,7 @@ flowchart TB
|
||||
|
||||
## Configuration
|
||||
|
||||
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절.
|
||||
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[pdf.ocr]`, `[search]`, `[rag]`, `[ui]` 절.
|
||||
- `[models.embedding]` —
|
||||
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
|
||||
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
|
||||
@@ -211,6 +211,29 @@ flowchart TB
|
||||
|
||||
config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.
|
||||
|
||||
### `[pdf.ocr]` — scanned PDF OCR (v0.20.0+)
|
||||
|
||||
embedded text 가 없는 scanned PDF (책 스캔, 영수증, 카메라 page 등) 의 OCR 활성화. **default off (opt-in)** — OCR 한 page 당 ~45-100s (qwen2.5vl:3b on CPU) 의 cost 때문에 책 / 논문 archive 등 명시적 KB 에만 활성화.
|
||||
|
||||
```toml
|
||||
[pdf.ocr]
|
||||
enabled = false # opt-in: 책 / 논문 archive KB 에서 true
|
||||
always_on = false # true 시 vector PDF page 도 dual-block OCR (confidence boost)
|
||||
engine = "ollama-vision"
|
||||
model = "qwen2.5vl:3b" # PoC alnum 94.79% page1 / 81.56% 받침 (vs gemma4:e4b 의 27%)
|
||||
# endpoint = "http://localhost:11434" # 미명시 시 models.llm.endpoint fallback
|
||||
languages = ["eng", "kor"]
|
||||
max_pixels = 2048
|
||||
request_timeout_secs = 600
|
||||
valid_ratio_threshold = 0.5 # text-detect threshold — mojibake / scanned 판정 boundary
|
||||
min_char_count = 20
|
||||
lang_hint = "kor"
|
||||
```
|
||||
|
||||
env override: `KEBAB_PDF_OCR_*` 11 변수 (예: `KEBAB_PDF_OCR_ENABLED=true kebab ingest`).
|
||||
|
||||
**v0.20 upgrade after**: scanned PDF 가 v0.19 에 빈 block + warning 으로 indexed 된 경우 자동으로 OCR 재실행 안 됨 (parser_version `"pdf-text-v1"` 보존). 명시적 재처리: `kebab ingest --force-reingest`.
|
||||
|
||||
## 외부 AI 통합
|
||||
|
||||
`--json` 출력 + frozen wire schema v1 가 stable contract. 통합 옵션:
|
||||
|
||||
@@ -35,6 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" }
|
||||
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
|
||||
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
|
||||
lopdf = { workspace = true }
|
||||
# p10-1A-2: Rust AST extractor lives here. App threads it into the
|
||||
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
|
||||
@@ -44,6 +45,7 @@ blake3 = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
@@ -75,7 +77,7 @@ image = { version = "0.25", default-features = false, features =
|
||||
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
|
||||
# to the same major (0.32) so byte output is identical between the two
|
||||
# fixture surfaces.
|
||||
lopdf = "0.32"
|
||||
lopdf = { workspace = true }
|
||||
# error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
|
||||
# reqwest::Error (private constructor) — built from a connect-refused call.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
|
||||
@@ -46,9 +46,8 @@ use kebab_core::{
|
||||
use kebab_embed_local::FastembedEmbedder;
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_code::{
|
||||
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor,
|
||||
JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor,
|
||||
TypescriptAstExtractor,
|
||||
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
|
||||
KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor,
|
||||
};
|
||||
use kebab_parse_image::ImageExtractor;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
@@ -242,15 +241,15 @@ impl App {
|
||||
// kebab-nli construction. Failure (`?`) surfaces as a user-
|
||||
// facing error at App boot — never a panic in the pipeline's
|
||||
// `expect("verifier must be Some when nli_threshold > 0.0")`.
|
||||
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> =
|
||||
if config.rag.nli_threshold > 0.0 {
|
||||
let v = kebab_nli::OnnxNliVerifier::new(&config).context(
|
||||
"kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)",
|
||||
)?;
|
||||
Some(Arc::new(v))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> = if config.rag.nli_threshold
|
||||
> 0.0
|
||||
{
|
||||
let v = kebab_nli::OnnxNliVerifier::new(&config)
|
||||
.context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?;
|
||||
Some(Arc::new(v))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
config,
|
||||
sqlite: Arc::new(sqlite),
|
||||
@@ -350,7 +349,9 @@ impl App {
|
||||
// so other in-flight searches can use the cache concurrently.
|
||||
drop(guard);
|
||||
let hits = self.search_uncached(query)?;
|
||||
let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.put(key, hits.clone());
|
||||
Ok(hits)
|
||||
}
|
||||
@@ -430,11 +431,7 @@ impl App {
|
||||
///
|
||||
/// `SearchResponse.next_cursor` and `truncated` are independent
|
||||
/// signals — see `SearchResponse` doc for details.
|
||||
pub fn search_with_opts(
|
||||
&self,
|
||||
query: SearchQuery,
|
||||
opts: SearchOpts,
|
||||
) -> Result<SearchResponse> {
|
||||
pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result<SearchResponse> {
|
||||
use crate::cursor;
|
||||
|
||||
let corpus_revision = self.sqlite.corpus_revision().to_string();
|
||||
@@ -519,8 +516,7 @@ impl App {
|
||||
// Apply offset + k_effective truncation (mirrors non-trace path).
|
||||
let drop_n = offset.min(traced_hits.len());
|
||||
traced_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
traced_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = traced_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Snippet truncation if opts.snippet_chars set (mirror non-trace path).
|
||||
if opts.snippet_chars.is_some() {
|
||||
@@ -551,8 +547,7 @@ impl App {
|
||||
// Skip offset.
|
||||
let drop_n = offset.min(all_hits.len());
|
||||
all_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
all_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = all_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Apply snippet_chars override if shorter than what the
|
||||
// retriever returned (retriever already honored
|
||||
@@ -573,15 +568,11 @@ impl App {
|
||||
// Step 1: shorten snippets progressively to a 60-char floor.
|
||||
const SNIPPET_FLOOR: usize = 60;
|
||||
let mut current_snippet_cap = snippet_chars;
|
||||
while estimate_chars(&hits) > max_chars
|
||||
&& current_snippet_cap > SNIPPET_FLOOR
|
||||
{
|
||||
current_snippet_cap =
|
||||
(current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR {
|
||||
current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > current_snippet_cap {
|
||||
h.snippet =
|
||||
trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
h.snippet = trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
@@ -651,8 +642,7 @@ impl App {
|
||||
retriever: Arc<dyn Retriever>,
|
||||
llm: Arc<dyn LanguageModel>,
|
||||
) -> RagPipeline {
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
match &self.pipeline_verifier {
|
||||
Some(v) => pipeline.with_verifier(v.clone()),
|
||||
None => pipeline,
|
||||
@@ -723,12 +713,7 @@ impl App {
|
||||
/// returns; on persistence error, the answer is still returned
|
||||
/// (don't lose the user's compute) but the error is logged so
|
||||
/// the operator notices.
|
||||
pub fn ask_with_session(
|
||||
&self,
|
||||
session_id: &str,
|
||||
query: &str,
|
||||
opts: AskOpts,
|
||||
) -> Result<Answer> {
|
||||
pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
@@ -766,13 +751,8 @@ impl App {
|
||||
let retriever = self.build_retriever(opts.mode)?;
|
||||
let llm = self.llm()?;
|
||||
let pipeline = self.build_pipeline(retriever, llm);
|
||||
let answer = pipeline.ask_with_history(
|
||||
query,
|
||||
history,
|
||||
session_id.to_string(),
|
||||
next_index,
|
||||
opts,
|
||||
)?;
|
||||
let answer =
|
||||
pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?;
|
||||
|
||||
// Auto-create the session header on first use. Title from
|
||||
// the first question (≤40 chars after trim).
|
||||
@@ -813,7 +793,8 @@ impl App {
|
||||
turn_index: next_index,
|
||||
question: query.to_string(),
|
||||
answer: answer.answer.clone(),
|
||||
citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()),
|
||||
citations_json: serde_json::to_string(&answer.citations)
|
||||
.unwrap_or_else(|_| "[]".to_string()),
|
||||
created_at: now_unix,
|
||||
};
|
||||
if let Err(e) = self.sqlite.append_turn(&turn_row) {
|
||||
@@ -848,8 +829,7 @@ impl App {
|
||||
return Ok(Some(e.clone()));
|
||||
}
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
|
||||
FastembedEmbedder::new(&self.config)
|
||||
.context("kb-app: load FastembedEmbedder")?,
|
||||
FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?,
|
||||
);
|
||||
// `set` returns Err if another thread won the race; in that case
|
||||
// the loser still returns the (now-cached) winner via `get()`.
|
||||
@@ -925,7 +905,9 @@ impl App {
|
||||
/// clear` admin command). No-op when the cache is disabled.
|
||||
pub fn clear_search_cache(&self) {
|
||||
if let Some(cache) = self.search_cache.as_ref() {
|
||||
let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.clear();
|
||||
}
|
||||
}
|
||||
@@ -946,8 +928,8 @@ impl App {
|
||||
/// git tree) correctly keep `repo: None` — `Metadata.repo` is already
|
||||
/// `None` for those, so the assignment is a no-op.
|
||||
fn backfill_repo(&self, hits: &mut [SearchHit]) {
|
||||
use std::collections::HashMap;
|
||||
use kebab_core::DocumentId;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// doc_id → Option<String> where None means "not found / no repo"
|
||||
let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
|
||||
@@ -956,26 +938,24 @@ impl App {
|
||||
if hit.repo.is_some() {
|
||||
continue;
|
||||
}
|
||||
let repo_val = cache
|
||||
.entry(hit.doc_id.clone())
|
||||
.or_insert_with(|| {
|
||||
// Deliberately non-aborting: a failed store lookup for
|
||||
// one hit must not abort the whole search response. Log
|
||||
// the error so it's observable rather than silently
|
||||
// dropped (review #140 round 1).
|
||||
match self.sqlite.get_document(&hit.doc_id) {
|
||||
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
doc_id = %hit.doc_id,
|
||||
error = %e,
|
||||
"backfill_repo: get_document failed; leaving hit.repo = None"
|
||||
);
|
||||
None
|
||||
}
|
||||
let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| {
|
||||
// Deliberately non-aborting: a failed store lookup for
|
||||
// one hit must not abort the whole search response. Log
|
||||
// the error so it's observable rather than silently
|
||||
// dropped (review #140 round 1).
|
||||
match self.sqlite.get_document(&hit.doc_id) {
|
||||
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
doc_id = %hit.doc_id,
|
||||
error = %e,
|
||||
"backfill_repo: get_document failed; leaving hit.repo = None"
|
||||
);
|
||||
None
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
if let Some(r) = repo_val {
|
||||
hit.repo = Some(r.clone());
|
||||
}
|
||||
@@ -986,10 +966,7 @@ impl App {
|
||||
/// "switch to --mode lexical" error when embeddings are disabled.
|
||||
fn require_embeddings(
|
||||
&self,
|
||||
) -> Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<LanceVectorStore>,
|
||||
)> {
|
||||
) -> Result<(Arc<dyn Embedder + Send + Sync>, Arc<LanceVectorStore>)> {
|
||||
let emb = self.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
@@ -1278,8 +1255,8 @@ mod tests_extractor_dispatch {
|
||||
MediaType::Code("kotlin".into()),
|
||||
MediaType::Code("c".into()),
|
||||
MediaType::Code("cpp".into()),
|
||||
MediaType::Code("yaml".into()), // registry NOT cover
|
||||
MediaType::Code("shell".into()), // registry NOT cover
|
||||
MediaType::Code("yaml".into()), // registry NOT cover
|
||||
MediaType::Code("shell".into()), // registry NOT cover
|
||||
MediaType::Audio(AudioType::Wav), // registry NOT cover
|
||||
];
|
||||
for sample in &samples {
|
||||
|
||||
@@ -215,7 +215,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
|
||||
trace: obj.get("trace").and_then(serde_json::Value::as_bool).unwrap_or(false),
|
||||
trace: obj
|
||||
.get("trace")
|
||||
.and_then(serde_json::Value::as_bool)
|
||||
.unwrap_or(false),
|
||||
};
|
||||
|
||||
Ok((
|
||||
|
||||
@@ -10,6 +10,6 @@
|
||||
|
||||
pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
|
||||
|
||||
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
|
||||
pub use kebab_llm_local::LlmError;
|
||||
pub use kebab_config::ConfigInvalid;
|
||||
pub use kebab_store_sqlite::NotIndexed;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
|
||||
use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};
|
||||
|
||||
// p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
|
||||
// and surfaced through `StructuredError` (an anyhow-friendly wrapper
|
||||
@@ -65,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
hint: Some("check `--config <path>` and TOML syntax".to_string()),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "config_not_found".to_string(),
|
||||
message: s.to_string(),
|
||||
details: json!({
|
||||
"path": s.path.to_string_lossy(),
|
||||
}),
|
||||
hint: Some(
|
||||
"verify --config <path>; pass an existing toml file or omit --config to use XDG default"
|
||||
.to_string(),
|
||||
),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<NotIndexed>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
@@ -158,7 +172,10 @@ mod tests {
|
||||
});
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "config_invalid");
|
||||
assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
|
||||
assert_eq!(
|
||||
v1.details.get("path").and_then(|p| p.as_str()),
|
||||
Some("/tmp/x.toml")
|
||||
);
|
||||
assert!(v1.hint.is_some());
|
||||
}
|
||||
|
||||
@@ -182,7 +199,8 @@ mod tests {
|
||||
// the resulting LlmError::Unreachable maps to "model_unreachable".
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(std::time::Duration::from_millis(500))
|
||||
.build().unwrap();
|
||||
.build()
|
||||
.unwrap();
|
||||
let err = client.get("http://127.0.0.1:1").send().unwrap_err();
|
||||
let llm = LlmError::Unreachable {
|
||||
endpoint: "http://127.0.0.1:1".to_string(),
|
||||
@@ -198,7 +216,10 @@ mod tests {
|
||||
let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
|
||||
let v1 = classify(&anyhow::Error::new(llm), false);
|
||||
assert_eq!(v1.code, "model_not_pulled");
|
||||
assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
|
||||
assert_eq!(
|
||||
v1.details.get("model").and_then(|p| p.as_str()),
|
||||
Some("gemma4:e4b")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -235,7 +256,10 @@ mod tests {
|
||||
// (single source of truth). classify must not pattern-match on
|
||||
// anyhow string contents — that would create two sources of
|
||||
// truth. The bare anyhow string falls through to "generic".
|
||||
assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string");
|
||||
assert_ne!(
|
||||
v1.code, "stale_cursor",
|
||||
"classify must not produce stale_cursor from bare anyhow string"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let already = existing
|
||||
.lines()
|
||||
.any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
if already {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -57,11 +55,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
/// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
|
||||
/// destination file already exists with the expected hash, the existing
|
||||
/// file is reused (no second write). Returns the destination path.
|
||||
pub fn copy_to_external(
|
||||
external_dir: &Path,
|
||||
bytes: &[u8],
|
||||
ext: &str,
|
||||
) -> Result<PathBuf> {
|
||||
pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
|
||||
let hash = blake3::hash(bytes);
|
||||
let hex = hash.to_hex();
|
||||
let prefix = &hex.as_str()[..12];
|
||||
@@ -82,11 +76,7 @@ pub fn copy_to_external(
|
||||
/// Internal `yaml_quote` always uses double-quoted YAML form with backslash
|
||||
/// escapes for `"` / `\` / control chars — agent-supplied titles with
|
||||
/// special characters are safe.
|
||||
pub fn inject_frontmatter(
|
||||
body: &str,
|
||||
title: &str,
|
||||
source_uri: Option<&str>,
|
||||
) -> Result<String> {
|
||||
pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
|
||||
let head = body.trim_start();
|
||||
if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
|
||||
anyhow::bail!(
|
||||
|
||||
@@ -50,14 +50,14 @@ impl App {
|
||||
fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let doc_id = target.doc_id.clone();
|
||||
let doc =
|
||||
@@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let mut text = fmt_canonical_to_markdown(&doc);
|
||||
let mut truncated = false;
|
||||
@@ -176,14 +176,14 @@ fn fetch_span(
|
||||
) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
// Reject line-incompatible media types (PDF / audio). `SourceType`
|
||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||
|
||||
328
crates/kebab-app/src/ingest_log.rs
Normal file
328
crates/kebab-app/src/ingest_log.rs
Normal file
@@ -0,0 +1,328 @@
|
||||
//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
|
||||
//!
|
||||
//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
|
||||
//! `config.logging.ingest_log_dir`. Records are appended line by line; the
|
||||
//! last record is always `kind="summary"`. `IngestLogWriter::open` returns
|
||||
//! `Ok(None)` when `ingest_log_enabled = false` so callers need not branch.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
/// Append-only writer for one ingest run's ndjson log file.
///
/// Instances are created through `IngestLogWriter::open`; records are
/// buffered and written one JSON object per line.
#[derive(Debug)]
pub struct IngestLogWriter {
    /// Buffered handle to the log file that records are appended to.
    file: BufWriter<File>,
    /// Location of the log file on disk.
    path: PathBuf,
    /// Identifier of this run; also embedded in the log file name.
    run_id: String,
    /// Wall-clock time captured when the writer was opened.
    started_at: SystemTime,
}
|
||||
|
||||
impl IngestLogWriter {
|
||||
/// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
|
||||
pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
|
||||
if !cfg.ingest_log_enabled {
|
||||
return Ok(None);
|
||||
}
|
||||
let run_id = generate_run_id();
|
||||
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
|
||||
let file = BufWriter::new(File::create(&path)?);
|
||||
Ok(Some(Self {
|
||||
file,
|
||||
path,
|
||||
run_id,
|
||||
started_at: SystemTime::now(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, event)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, summary)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
self.file.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_id(&self) -> &str {
|
||||
&self.run_id
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
pub fn started_at(&self) -> SystemTime {
|
||||
self.started_at
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IngestLogWriter {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.file.flush();
|
||||
}
|
||||
}
|
||||
|
||||
/// ISO 8601 compact timestamp + uuid v7 suffix: `20260528T013000Z-abc123de`.
|
||||
/// uuid v7 is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
|
||||
fn generate_run_id() -> String {
|
||||
use time::macros::format_description;
|
||||
let now = time::OffsetDateTime::now_utc();
|
||||
let ts = now
|
||||
.format(format_description!(
|
||||
"[year][month][day]T[hour][minute][second]Z"
|
||||
))
|
||||
.unwrap_or_else(|_| "19700101T000000Z".to_string());
|
||||
let uid = uuid::Uuid::now_v7().simple().to_string();
|
||||
let suffix = &uid[uid.len() - 8..];
|
||||
format!("{ts}-{suffix}")
|
||||
}
|
||||
|
||||
/// Expand `{state_dir}` placeholder → XDG state dir (spec §6 R-3).
|
||||
/// Other tilde/env expansion is delegated to `kebab_config::expand_path`.
|
||||
fn expand_log_dir(path: &Path) -> PathBuf {
|
||||
let path_str = path.to_string_lossy();
|
||||
if path_str.contains("{state_dir}") {
|
||||
let state_dir = kebab_config::Config::xdg_state_dir();
|
||||
PathBuf::from(path_str.replace("{state_dir}", &state_dir.to_string_lossy()))
|
||||
} else {
|
||||
path.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// RFC 3339 UTC timestamp for log records.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn now_ts() -> String {
|
||||
time::OffsetDateTime::now_utc()
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
|
||||
}
|
||||
|
||||
/// Ingest event record (ndjson line). `kind` is the discriminator.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum LogEvent<'a> {
|
||||
Ocr {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
page: u32,
|
||||
image_byte_size: Option<u64>,
|
||||
image_width: Option<u32>,
|
||||
image_height: Option<u32>,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
success: bool,
|
||||
reason: Option<&'a str>,
|
||||
ocr_engine: &'a str,
|
||||
},
|
||||
ParseError {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
Skip {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
detail: Option<&'a str>,
|
||||
},
|
||||
Error {
|
||||
ts: String,
|
||||
code: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
}
|
||||
|
||||
/// Final summary record — always the last line of the log file.
|
||||
/// Explicit `kind` field serializes to `"kind": "summary"`.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct IngestSummary {
|
||||
pub kind: String,
|
||||
pub ts: String,
|
||||
pub run_id: String,
|
||||
pub scanned: u32,
|
||||
pub new: u32,
|
||||
pub errors: u32,
|
||||
pub ocr_pages: u32,
|
||||
pub ocr_failures: u32,
|
||||
pub ocr_p50_ms: Option<u64>,
|
||||
pub ocr_p90_ms: Option<u64>,
|
||||
pub ocr_max_ms: Option<u64>,
|
||||
pub duration_ms: u64,
|
||||
}
|
||||
|
||||
impl IngestSummary {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
ts: String,
|
||||
run_id: String,
|
||||
scanned: u32,
|
||||
new: u32,
|
||||
errors: u32,
|
||||
ocr_pages: u32,
|
||||
ocr_failures: u32,
|
||||
ocr_ms_samples: &[u64],
|
||||
duration_ms: u64,
|
||||
) -> Self {
|
||||
let (p50, p90, max) = percentiles(ocr_ms_samples);
|
||||
Self {
|
||||
kind: "summary".to_string(),
|
||||
ts,
|
||||
run_id,
|
||||
scanned,
|
||||
new,
|
||||
errors,
|
||||
ocr_pages,
|
||||
ocr_failures,
|
||||
ocr_p50_ms: p50,
|
||||
ocr_p90_ms: p90,
|
||||
ocr_max_ms: max,
|
||||
duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes `(p50, p90, max)` over `samples`.
///
/// The input is copied and sorted ascending; each percentile is the element
/// at index `len * pct / 100` (integer division) of the sorted copy, and the
/// maximum is its last element. An empty input yields `(None, None, None)`.
pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>) {
    let mut ordered = samples.to_vec();
    ordered.sort_unstable();

    match ordered.last().copied() {
        None => (None, None, None),
        Some(max) => {
            let len = ordered.len();
            // `len * pct / 100 < len` for any `pct < 100`, so the index is
            // always in bounds.
            let at = |pct: usize| ordered[len * pct / 100];
            (Some(at(50)), Some(at(90)), Some(max))
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_config::LoggingCfg;
    use tempfile::TempDir;

    /// Logging config with the ingest log enabled and rooted at `dir`.
    fn enabled_cfg(dir: &Path) -> LoggingCfg {
        LoggingCfg {
            ingest_log_enabled: true,
            ingest_log_dir: dir.to_path_buf(),
        }
    }

    /// The run id is a 16-char timestamp, a dash, and 8 hex digits.
    #[test]
    fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
        let id = generate_run_id();
        // Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
        assert_eq!(id.len(), 25, "run_id len should be 25: {id}");

        let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");

        assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
        assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
        assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");

        assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");
        let all_hex = suffix.chars().all(|c| c.is_ascii_hexdigit());
        assert!(all_hex, "suffix should be hex: {suffix}");
    }

    /// `{state_dir}` is replaced by the XDG state directory.
    #[test]
    fn expand_log_dir_substitutes_state_dir_placeholder() {
        let expanded = expand_log_dir(&PathBuf::from("{state_dir}/logs"));
        let expected = kebab_config::Config::xdg_state_dir().join("logs");

        assert_eq!(expanded, expected);
        assert!(!expanded.to_string_lossy().contains("{state_dir}"));
    }

    /// A disabled config opens no writer and reports no error.
    #[test]
    fn writer_disabled_returns_none() {
        let cfg = LoggingCfg {
            ingest_log_enabled: false,
            ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
        };

        let result = IngestLogWriter::open(&cfg).expect("open should not error");

        assert!(result.is_none(), "disabled writer should return None");
    }

    /// Each event becomes exactly one JSON line carrying a `kind` key.
    #[test]
    fn writer_writes_one_event_per_line_with_kind_discriminator() {
        let tmp = TempDir::new().unwrap();
        let mut writer = IngestLogWriter::open(&enabled_cfg(tmp.path()))
            .unwrap()
            .unwrap();
        let path = writer.path().to_path_buf();

        let events = [
            LogEvent::Skip {
                ts: now_ts(),
                doc_path: "a.zip",
                reason: "builtin_blacklist",
                detail: Some(".zip extension"),
            },
            LogEvent::Error {
                ts: now_ts(),
                code: "ingest_fatal",
                message: "something bad",
            },
            LogEvent::ParseError {
                ts: now_ts(),
                doc_path: "weird.pdf",
                reason: "lopdf_error",
                message: "unexpected EOF",
            },
        ];
        for event in &events {
            writer.write_event(event).unwrap();
        }
        writer.flush().unwrap();

        let contents = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = contents.lines().collect();
        assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
        for line in &lines {
            assert!(
                line.starts_with('{'),
                "each line should be JSON object: {line}"
            );
            assert!(
                line.contains("\"kind\""),
                "each line should have 'kind': {line}"
            );
        }
    }

    /// Dropping the writer without an explicit flush still persists records.
    #[test]
    fn drop_flushes_pending_buffer() {
        let tmp = TempDir::new().unwrap();
        let mut writer = IngestLogWriter::open(&enabled_cfg(tmp.path()))
            .unwrap()
            .unwrap();
        let path = writer.path().to_path_buf();

        let event = LogEvent::Error {
            ts: now_ts(),
            code: "test",
            message: "drop flush test",
        };
        writer.write_event(&event).unwrap();

        // Drop without explicit flush — Drop impl should flush BufWriter.
        drop(writer);

        let contents = std::fs::read_to_string(&path).unwrap();
        let line_count = contents.lines().count();
        assert!(
            line_count >= 1,
            "file should have at least 1 line after drop"
        );
    }
}
|
||||
@@ -46,10 +46,13 @@ pub struct AggregateCounts {
|
||||
/// Ordering invariant per design §2.4a:
|
||||
///
|
||||
/// ```text
|
||||
/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ScanStarted < ScanCompleted
|
||||
/// < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ```
|
||||
///
|
||||
/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1).
|
||||
///
|
||||
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
|
||||
/// in §2.4a) are reserved for a future iteration and are not emitted
|
||||
/// by this task; the spec calls them out as "임의 위치" (optional).
|
||||
@@ -85,6 +88,30 @@ pub enum IngestEvent {
|
||||
/// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
|
||||
/// task never produces `Aborted`.
|
||||
Aborted { counts: AggregateCounts },
|
||||
/// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1.
|
||||
PdfOcrStarted { page: u32 },
|
||||
/// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1.
|
||||
/// `skipped` = `true` 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패).
|
||||
/// `chars = 0` 만으로는 "skip" 과 "0-char OCR result" 구분 불가, `skipped` field 가 명시적.
|
||||
PdfOcrFinished {
|
||||
page: u32,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
ocr_engine: String,
|
||||
skipped: bool,
|
||||
/// v0.20.x ingest log: raster image byte size (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_byte_size: Option<u64>,
|
||||
/// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_width: Option<u32>,
|
||||
/// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_height: Option<u32>,
|
||||
/// v0.20.x ingest log: OCR failure reason (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
|
||||
@@ -118,10 +145,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
|
||||
/// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
|
||||
/// is silently absorbed — the ingest hot path must not stall on a slow
|
||||
/// consumer. Logged at `trace` for diagnostics.
|
||||
pub(crate) fn emit(
|
||||
progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
|
||||
event: IngestEvent,
|
||||
) {
|
||||
pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
|
||||
if let Some(tx) = progress {
|
||||
if tx.send(event).is_err() {
|
||||
tracing::trace!(
|
||||
@@ -165,7 +189,10 @@ mod tests {
|
||||
media: "markdown".into(),
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_started")
|
||||
);
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
|
||||
assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
|
||||
assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
|
||||
@@ -184,8 +211,14 @@ mod tests {
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
|
||||
let counts = v.get("counts").unwrap();
|
||||
assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(5));
|
||||
assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
|
||||
assert_eq!(
|
||||
counts.get("scanned").and_then(serde_json::Value::as_u64),
|
||||
Some(5)
|
||||
);
|
||||
assert_eq!(
|
||||
counts.get("new").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -34,21 +34,25 @@
|
||||
//! still allowing the cross-crate calls.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker};
|
||||
use kebab_chunk::{
|
||||
CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker,
|
||||
CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker,
|
||||
CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker,
|
||||
K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker,
|
||||
};
|
||||
use kebab_core::{
|
||||
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
||||
EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType,
|
||||
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
|
||||
SourceUri, VectorRecord, VectorStore,
|
||||
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
|
||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
|
||||
ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
|
||||
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
|
||||
};
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr};
|
||||
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
|
||||
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
@@ -60,20 +64,26 @@ pub mod error_signal;
|
||||
pub mod error_wire;
|
||||
pub mod external;
|
||||
pub mod fetch;
|
||||
pub mod ingest_log;
|
||||
pub mod ingest_progress;
|
||||
pub mod logging;
|
||||
pub mod pdf_ocr_apply;
|
||||
pub mod reset;
|
||||
pub mod schema;
|
||||
mod staleness;
|
||||
|
||||
pub use app::{App, SearchResponse, short_query_hint};
|
||||
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
|
||||
pub use reset::{ResetReport, ResetScope, enumerate_orphans};
|
||||
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
|
||||
pub use fetch::fetch_with_config;
|
||||
#[doc(hidden)]
|
||||
pub use bulk::{BULK_QUERIES_MAX, bulk_search_with_config};
|
||||
pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config};
|
||||
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
|
||||
pub use fetch::fetch_with_config;
|
||||
pub use ingest_log::{IngestLogWriter, IngestSummary, LogEvent};
|
||||
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
|
||||
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
|
||||
pub use reset::{ResetReport, ResetScope, enumerate_orphans};
|
||||
pub use schema::{
|
||||
Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config,
|
||||
};
|
||||
pub use staleness::{compute_stale, mark_stale_in_place};
|
||||
|
||||
/// p9-fb-25: sentinel for files without an extension in
|
||||
@@ -293,6 +303,24 @@ pub fn ingest_with_config_opts(
|
||||
|
||||
let app = App::open_with_config(config)?;
|
||||
|
||||
// v0.20.x Hook 1: init per-run log writer (None when disabled or on open failure).
|
||||
let log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>> =
|
||||
match crate::ingest_log::IngestLogWriter::open(&app.config.logging) {
|
||||
Ok(Some(w)) => Some(Arc::new(Mutex::new(w))),
|
||||
Ok(None) => None,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
error = %e,
|
||||
"ingest_log: failed to open log file; logging disabled for this run"
|
||||
);
|
||||
None
|
||||
}
|
||||
};
|
||||
let ocr_ms_samples: Arc<Mutex<Vec<u64>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
let ocr_pages_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
|
||||
let ocr_failures_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
|
||||
|
||||
// Walk the workspace.
|
||||
crate::ingest_progress::emit(
|
||||
progress,
|
||||
@@ -300,8 +328,8 @@ pub fn ingest_with_config_opts(
|
||||
root: scope.root.to_string_lossy().into_owned(),
|
||||
},
|
||||
);
|
||||
let connector = FsSourceConnector::new(&app.config)
|
||||
.context("kb-app::ingest: build FsSourceConnector")?;
|
||||
let connector =
|
||||
FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
|
||||
let (assets, fs_skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("kb-app::ingest: scan workspace")?;
|
||||
@@ -312,6 +340,20 @@ pub fn ingest_with_config_opts(
|
||||
},
|
||||
);
|
||||
|
||||
// v0.20.x Hook 4: emit skip events from scan into log writer.
|
||||
if let Some(ref lw) = log_writer {
|
||||
for ev in &fs_skips.events {
|
||||
if let Ok(mut w) = lw.lock() {
|
||||
let _ = w.write_event(&crate::ingest_log::LogEvent::Skip {
|
||||
ts: crate::ingest_log::now_ts(),
|
||||
doc_path: &ev.doc_path,
|
||||
reason: ev.reason,
|
||||
detail: ev.detail.as_deref(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Embedder + vector store: build once at the top so the cold-start
|
||||
// cost is paid once even when the workspace has 1000 markdown files.
|
||||
let embedder = app.embedder()?;
|
||||
@@ -336,18 +378,14 @@ pub fn ingest_with_config_opts(
|
||||
// endpoint) aborts ingest fail-fast — better than silently disabling
|
||||
// OCR/caption mid-run.
|
||||
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
|
||||
Some(
|
||||
OllamaVisionOcr::new(&app.config)
|
||||
.context("kb-app::ingest: build OllamaVisionOcr")?,
|
||||
)
|
||||
Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
|
||||
Some(Box::new(
|
||||
OllamaLanguageModel::new(&app.config)
|
||||
.context("kb-app::ingest: build OllamaLanguageModel for caption")?,
|
||||
))
|
||||
Some(Box::new(OllamaLanguageModel::new(&app.config).context(
|
||||
"kb-app::ingest: build OllamaLanguageModel for caption",
|
||||
)?))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -356,6 +394,29 @@ pub fn ingest_with_config_opts(
|
||||
caption_llm: caption_llm.as_deref(),
|
||||
};
|
||||
|
||||
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
|
||||
// image OCR pattern mirror — per-ingest 1회 build, fallible → fail-fast.
|
||||
let pdf_ocr_engine: Option<OllamaVisionOcr> =
|
||||
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
|
||||
let cfg = &app.config.pdf.ocr;
|
||||
let endpoint = match cfg.endpoint.as_deref() {
|
||||
Some(s) if !s.is_empty() => s.to_string(),
|
||||
_ => app.config.models.llm.endpoint.clone(),
|
||||
};
|
||||
Some(
|
||||
OllamaVisionOcr::from_parts(
|
||||
endpoint,
|
||||
cfg.model.clone(),
|
||||
cfg.languages.clone(),
|
||||
cfg.max_pixels,
|
||||
cfg.request_timeout_secs,
|
||||
)
|
||||
.context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Pre-load every existing doc_id so we can label `IngestItem.kind`
|
||||
// as `New` vs `Updated` correctly. `list_documents` returns one
|
||||
// row per `(workspace_path, asset_id)` — index by the deterministic
|
||||
@@ -381,10 +442,8 @@ pub fn ingest_with_config_opts(
|
||||
// current walker scope (config narrowing / include-glob change) is
|
||||
// NOT purged — we leave it in place to protect against accidental
|
||||
// data loss via config edits.
|
||||
let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> = assets
|
||||
.iter()
|
||||
.map(|a| a.workspace_path.clone())
|
||||
.collect();
|
||||
let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> =
|
||||
assets.iter().map(|a| a.workspace_path.clone()).collect();
|
||||
let purged_deleted_files = sweep_deleted_files(
|
||||
&app,
|
||||
&scanned_paths,
|
||||
@@ -447,6 +506,13 @@ pub fn ingest_with_config_opts(
|
||||
&existing_doc_ids,
|
||||
&image_pipeline,
|
||||
force_reingest,
|
||||
pdf_ocr_engine.as_ref(),
|
||||
progress,
|
||||
opts.cancel.as_ref(),
|
||||
log_writer.clone(),
|
||||
ocr_ms_samples.clone(),
|
||||
ocr_pages_cnt.clone(),
|
||||
ocr_failures_cnt.clone(),
|
||||
);
|
||||
|
||||
let item = match item {
|
||||
@@ -458,6 +524,16 @@ pub fn ingest_with_config_opts(
|
||||
error = %e,
|
||||
"kb-app::ingest: per-file fatal"
|
||||
);
|
||||
// v0.20.x Hook 3: write per-asset error to log writer.
|
||||
if let Some(ref lw) = log_writer {
|
||||
if let Ok(mut w) = lw.lock() {
|
||||
let _ = w.write_event(&crate::ingest_log::LogEvent::Error {
|
||||
ts: crate::ingest_log::now_ts(),
|
||||
code: "ingest_asset_error",
|
||||
message: &format!("{e:#}"),
|
||||
});
|
||||
}
|
||||
}
|
||||
// Note: `error_count += 1` happens below in the
|
||||
// `match item.kind { Error => ... }` arm — incrementing
|
||||
// here too would double-count (a regression first
|
||||
@@ -475,6 +551,8 @@ pub fn ingest_with_config_opts(
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: Vec::new(),
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: Some(format!("{e:#}")),
|
||||
}
|
||||
}
|
||||
@@ -581,8 +659,7 @@ pub fn ingest_with_config_opts(
|
||||
}
|
||||
}
|
||||
|
||||
let duration_ms = u32::try_from(started_instant.elapsed().as_millis())
|
||||
.unwrap_or(u32::MAX);
|
||||
let duration_ms = u32::try_from(started_instant.elapsed().as_millis()).unwrap_or(u32::MAX);
|
||||
let finished_at = time::OffsetDateTime::now_utc();
|
||||
|
||||
// Record the ingest_runs row with aggregate counts.
|
||||
@@ -682,6 +759,29 @@ pub fn ingest_with_config_opts(
|
||||
}
|
||||
}
|
||||
|
||||
// v0.20.x Hook 1 exit: write summary record + flush log writer.
|
||||
if let Some(ref lw) = log_writer {
|
||||
if let Ok(mut w) = lw.lock() {
|
||||
let run_id = w.run_id().to_string();
|
||||
let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
|
||||
let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
|
||||
let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
|
||||
let summary = crate::ingest_log::IngestSummary::new(
|
||||
crate::ingest_log::now_ts(),
|
||||
run_id,
|
||||
scanned_count,
|
||||
new_count,
|
||||
error_count,
|
||||
pages,
|
||||
failures,
|
||||
&ms_samples,
|
||||
started_instant.elapsed().as_millis() as u64,
|
||||
);
|
||||
let _ = w.write_summary(&summary);
|
||||
let _ = w.flush();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(IngestReport {
|
||||
scope,
|
||||
scanned: scanned_count,
|
||||
@@ -840,8 +940,8 @@ fn try_skip_unchanged(
|
||||
|
||||
if stored_is_tier3_fallback {
|
||||
// Embedder version still must match.
|
||||
let embedder_match = existing_doc.last_embedding_version.as_ref()
|
||||
== current_embedding_version;
|
||||
let embedder_match =
|
||||
existing_doc.last_embedding_version.as_ref() == current_embedding_version;
|
||||
if !embedder_match {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -863,6 +963,8 @@ fn try_skip_unchanged(
|
||||
parser_version: Some(existing_doc.parser_version.clone()),
|
||||
chunker_version: existing_doc.last_chunker_version.clone(),
|
||||
warnings: Vec::new(),
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
@@ -883,23 +985,17 @@ fn try_skip_unchanged(
|
||||
// sentinel removes every doc at this path (the new doc_id is
|
||||
// not yet known here — it's computed downstream from the new
|
||||
// PARSER_VERSION).
|
||||
purge_workspace_path_for_parser_bump(app, asset).with_context(|| {
|
||||
format!(
|
||||
"parser-bump orphan purge at {}",
|
||||
asset.workspace_path.0
|
||||
)
|
||||
})?;
|
||||
purge_workspace_path_for_parser_bump(app, asset)
|
||||
.with_context(|| format!("parser-bump orphan purge at {}", asset.workspace_path.0))?;
|
||||
return Ok(None);
|
||||
}
|
||||
// 3. Chunker unchanged.
|
||||
let chunker_match = existing_doc.last_chunker_version.as_ref()
|
||||
== Some(current_chunker_version);
|
||||
let chunker_match = existing_doc.last_chunker_version.as_ref() == Some(current_chunker_version);
|
||||
if !chunker_match {
|
||||
return Ok(None);
|
||||
}
|
||||
// 4. Embedder unchanged.
|
||||
let embedder_match = existing_doc.last_embedding_version.as_ref()
|
||||
== current_embedding_version;
|
||||
let embedder_match = existing_doc.last_embedding_version.as_ref() == current_embedding_version;
|
||||
if !embedder_match {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -921,6 +1017,8 @@ fn try_skip_unchanged(
|
||||
parser_version: Some(existing_doc.parser_version.clone()),
|
||||
chunker_version: existing_doc.last_chunker_version.clone(),
|
||||
warnings: Vec::new(),
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
}))
|
||||
}
|
||||
@@ -933,7 +1031,8 @@ fn try_skip_unchanged(
|
||||
fn ext_for_skip_warning(path: &str) -> String {
|
||||
std::path::Path::new(path)
|
||||
.extension()
|
||||
.and_then(|s| s.to_str()).map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
|
||||
.and_then(|s| s.to_str())
|
||||
.map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
|
||||
}
|
||||
|
||||
/// p9-fb-25: render the `IngestItem.warnings` line for a Skipped
|
||||
@@ -963,6 +1062,13 @@ fn ingest_one_asset(
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
image_pipeline: &ImagePipeline<'_>,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&OllamaVisionOcr>,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
|
||||
ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
|
||||
ocr_pages_cnt: Arc<Mutex<u32>>,
|
||||
ocr_failures_cnt: Arc<Mutex<u32>>,
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
tracing::debug!(
|
||||
target: "kebab-app::ingest",
|
||||
@@ -998,14 +1104,37 @@ fn ingest_one_asset(
|
||||
vector_store,
|
||||
existing_doc_ids,
|
||||
force_reingest,
|
||||
pdf_ocr_engine,
|
||||
progress,
|
||||
cancel,
|
||||
log_writer,
|
||||
ocr_ms_samples,
|
||||
ocr_pages_cnt,
|
||||
ocr_failures_cnt,
|
||||
);
|
||||
}
|
||||
// p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added.
|
||||
MediaType::Code(lang)
|
||||
if matches!(lang.as_str(),
|
||||
"rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin"
|
||||
| "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
|
||||
| "shell" | "c" | "cpp") =>
|
||||
if matches!(
|
||||
lang.as_str(),
|
||||
"rust"
|
||||
| "python"
|
||||
| "typescript"
|
||||
| "javascript"
|
||||
| "go"
|
||||
| "java"
|
||||
| "kotlin"
|
||||
| "yaml"
|
||||
| "dockerfile"
|
||||
| "toml"
|
||||
| "json"
|
||||
| "xml"
|
||||
| "groovy"
|
||||
| "go-mod"
|
||||
| "shell"
|
||||
| "c"
|
||||
| "cpp"
|
||||
) =>
|
||||
{
|
||||
return ingest_one_code_asset(
|
||||
app,
|
||||
@@ -1032,6 +1161,8 @@ fn ingest_one_asset(
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![unsupported_media_warning(&asset.workspace_path.0)],
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1051,6 +1182,8 @@ fn ingest_one_asset(
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec!["kb:// URI not yet supported".to_string()],
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1081,16 +1214,17 @@ fn ingest_one_asset(
|
||||
|
||||
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
|
||||
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
|
||||
let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints)
|
||||
.context("kb-parse-md::parse_frontmatter")?;
|
||||
let (metadata, fm_span, fm_warns) =
|
||||
parse_frontmatter(&bytes, &body_hints).context("kb-parse-md::parse_frontmatter")?;
|
||||
|
||||
let body_offset_lines = match fm_span {
|
||||
Some(span) => count_lines_in(&bytes[..span.end]),
|
||||
None => 0,
|
||||
};
|
||||
|
||||
let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
|
||||
.context("kb-parse-md::parse_blocks")?;
|
||||
let (parsed_blocks, blk_warns) =
|
||||
parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
|
||||
.context("kb-parse-md::parse_blocks")?;
|
||||
|
||||
let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len());
|
||||
all_warnings.extend(fm_warns);
|
||||
@@ -1103,14 +1237,9 @@ fn ingest_one_asset(
|
||||
.map(|w| format!("{:?}: {}", w.kind, w.note))
|
||||
.collect();
|
||||
|
||||
let mut canonical = build_canonical_document(
|
||||
asset,
|
||||
metadata,
|
||||
parsed_blocks,
|
||||
parser_version,
|
||||
all_warnings,
|
||||
)
|
||||
.context("kb-parse-md::build_canonical_document")?;
|
||||
let mut canonical =
|
||||
build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings)
|
||||
.context("kb-parse-md::build_canonical_document")?;
|
||||
|
||||
let chunks = MdHeadingV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
@@ -1177,9 +1306,7 @@ fn ingest_one_asset(
|
||||
dimensions,
|
||||
})
|
||||
.collect();
|
||||
vec_store
|
||||
.upsert(&records)
|
||||
.context("VectorStore::upsert")?;
|
||||
vec_store.upsert(&records).context("VectorStore::upsert")?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1200,6 +1327,8 @@ fn ingest_one_asset(
|
||||
parser_version: Some(parser_version.clone()),
|
||||
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
|
||||
warnings: warning_notes,
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
@@ -1242,9 +1371,9 @@ fn ingest_one_image_asset(
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// URI not yet supported".to_string(),
|
||||
],
|
||||
warnings: vec!["kb:// URI not yet supported".to_string()],
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1354,17 +1483,19 @@ fn ingest_one_image_asset(
|
||||
"image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
|
||||
other.map(|b| std::mem::discriminant(b))
|
||||
);
|
||||
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-app".to_string(),
|
||||
kind: kebab_core::ProvenanceKind::Warning,
|
||||
note: Some(
|
||||
"image document missing leading ImageRef block — OCR/caption skipped"
|
||||
.to_string(),
|
||||
),
|
||||
});
|
||||
warning_notes
|
||||
.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
|
||||
canonical
|
||||
.provenance
|
||||
.events
|
||||
.push(kebab_core::ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-app".to_string(),
|
||||
kind: kebab_core::ProvenanceKind::Warning,
|
||||
note: Some(
|
||||
"image document missing leading ImageRef block — OCR/caption skipped"
|
||||
.to_string(),
|
||||
),
|
||||
});
|
||||
warning_notes.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1455,6 +1586,8 @@ fn ingest_one_image_asset(
|
||||
parser_version: Some(canonical.parser_version.clone()),
|
||||
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
|
||||
warnings: warning_notes,
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
@@ -1510,10 +1643,7 @@ fn record_image_analysis_failure(
|
||||
/// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` /
|
||||
/// `chunks` / `embedding_records`). The `assets` row stays — same
|
||||
/// bytes, same asset_id, only the derived `doc_id` changed.
|
||||
fn purge_workspace_path_for_parser_bump(
|
||||
app: &App,
|
||||
asset: &RawAsset,
|
||||
) -> anyhow::Result<()> {
|
||||
fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow::Result<()> {
|
||||
let path = &asset.workspace_path.0;
|
||||
let stale = app
|
||||
.sqlite
|
||||
@@ -1648,21 +1778,19 @@ fn sweep_deleted_files(
|
||||
}
|
||||
|
||||
// File is truly absent → purge.
|
||||
let chunk_ids = match kebab_store_sqlite::purge_deleted_workspace_path(
|
||||
&app.sqlite,
|
||||
&stored_path,
|
||||
) {
|
||||
Ok(ids) => ids,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
path = %stored_path.0,
|
||||
error = %e,
|
||||
"sweep_deleted_files: purge failed; skipping this path"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let chunk_ids =
|
||||
match kebab_store_sqlite::purge_deleted_workspace_path(&app.sqlite, &stored_path) {
|
||||
Ok(ids) => ids,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
path = %stored_path.0,
|
||||
error = %e,
|
||||
"sweep_deleted_files: purge failed; skipping this path"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Purge associated vectors (best-effort; partial failure
|
||||
// acceptable — orphan vectors get cleaned by `kebab reset
|
||||
@@ -1725,6 +1853,13 @@ fn ingest_one_pdf_asset(
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
force_reingest: bool,
|
||||
pdf_ocr_engine: Option<&OllamaVisionOcr>,
|
||||
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
|
||||
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
|
||||
ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
|
||||
ocr_pages_cnt: Arc<Mutex<u32>>,
|
||||
ocr_failures_cnt: Arc<Mutex<u32>>,
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
let path = match &asset.source_uri {
|
||||
SourceUri::File(p) => p.clone(),
|
||||
@@ -1739,9 +1874,9 @@ fn ingest_one_pdf_asset(
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// URI not yet supported".to_string(),
|
||||
],
|
||||
warnings: vec!["kb:// URI not yet supported".to_string()],
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1778,6 +1913,105 @@ fn ingest_one_pdf_asset(
|
||||
.extract_for(&asset.media_type, &ctx, &bytes)
|
||||
.context("kb-app::extract_for (pdf)")?;
|
||||
|
||||
// v0.20 sub-item 1: post-extract OCR enrichment (PR #187 registry
|
||||
// dispatch invariant 보존 — extract_for 가 normal entry).
|
||||
let (pdf_ocr_pages, pdf_ocr_ms_total): (Option<u32>, Option<u64>) =
|
||||
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
|
||||
match pdf_ocr_engine {
|
||||
Some(engine) => {
|
||||
let ocr_opts = crate::pdf_ocr_apply::PdfOcrOpts {
|
||||
enabled: app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on,
|
||||
always_on: app.config.pdf.ocr.always_on,
|
||||
valid_ratio_threshold: app.config.pdf.ocr.valid_ratio_threshold,
|
||||
min_char_count: app.config.pdf.ocr.min_char_count,
|
||||
lang_hint: app.config.pdf.ocr.lang_hint.clone().map(kebab_core::Lang),
|
||||
cancel: cancel.cloned(),
|
||||
};
|
||||
// v0.20.x Hook 2: pre-clone Arcs for capture by OCR closure.
|
||||
let lw_for_ocr = log_writer.clone();
|
||||
let samples_for_ocr = ocr_ms_samples.clone();
|
||||
let pages_for_ocr = ocr_pages_cnt.clone();
|
||||
let failures_for_ocr = ocr_failures_cnt.clone();
|
||||
let doc_path_for_log = asset.workspace_path.0.clone();
|
||||
|
||||
let summary = crate::pdf_ocr_apply::apply_ocr_to_pdf_pages(
|
||||
&mut canonical,
|
||||
engine,
|
||||
&bytes,
|
||||
&ocr_opts,
|
||||
|p| match p {
|
||||
crate::pdf_ocr_apply::PdfOcrProgress::Started { page } => {
|
||||
if let Some(sender) = progress {
|
||||
let _ = sender.send(
|
||||
crate::ingest_progress::IngestEvent::PdfOcrStarted { page },
|
||||
);
|
||||
}
|
||||
}
|
||||
crate::pdf_ocr_apply::PdfOcrProgress::Finished {
|
||||
page,
|
||||
ms,
|
||||
chars,
|
||||
skipped,
|
||||
image_byte_size,
|
||||
image_width,
|
||||
image_height,
|
||||
ref failure_reason,
|
||||
} => {
|
||||
if let Some(sender) = progress {
|
||||
let _ = sender.send(
|
||||
crate::ingest_progress::IngestEvent::PdfOcrFinished {
|
||||
page,
|
||||
ms,
|
||||
chars,
|
||||
ocr_engine: engine.engine_name().to_string(),
|
||||
skipped,
|
||||
image_byte_size,
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: failure_reason.clone(),
|
||||
},
|
||||
);
|
||||
}
|
||||
// v0.20.x Hook 2: write OCR event to log writer.
|
||||
let success = !skipped && failure_reason.is_none();
|
||||
if let Some(ref lw) = lw_for_ocr {
|
||||
if let Ok(mut w) = lw.lock() {
|
||||
let _ = w.write_event(&crate::ingest_log::LogEvent::Ocr {
|
||||
ts: crate::ingest_log::now_ts(),
|
||||
doc_path: &doc_path_for_log,
|
||||
page,
|
||||
image_byte_size,
|
||||
image_width,
|
||||
image_height,
|
||||
ms,
|
||||
chars,
|
||||
success,
|
||||
reason: failure_reason.as_deref(),
|
||||
ocr_engine: engine.engine_name(),
|
||||
});
|
||||
}
|
||||
}
|
||||
if let Ok(mut p) = pages_for_ocr.lock() {
|
||||
*p += 1;
|
||||
}
|
||||
if success {
|
||||
if let Ok(mut s) = samples_for_ocr.lock() {
|
||||
s.push(ms);
|
||||
}
|
||||
} else if let Ok(mut f) = failures_for_ocr.lock() {
|
||||
*f += 1;
|
||||
}
|
||||
}
|
||||
},
|
||||
)?;
|
||||
(Some(summary.pages_ocrd), Some(summary.ms_total))
|
||||
}
|
||||
None => (Some(0), Some(0)),
|
||||
}
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Per-medium chunker selection: PDF docs always use pdf-page-v1
|
||||
// regardless of `config.chunking.chunker_version`. The chunker
|
||||
// validates every block carries `SourceSpan::Page`; failure here
|
||||
@@ -1818,9 +2052,7 @@ fn ingest_one_pdf_asset(
|
||||
kind: EmbeddingKind::Document,
|
||||
})
|
||||
.collect();
|
||||
let vectors = emb
|
||||
.embed(&inputs)
|
||||
.context("Embedder::embed (pdf chunks)")?;
|
||||
let vectors = emb.embed(&inputs).context("Embedder::embed (pdf chunks)")?;
|
||||
let model_id = emb.model_id();
|
||||
let model_version = emb.model_version();
|
||||
let dimensions = emb.dimensions();
|
||||
@@ -1879,6 +2111,8 @@ fn ingest_one_pdf_asset(
|
||||
parser_version: Some(canonical.parser_version.clone()),
|
||||
chunker_version: Some(chunker.chunker_version()),
|
||||
warnings,
|
||||
pdf_ocr_pages,
|
||||
pdf_ocr_ms_total,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
@@ -1902,7 +2136,7 @@ fn ingest_one_code_asset(
|
||||
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
|
||||
existing_doc_ids: &std::collections::HashSet<String>,
|
||||
force_reingest: bool,
|
||||
code_lang: &str, // <-- NEW (p10-1b Task D)
|
||||
code_lang: &str, // <-- NEW (p10-1b Task D)
|
||||
) -> anyhow::Result<kebab_core::IngestItem> {
|
||||
let path = match &asset.source_uri {
|
||||
SourceUri::File(p) => p.clone(),
|
||||
@@ -1917,9 +2151,9 @@ fn ingest_one_code_asset(
|
||||
chunk_count: None,
|
||||
parser_version: None,
|
||||
chunker_version: None,
|
||||
warnings: vec![
|
||||
"kb:// URI not yet supported".to_string(),
|
||||
],
|
||||
warnings: vec!["kb:// URI not yet supported".to_string()],
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
});
|
||||
}
|
||||
@@ -1927,43 +2161,43 @@ fn ingest_one_code_asset(
|
||||
|
||||
// p10-1b Task D/G/J: parser_version per-lang.
|
||||
let parser_version = match code_lang {
|
||||
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
|
||||
"python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
|
||||
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
|
||||
"python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
|
||||
"typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()),
|
||||
"javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()),
|
||||
"go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()),
|
||||
"java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()),
|
||||
"kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()),
|
||||
// p10-2: Tier 2 has no parse step — sentinel "none-v1".
|
||||
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
|
||||
=> ParserVersion("none-v1".to_string()),
|
||||
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => {
|
||||
ParserVersion("none-v1".to_string())
|
||||
}
|
||||
// p10-3: shell direct routes to Tier 3 (no parse step).
|
||||
"shell" => ParserVersion("none-v1".to_string()),
|
||||
// p10-1D: C + C++ AST extractors.
|
||||
"c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
|
||||
"c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
|
||||
"cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()),
|
||||
other => anyhow::bail!("unsupported code_lang: {other}"),
|
||||
};
|
||||
|
||||
// p10-1b Task D/G/J/L: chunker_version per-lang.
|
||||
let mut chunker_version = match code_lang {
|
||||
"rust" => CodeRustAstV1Chunker.chunker_version(),
|
||||
"python" => CodePythonAstV1Chunker.chunker_version(),
|
||||
"rust" => CodeRustAstV1Chunker.chunker_version(),
|
||||
"python" => CodePythonAstV1Chunker.chunker_version(),
|
||||
"typescript" => CodeTsAstV1Chunker.chunker_version(),
|
||||
"javascript" => CodeJsAstV1Chunker.chunker_version(),
|
||||
"go" => CodeGoAstV1Chunker.chunker_version(),
|
||||
"java" => CodeJavaAstV1Chunker.chunker_version(),
|
||||
"kotlin" => CodeKotlinAstV1Chunker.chunker_version(),
|
||||
"kotlin" => CodeKotlinAstV1Chunker.chunker_version(),
|
||||
// p10-2 Tier 2:
|
||||
"yaml" => K8sManifestResourceV1Chunker.chunker_version(),
|
||||
"yaml" => K8sManifestResourceV1Chunker.chunker_version(),
|
||||
"dockerfile" => DockerfileFileV1Chunker.chunker_version(),
|
||||
"toml" | "json" | "xml" | "groovy" | "go-mod"
|
||||
=> ManifestFileV1Chunker.chunker_version(),
|
||||
"toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker.chunker_version(),
|
||||
// p10-3:
|
||||
"shell" => CodeTextParagraphV1Chunker.chunker_version(),
|
||||
"shell" => CodeTextParagraphV1Chunker.chunker_version(),
|
||||
// p10-1D: C + C++ AST chunkers.
|
||||
"c" => CodeCAstV1Chunker.chunker_version(),
|
||||
"cpp" => CodeCppAstV1Chunker.chunker_version(),
|
||||
"c" => CodeCAstV1Chunker.chunker_version(),
|
||||
"cpp" => CodeCppAstV1Chunker.chunker_version(),
|
||||
other => anyhow::bail!("unreachable chunker_version: {other}"),
|
||||
};
|
||||
|
||||
@@ -2026,8 +2260,12 @@ fn ingest_one_code_asset(
|
||||
// Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate.
|
||||
let mut canonical = match canonical_result {
|
||||
Ok(d) => d,
|
||||
Err(e) if code_lang == "shell"
|
||||
|| matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") =>
|
||||
Err(e)
|
||||
if code_lang == "shell"
|
||||
|| matches!(
|
||||
code_lang,
|
||||
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
|
||||
) =>
|
||||
{
|
||||
return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang");
|
||||
}
|
||||
@@ -2051,7 +2289,10 @@ fn ingest_one_code_asset(
|
||||
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
|
||||
// from the extract_fell_back guard with the !matches! exclusion.
|
||||
let extract_fell_back = canonical.parser_version.0 == "none-v1"
|
||||
&& !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell");
|
||||
&& !matches!(
|
||||
code_lang,
|
||||
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"
|
||||
);
|
||||
|
||||
let chunks_result: anyhow::Result<Vec<Chunk>> = if extract_fell_back {
|
||||
// Tier 1 lang whose extractor errored — go straight to Tier 3 chunker.
|
||||
@@ -2110,7 +2351,7 @@ fn ingest_one_code_asset(
|
||||
// "shell" direct path is already Tier 3 — don't retry-double-up.
|
||||
let chunks: Vec<Chunk> = match chunks_result {
|
||||
Ok(v) if !v.is_empty() => v,
|
||||
other if code_lang == "shell" => other?, // shell propagates directly
|
||||
other if code_lang == "shell" => other?, // shell propagates directly
|
||||
Ok(_empty) => {
|
||||
tracing::warn!(
|
||||
workspace_path = %asset.workspace_path.0,
|
||||
@@ -2134,7 +2375,9 @@ fn ingest_one_code_asset(
|
||||
canonical.parser_version = ParserVersion("none-v1".to_string());
|
||||
CodeTextParagraphV1Chunker
|
||||
.chunk(&canonical, chunk_policy)
|
||||
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")?
|
||||
.context(
|
||||
"kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)",
|
||||
)?
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2226,6 +2469,8 @@ fn ingest_one_code_asset(
|
||||
parser_version: Some(canonical.parser_version.clone()),
|
||||
chunker_version: Some(chunker_version),
|
||||
warnings,
|
||||
pdf_ocr_pages: None,
|
||||
pdf_ocr_ms_total: None,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
@@ -2260,13 +2505,7 @@ fn synthesize_tier2_document(
|
||||
symbol: Some("<file>".to_string()),
|
||||
lang: Some(code_lang.to_string()),
|
||||
};
|
||||
let block_id: BlockId = id_for_block(
|
||||
&doc_id,
|
||||
"code",
|
||||
&[],
|
||||
0,
|
||||
&span,
|
||||
);
|
||||
let block_id: BlockId = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = kebab_core::Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id,
|
||||
@@ -2312,7 +2551,9 @@ fn synthesize_tier2_document(
|
||||
};
|
||||
|
||||
let title = {
|
||||
let fname = asset.workspace_path.0
|
||||
let fname = asset
|
||||
.workspace_path
|
||||
.0
|
||||
.rsplit('/')
|
||||
.next()
|
||||
.unwrap_or(&asset.workspace_path.0);
|
||||
@@ -2558,7 +2799,9 @@ pub fn ask_with_session_with_config(
|
||||
/// `data_dir_writable` check probes the resolved `storage.data_dir`
|
||||
/// from that config (so `--config` users see their custom paths
|
||||
/// reflected in the report rather than the XDG defaults).
|
||||
pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow::Result<DoctorReport> {
|
||||
pub fn doctor_with_config_path(
|
||||
config_path: Option<&std::path::Path>,
|
||||
) -> anyhow::Result<DoctorReport> {
|
||||
tracing::debug!("doctor() invoked");
|
||||
let mut checks = Vec::new();
|
||||
|
||||
@@ -2576,11 +2819,7 @@ pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow:
|
||||
} else if config_path.is_some() {
|
||||
// Explicit `--config <path>` that doesn't exist is a hard error
|
||||
// — defaults would silently mask the user's intent.
|
||||
(
|
||||
false,
|
||||
format!("{} (not found)", cfg_path.display()),
|
||||
None,
|
||||
)
|
||||
(false, format!("{} (not found)", cfg_path.display()), None)
|
||||
} else {
|
||||
// No `--config` and no XDG file: defaults are always loadable.
|
||||
(true, format!("{} (defaults)", cfg_path.display()), None)
|
||||
@@ -2666,16 +2905,18 @@ pub fn ingest_file_with_config(
|
||||
path: &std::path::Path,
|
||||
) -> anyhow::Result<IngestReport> {
|
||||
if !path.exists() {
|
||||
anyhow::bail!("ingest-file: source path does not exist: {}", path.display());
|
||||
anyhow::bail!(
|
||||
"ingest-file: source path does not exist: {}",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
if !path.is_file() {
|
||||
anyhow::bail!("ingest-file: not a regular file: {}", path.display());
|
||||
}
|
||||
|
||||
let ext_raw = path
|
||||
.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("ingest-file: source has no extension: {}", path.display()))?;
|
||||
let ext_raw = path.extension().and_then(|e| e.to_str()).ok_or_else(|| {
|
||||
anyhow::anyhow!("ingest-file: source has no extension: {}", path.display())
|
||||
})?;
|
||||
let ext = ext_raw.to_lowercase();
|
||||
|
||||
const SUPPORTED_EXTS: &[&str] = &["md", "pdf", "png", "jpg", "jpeg"];
|
||||
@@ -2752,11 +2993,7 @@ pub fn ingest_stdin_with_config(
|
||||
let external_dir = crate::external::ensure_external_dir(&workspace_root)?;
|
||||
crate::external::ensure_kebabignore_entry(&workspace_root)?;
|
||||
|
||||
let dest = crate::external::copy_to_external(
|
||||
&external_dir,
|
||||
wrapped.as_bytes(),
|
||||
"md",
|
||||
)?;
|
||||
let dest = crate::external::copy_to_external(&external_dir, wrapped.as_bytes(), "md")?;
|
||||
|
||||
ingest_file_with_config(config, &dest)
|
||||
}
|
||||
@@ -2764,7 +3001,10 @@ pub fn ingest_stdin_with_config(
|
||||
/// Returns true if `source_path` matches any `.kebabignore` pattern
|
||||
/// rooted at `workspace_root`. Used by `ingest_file_with_config` to
|
||||
/// emit a stderr warn before bypassing the ignore.
|
||||
fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::path::Path) -> bool {
|
||||
fn check_kebabignore_match(
|
||||
workspace_root: &std::path::Path,
|
||||
source_path: &std::path::Path,
|
||||
) -> bool {
|
||||
let kebabignore = workspace_root.join(".kebabignore");
|
||||
if !kebabignore.exists() {
|
||||
return false;
|
||||
@@ -2785,5 +3025,7 @@ fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::
|
||||
Ok(m) => m,
|
||||
Err(_) => return false,
|
||||
};
|
||||
matcher.matched(source_path, source_path.is_dir()).is_ignore()
|
||||
matcher
|
||||
.matched(source_path, source_path.is_dir())
|
||||
.is_ignore()
|
||||
}
|
||||
|
||||
@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Default => {
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
}
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
323
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
323
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,323 @@
|
||||
// crates/kebab-app/src/pdf_ocr_apply.rs
//
// PDF post-extract OCR enrichment. To preserve parser isolation — i.e. so that
// kebab-parse-pdf never has to import kebab-parse-image::OcrEngine — this
// helper lives in kebab-app.
// It is the PDF-page variant of the image path's apply_ocr
// (kebab-parse-image::ocr::apply_ocr): the image path mutates
// ImageRefBlock.ocr, whereas the PDF path either mutates
// Block::Paragraph.text / inlines in place (single OCR fallback) or pushes a
// new Block::Paragraph (always_on dual-block).
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
|
||||
SourceSpan, TextBlock, id_for_block,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
|
||||
use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
|
||||
pub struct PdfOcrOpts {
|
||||
/// Master switch. `false` short-circuits to
|
||||
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
|
||||
pub enabled: bool,
|
||||
/// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push).
|
||||
/// `false` → text-detect block 의 `min_char_count` 또는
|
||||
/// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate).
|
||||
pub always_on: bool,
|
||||
/// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가
|
||||
/// 본 임계 미만이면 OCR fallback. Default `0.5`.
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback.
|
||||
/// empty page (cover, blank separator) 자동 skip. Default `20`.
|
||||
pub min_char_count: u32,
|
||||
/// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`).
|
||||
/// `None` → no hint passed to engine.
|
||||
pub lang_hint: Option<Lang>,
|
||||
/// Optional per-page cancellation handle. checked at start of each page
|
||||
/// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4
|
||||
/// + verifier LOW L-1 resolution + spec §4.8 line 1159 명시.
|
||||
pub cancel: Option<Arc<AtomicBool>>,
|
||||
}
|
||||
|
||||
/// Summary of one OCR run, returned by [`apply_ocr_to_pdf_pages`] and used by
/// the caller to fill the `IngestItem.pdf_ocr_pages` and `pdf_ocr_ms_total`
/// wire fields (§4.6.2).
///
/// The type is a plain pair of counters, so it derives the usual value-type
/// traits; `Default` is the "nothing was OCR'd" summary (`0` pages, `0` ms).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct PdfOcrSummary {
    /// Number of pages for which the OCR engine call succeeded. Pages that
    /// were skipped (no image found, or engine error) are not counted.
    pub pages_ocrd: u32,
    /// Cumulative wall-clock duration, in milliseconds, of the successful OCR
    /// engine calls. Accumulated with `saturating_add`, so it saturates
    /// instead of wrapping.
    pub ms_total: u64,
}
|
||||
|
||||
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
|
||||
/// classifies each page via `text_quality::compute_valid_char_ratio` +
|
||||
/// `min_char_count`, and either:
|
||||
/// - skips (vector PDF + sufficient text + `always_on=false`),
|
||||
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
|
||||
/// (scanned/mojibake page), or
|
||||
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
|
||||
/// vector page).
|
||||
///
|
||||
/// Errors:
|
||||
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
|
||||
/// - lopdf re-parse failure → `Err(...)`.
|
||||
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
|
||||
/// event push + `emit_progress(Finished { skipped: true })` + continue
|
||||
/// (no `Err` propagation).
|
||||
///
|
||||
/// See spec §4.1 + §4.4 for the full pipeline.
|
||||
pub fn apply_ocr_to_pdf_pages<F>(
|
||||
canonical: &mut CanonicalDocument,
|
||||
engine: &dyn OcrEngine,
|
||||
pdf_bytes: &[u8],
|
||||
opts: &PdfOcrOpts,
|
||||
mut emit_progress: F,
|
||||
) -> Result<PdfOcrSummary>
|
||||
where
|
||||
F: FnMut(PdfOcrProgress),
|
||||
{
|
||||
if !opts.enabled {
|
||||
return Ok(PdfOcrSummary {
|
||||
pages_ocrd: 0,
|
||||
ms_total: 0,
|
||||
});
|
||||
}
|
||||
let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
|
||||
.context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
|
||||
let page_count = pdf_doc.get_pages().len() as u32;
|
||||
|
||||
let mut new_events: Vec<ProvenanceEvent> = Vec::new();
|
||||
let mut ocr_blocks: Vec<Block> = Vec::new();
|
||||
let mut pages_ocrd: u32 = 0;
|
||||
let mut ms_total: u64 = 0;
|
||||
|
||||
// canonical.blocks 의 page → block index map (text-detect block 의 in-place
|
||||
// mutate 또는 dual-block push 결정용).
|
||||
// PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를
|
||||
// 생성 (§1.4) — 그 invariant 사용.
|
||||
for page_num in 1..=page_count {
|
||||
if let Some(cancel) = &opts.cancel {
|
||||
if cancel.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
|
||||
}
|
||||
}
|
||||
|
||||
let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
|
||||
let text = match &canonical.blocks[text_block_idx] {
|
||||
Block::Paragraph(tb) => tb.text.clone(),
|
||||
_ => String::new(),
|
||||
};
|
||||
let chars = text.chars().count() as u32;
|
||||
let valid_ratio = compute_valid_char_ratio(&text);
|
||||
let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
|
||||
|
||||
// 결정 matrix:
|
||||
// always_on=true → 모든 page OCR (dual-block).
|
||||
// always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
|
||||
// needs_ocr=false → skip.
|
||||
let do_ocr = opts.always_on || needs_ocr;
|
||||
if !do_ocr {
|
||||
continue;
|
||||
}
|
||||
|
||||
emit_progress(PdfOcrProgress::Started { page: page_num });
|
||||
|
||||
let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
|
||||
b
|
||||
} else {
|
||||
let note = format!(
|
||||
"page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: 0,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: None,
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: None,
|
||||
});
|
||||
continue;
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
// OCR failure: warning event + skip (text-detect block 그대로).
|
||||
let note = format!(
|
||||
"page={} OCR failed engine={} version={} err={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
e
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: start.elapsed().as_millis() as u64,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: Some("ocr_error".to_string()),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed_ms = start.elapsed().as_millis() as u64;
|
||||
let chars_ocr = ocr.joined.chars().count() as u32;
|
||||
|
||||
pages_ocrd = pages_ocrd.saturating_add(1);
|
||||
ms_total = ms_total.saturating_add(elapsed_ms);
|
||||
|
||||
if opts.always_on && !needs_ocr {
|
||||
// dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count.
|
||||
let ocr_ordinal = (page_num - 1) + page_count;
|
||||
let span_ocr = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(0),
|
||||
char_end: Some(chars_ocr),
|
||||
};
|
||||
let block_id =
|
||||
id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span_ocr,
|
||||
};
|
||||
ocr_blocks.push(Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text: ocr.joined.clone(),
|
||||
inlines: if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
},
|
||||
}));
|
||||
} else {
|
||||
// in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체.
|
||||
// block_id / ordinal 보존 — span 의 char_end 만 갱신.
|
||||
if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
|
||||
tb.text = ocr.joined.clone();
|
||||
tb.inlines = if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
};
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(chars_ocr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::OcrApplied,
|
||||
note: Some(format!(
|
||||
"page={} engine={} version={} regions={} ms={} chars={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
ocr.regions.len(),
|
||||
elapsed_ms,
|
||||
chars_ocr
|
||||
)),
|
||||
});
|
||||
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: elapsed_ms,
|
||||
chars: chars_ocr,
|
||||
skipped: false,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: None,
|
||||
});
|
||||
}
|
||||
|
||||
canonical.blocks.extend(ocr_blocks);
|
||||
canonical.provenance.events.extend(new_events);
|
||||
Ok(PdfOcrSummary {
|
||||
pages_ocrd,
|
||||
ms_total,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
|
||||
blocks
|
||||
.iter()
|
||||
.position(|b| match b {
|
||||
Block::Paragraph(tb) => matches!(
|
||||
tb.common.source_span,
|
||||
SourceSpan::Page { page, .. } if page == page_num
|
||||
),
|
||||
_ => false,
|
||||
})
|
||||
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
|
||||
}
|
||||
|
||||
/// Per-page OCR progress event, delivered to the caller through its
/// `emit_progress` closure.
///
/// According to the original note, step 6's `ingest_one_pdf_asset` carries
/// these as `IngestEvent::PdfOcrStarted` / `IngestEvent::PdfOcrFinished`
/// (spec §4.6.1 wire schema).
///
/// The derives make the event loggable with `{:?}`, cloneable when it has to
/// be fanned out, and directly comparable in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfOcrProgress {
    /// Emitted when OCR starts for a page, right before `engine.recognize`
    /// is called.
    Started {
        /// 1-based PDF page number.
        page: u32,
    },
    /// Emitted when OCR ends for a page (success, skip, and failure alike).
    Finished {
        /// 1-based PDF page number.
        page: u32,
        /// Wall-clock duration of `engine.recognize` in milliseconds. On the
        /// skip paths the meaning is mixed: `0` when DCTDecode is absent, the
        /// actual latency before bailing when the OCR engine failed.
        ms: u64,
        /// Character count of the OCR result text. `0` when skipped.
        chars: u32,
        /// `true` = skipped because DCTDecode is absent or the OCR engine
        /// failed. `false` = OCR completed normally.
        skipped: bool,
        /// v0.20.x ingest log: raster image byte size (additive, optional).
        image_byte_size: Option<u64>,
        /// v0.20.x ingest log: raster image width in pixels (additive, optional).
        image_width: Option<u32>,
        /// v0.20.x ingest log: raster image height in pixels (additive, optional).
        image_height: Option<u32>,
        /// v0.20.x ingest log: failure reason string when OCR failed
        /// (additive, optional).
        /// Values: "timeout" | "ocr_error" | "network_error" | `None`.
        /// `None` is used on success; the skip path that reports `ms: 0`
        /// also passes `None`, so `None` alone does not imply success —
        /// check `skipped` as well.
        failure_reason: Option<String>,
    },
}
|
||||
@@ -85,8 +85,7 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
|
||||
ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
|
||||
ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
|
||||
ResetScope::VectorOnly => {
|
||||
let vector_dir =
|
||||
expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
vec![vector_dir]
|
||||
}
|
||||
ResetScope::ConfigOnly => vec![cfg_dir],
|
||||
@@ -137,8 +136,8 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
||||
/// the double scan is acceptable for a rare destructive operation.
|
||||
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
use kebab_core::DocumentStore as _;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
use kebab_core::SourceScope;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("enumerate_orphans: open SqliteStore")?;
|
||||
@@ -160,16 +159,13 @@ pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let connector = FsSourceConnector::new(cfg)
|
||||
.context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let connector =
|
||||
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let (assets, _skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("enumerate_orphans: scan workspace")?;
|
||||
|
||||
let scanned: HashSet<WorkspacePath> = assets
|
||||
.into_iter()
|
||||
.map(|a| a.workspace_path)
|
||||
.collect();
|
||||
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
|
||||
|
||||
let mut orphans: Vec<WorkspacePath> = stored
|
||||
.into_iter()
|
||||
@@ -206,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if !p.exists() {
|
||||
continue;
|
||||
}
|
||||
std::fs::remove_dir_all(p)
|
||||
.with_context(|| format!("remove {}", p.display()))?;
|
||||
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
|
||||
removed.push(p.clone());
|
||||
}
|
||||
|
||||
@@ -229,8 +224,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
|
||||
/// current walker scope without touching any filesystem directory.
|
||||
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
|
||||
let orphans = enumerate_orphans(cfg)
|
||||
.context("execute_orphans_only: enumerate orphans")?;
|
||||
let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
|
||||
|
||||
if orphans.is_empty() {
|
||||
return Ok(ResetReport {
|
||||
|
||||
@@ -39,6 +39,14 @@ pub struct Capabilities {
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
||||
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
||||
#[serde(default)]
|
||||
pub active_parsers: Vec<String>,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
||||
/// 빈 corpus → empty Vec.
|
||||
#[serde(default)]
|
||||
pub active_chunkers: Vec<String>,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
@@ -142,10 +150,10 @@ fn capabilities_snapshot() -> Capabilities {
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
streaming_ask: true,
|
||||
http_daemon: false,
|
||||
mcp_server: true,
|
||||
single_file_ingest: false,
|
||||
single_file_ingest: true,
|
||||
bulk_search: true,
|
||||
}
|
||||
}
|
||||
@@ -160,12 +168,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(
|
||||
cfg: &Config,
|
||||
store: &kebab_store_sqlite::SqliteStore,
|
||||
) -> anyhow::Result<Stats> {
|
||||
let counts = store
|
||||
.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
||||
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
||||
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
||||
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
||||
@@ -190,12 +194,16 @@ fn collect_stats(
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
||||
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
||||
Models {
|
||||
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
@@ -268,3 +276,27 @@ mod tests_stats_ext {
|
||||
assert_eq!(s.stats.stale_doc_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests_capabilities {
    use super::capabilities_snapshot;

    /// Bug #9 regression: `kebab ask --stream` correctly emits
    /// answer_event.v1 ndjson (191 events), so the advertised
    /// `capabilities.streaming_ask` flag has to be `true`.
    #[test]
    fn capabilities_streaming_ask_matches_cli_surface() {
        let advertised = capabilities_snapshot().streaming_ask;
        assert!(advertised, "streaming_ask must be true (Bug #9)");
    }

    /// Bug #9 regression: both `kebab ingest-file <path>` and
    /// `kebab ingest-stdin --title <T>` correctly emit ingest_report.v1, so
    /// the advertised `capabilities.single_file_ingest` flag has to be `true`.
    #[test]
    fn capabilities_single_file_ingest_matches_cli_surface() {
        let advertised = capabilities_snapshot().single_file_ingest;
        assert!(advertised, "single_file_ingest must be true (Bug #9)");
    }
}
|
||||
|
||||
@@ -10,11 +10,7 @@ use kebab_core::SearchHit;
|
||||
///
|
||||
/// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
|
||||
/// rule prevents `kebab-rag → kebab-app`). Update both together.
|
||||
pub fn compute_stale(
|
||||
indexed_at: OffsetDateTime,
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) -> bool {
|
||||
pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
|
||||
if threshold_days == 0 {
|
||||
return false;
|
||||
}
|
||||
@@ -23,11 +19,7 @@ pub fn compute_stale(
|
||||
}
|
||||
|
||||
/// Sets `stale` on each hit in place using `compute_stale`.
|
||||
pub fn mark_stale_in_place(
|
||||
hits: &mut [SearchHit],
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) {
|
||||
pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
|
||||
for h in hits {
|
||||
h.stale = compute_stale(h.indexed_at, now, threshold_days);
|
||||
}
|
||||
|
||||
@@ -29,9 +29,8 @@ fn rust_file_ingests_and_searches_as_code_citation() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
let items = report.items.as_ref().expect("items present");
|
||||
@@ -127,9 +126,8 @@ fn rust_code_search_hit_has_repo() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul"))
|
||||
@@ -147,8 +145,7 @@ fn rust_code_search_hit_has_repo() {
|
||||
.and_then(|n| n.to_str())
|
||||
.map(str::to_owned);
|
||||
assert_eq!(
|
||||
h.repo,
|
||||
expected_repo,
|
||||
h.repo, expected_repo,
|
||||
"SearchHit.repo must match the workspace dir name (detect_repo result)"
|
||||
);
|
||||
// Also sanity-check code_lang is still filled.
|
||||
@@ -177,9 +174,8 @@ fn python_file_ingests_and_searches_as_code_citation() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
assert!(report.new >= 1, "python file ingested: {report:?}");
|
||||
|
||||
@@ -254,9 +250,8 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
assert!(report.new >= 1, "ts file ingested: {report:?}");
|
||||
|
||||
@@ -331,9 +326,8 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
assert!(report.new >= 1, "js file ingested: {report:?}");
|
||||
|
||||
@@ -515,7 +509,11 @@ fn java_file_ingests_and_searches_as_code_citation() {
|
||||
line_start,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'");
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("java"),
|
||||
"citation.lang must be 'java'"
|
||||
);
|
||||
assert_eq!(
|
||||
symbol.as_deref(),
|
||||
Some("com.foo.Foo.bar"),
|
||||
@@ -586,7 +584,11 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
|
||||
line_start,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(lang.as_deref(), Some("kotlin"), "citation.lang must be 'kotlin'");
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("kotlin"),
|
||||
"citation.lang must be 'kotlin'"
|
||||
);
|
||||
assert_eq!(
|
||||
symbol.as_deref(),
|
||||
Some("com.foo.Foo.bar"),
|
||||
@@ -651,8 +653,8 @@ fn tier2_k8s_yaml_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -666,7 +668,11 @@ fn tier2_k8s_yaml_ingest_searchable() {
|
||||
line_start,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(lang.as_deref(), Some("yaml"), "citation.lang must be 'yaml'");
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("yaml"),
|
||||
"citation.lang must be 'yaml'"
|
||||
);
|
||||
assert_eq!(
|
||||
symbol.as_deref(),
|
||||
Some("Deployment/prod/api"),
|
||||
@@ -730,8 +736,8 @@ fn tier2_dockerfile_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -813,8 +819,8 @@ fn tier2_cargo_toml_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -896,8 +902,8 @@ fn tier3_shell_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -987,8 +993,8 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -1031,14 +1037,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
|
||||
fn rust_file_re_ingest_is_unchanged() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
std::fs::write(
|
||||
env.workspace_root.join("stable.rs"),
|
||||
"pub fn noop() {}\n",
|
||||
)
|
||||
.unwrap();
|
||||
std::fs::write(env.workspace_root.join("stable.rs"), "pub fn noop() {}\n").unwrap();
|
||||
|
||||
let r1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let item1 = r1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -1049,8 +1050,7 @@ fn rust_file_re_ingest_is_unchanged() {
|
||||
.unwrap();
|
||||
assert_eq!(item1.kind, IngestItemKind::New);
|
||||
|
||||
let r2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let item2 = r2
|
||||
.items
|
||||
.unwrap()
|
||||
@@ -1081,9 +1081,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -1093,7 +1092,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
|
||||
.expect("docker-compose.yml in first report");
|
||||
assert!(
|
||||
matches!(item1.kind, IngestItemKind::New),
|
||||
"first ingest must be New, got {:?}", item1.kind
|
||||
"first ingest must be New, got {:?}",
|
||||
item1.kind
|
||||
);
|
||||
assert_eq!(
|
||||
item1.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
@@ -1101,9 +1101,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
|
||||
"first ingest must use Tier 3 fallback chunker"
|
||||
);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let item2 = report2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -1113,7 +1112,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
|
||||
.expect("docker-compose.yml in second report");
|
||||
assert!(
|
||||
matches!(item2.kind, IngestItemKind::Unchanged),
|
||||
"second ingest must be Unchanged, got {:?}", item2.kind
|
||||
"second ingest must be Unchanged, got {:?}",
|
||||
item2.kind
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1163,8 +1163,8 @@ fn tier1_c_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -1247,8 +1247,8 @@ fn tier1_cpp_ingest_searchable() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
|
||||
let h = hits
|
||||
.iter()
|
||||
@@ -1266,7 +1266,9 @@ fn tier1_cpp_ingest_searchable() {
|
||||
// Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar"
|
||||
// (method) depending on which chunk ranks first.
|
||||
assert!(
|
||||
symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
|
||||
symbol
|
||||
.as_deref()
|
||||
.is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
|
||||
"C++ symbol must start with namespace::Class prefix, got {symbol:?}"
|
||||
);
|
||||
assert!(*line_start >= 1, "line_start must be >=1");
|
||||
@@ -1335,8 +1337,8 @@ fn tier2_k8s_multi_resource_yaml_ingests_without_collision() {
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
|
||||
assert!(
|
||||
hits.len() >= 2,
|
||||
"expected ≥2 hits (Deployment + Service), got {}",
|
||||
@@ -1359,9 +1361,8 @@ fn tier3_shell_reingest_is_unchanged() {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest");
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -1371,12 +1372,12 @@ fn tier3_shell_reingest_is_unchanged() {
|
||||
.expect("deploy.sh in first report");
|
||||
assert!(
|
||||
matches!(item1.kind, IngestItemKind::New),
|
||||
"first ingest must be New, got {:?}", item1.kind
|
||||
"first ingest must be New, got {:?}",
|
||||
item1.kind
|
||||
);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest");
|
||||
let item2 = report2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -1386,6 +1387,7 @@ fn tier3_shell_reingest_is_unchanged() {
|
||||
.expect("deploy.sh in second report");
|
||||
assert!(
|
||||
matches!(item2.kind, IngestItemKind::Unchanged),
|
||||
"shell reingest must be Unchanged, got {:?}", item2.kind
|
||||
"shell reingest must be Unchanged, got {:?}",
|
||||
item2.kind
|
||||
);
|
||||
}
|
||||
|
||||
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Lang, OcrText};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
|
||||
/// Test double for an OCR engine: replays pre-seeded texts, or fails on
/// demand.
pub struct MockOcrEngine {
    /// Texts handed back by successive `recognize` calls, in order.
    expected_texts: Vec<String>,
    /// Cursor into `expected_texts`; behind a `Mutex` because `recognize`
    /// only receives `&self`.
    call_index: Mutex<usize>,
    /// When `true`, every `recognize` call returns an error.
    fail: bool,
}

impl MockOcrEngine {
    /// Builds a mock seeded with a single text (backward-compat constructor
    /// for the 10 call sites in pdf_ocr_apply.rs).
    pub fn single(text: impl Into<String>, fail: bool) -> Self {
        Self::per_page(vec![text.into()], fail)
    }

    /// Builds a mock seeded with per-page texts; the cursor advances once per
    /// `recognize` call and starts at the first entry.
    pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
        let call_index = Mutex::new(0);
        Self {
            expected_texts: texts,
            call_index,
            fail,
        }
    }
}
|
||||
|
||||
impl OcrEngine for MockOcrEngine {
|
||||
fn engine_name(&self) -> &'static str {
|
||||
"mock-ocr"
|
||||
}
|
||||
|
||||
fn engine_version(&self) -> String {
|
||||
"mock-v1".to_string()
|
||||
}
|
||||
|
||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||
if self.fail {
|
||||
anyhow::bail!("mock failure");
|
||||
}
|
||||
let mut idx = self.call_index.lock().unwrap();
|
||||
let text = self
|
||||
.expected_texts
|
||||
.get(*idx)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
|
||||
*idx += 1;
|
||||
Ok(OcrText {
|
||||
joined: text,
|
||||
regions: vec![],
|
||||
engine: "mock-ocr".to_string(),
|
||||
engine_version: "mock-v1".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -93,8 +93,7 @@ impl TestEnv {
|
||||
/// directly. Caller can invoke this multiple times to simulate
|
||||
/// re-opening the binary after a corpus revision bump.
|
||||
pub fn app(&self) -> kebab_app::App {
|
||||
kebab_app::App::open_with_config(self.config.clone())
|
||||
.expect("App::open_with_config")
|
||||
kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock_ocr;
|
||||
|
||||
@@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App {
|
||||
#[test]
|
||||
fn fetch_chunk_returns_target_only_when_no_context() {
|
||||
let env = common::TestEnv::new();
|
||||
common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
|
||||
common::ingest_md(
|
||||
&env,
|
||||
"a.md",
|
||||
"# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
|
||||
);
|
||||
let app = open(&env);
|
||||
|
||||
// Find a chunk via search to obtain its id.
|
||||
@@ -42,7 +46,8 @@ fn fetch_chunk_with_context_returns_neighbors() {
|
||||
// match. The earlier fixture used 2-char tokens like `A1`/`A3` for
|
||||
// section bodies — those zero-hit under trigram. Use 5-char unique
|
||||
// words per section so the query can pin one chunk deterministically.
|
||||
let body = "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
|
||||
let body =
|
||||
"# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
|
||||
common::ingest_md(&env, "multi.md", body);
|
||||
let app = env.app();
|
||||
|
||||
@@ -110,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() {
|
||||
.unwrap();
|
||||
assert_eq!(result.kind, FetchKind::Doc);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.contains("Heading One"), "doc text contains heading: {text:?}");
|
||||
assert!(
|
||||
text.contains("Heading One"),
|
||||
"doc text contains heading: {text:?}"
|
||||
);
|
||||
assert!(text.contains("First paragraph"), "doc text contains body");
|
||||
assert!(!result.truncated);
|
||||
}
|
||||
@@ -155,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() {
|
||||
.unwrap();
|
||||
assert!(result.truncated);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
|
||||
assert!(
|
||||
text.chars().count() <= 100,
|
||||
"trimmed text len {}",
|
||||
text.chars().count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -292,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() {
|
||||
fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
|
||||
let env = common::TestEnv::new();
|
||||
// Multi-chunk markdown so context ±N has neighbors.
|
||||
let body =
|
||||
"# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
common::ingest_md(&env, "boundary.md", body);
|
||||
let app = env.app();
|
||||
let q = kebab_core::SearchQuery {
|
||||
|
||||
@@ -16,8 +16,8 @@
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
|
||||
|
||||
/// Helper: open the store via `TestEnv` and run `list_documents`.
|
||||
@@ -125,17 +125,10 @@ fn include_scope_narrowing_does_not_purge() {
|
||||
include: vec!["**/*.rs".to_string()],
|
||||
exclude: env.config.workspace.exclude.clone(),
|
||||
};
|
||||
let first = ingest_with_config_opts(
|
||||
env.config.clone(),
|
||||
wide_scope,
|
||||
false,
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.expect("first ingest (wide) must succeed");
|
||||
assert!(
|
||||
first.new >= 2,
|
||||
"expected at least 2 new docs: {first:?}"
|
||||
);
|
||||
let first =
|
||||
ingest_with_config_opts(env.config.clone(), wide_scope, false, IngestOpts::default())
|
||||
.expect("first ingest (wide) must succeed");
|
||||
assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
|
||||
assert_eq!(
|
||||
first.purged_deleted_files, 0,
|
||||
"no purges on first ingest: {first:?}"
|
||||
|
||||
@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
/// inspectable in stored DB rows.
|
||||
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
|
||||
use image::{ImageBuffer, Rgb};
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let path = root.join(name);
|
||||
img.save(&path).expect("write PNG fixture");
|
||||
path
|
||||
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Counters: scanned should include the PNG; new ≥ 1 (markdown
|
||||
// fixtures from the workspace tree may also count).
|
||||
assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
|
||||
assert!(
|
||||
report.scanned >= 1,
|
||||
"scanned={}, items={:?}",
|
||||
report.scanned,
|
||||
report.items
|
||||
);
|
||||
assert_eq!(report.errors, 0, "no errors on lenient OCR path");
|
||||
|
||||
// Locate the image doc in the report items.
|
||||
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
kebab_core::IngestItemKind::New,
|
||||
"image asset must be classified New on first ingest"
|
||||
);
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
|
||||
assert_eq!(
|
||||
img_item.chunk_count,
|
||||
Some(1),
|
||||
"image emits exactly one chunk"
|
||||
);
|
||||
|
||||
// Inspect the stored chunk text via kb-app's inspect_chunk facade.
|
||||
let doc_id = img_item.doc_id.clone().expect("image doc id");
|
||||
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Sanity: the doc was actually persisted into SQLite (kb-app's
|
||||
// list_docs facade reads the same store the chunker writes to).
|
||||
let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.expect("list_docs");
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
|
||||
assert!(
|
||||
summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
summaries
|
||||
.iter()
|
||||
.any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
"image doc must appear in list_docs"
|
||||
);
|
||||
|
||||
@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("diagram.png"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
let report = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false)
|
||||
.expect("ingest with no OCR/caption")
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
|
||||
})
|
||||
.await
|
||||
.expect("task");
|
||||
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
.find(|i| i.doc_path.0.ends_with("raw.png"))
|
||||
.unwrap();
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
|
||||
let scope1 = scope.clone();
|
||||
let scope2 = scope.clone();
|
||||
|
||||
let r1 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let id1 = r1
|
||||
.items
|
||||
|
||||
@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
// First ingest — populates the DB. Use the legacy entry so the
|
||||
// assertions cover the "previously ingested" set without needing
|
||||
// IngestOpts::default() to behave identically.
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
first.unchanged, 0,
|
||||
"first ingest cannot have unchanged: {first:?}"
|
||||
);
|
||||
|
||||
let scanned = first.scanned;
|
||||
|
||||
@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
|
||||
assert_eq!(
|
||||
second.scanned, scanned,
|
||||
"second scanned matches first: {second:?}"
|
||||
);
|
||||
assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
|
||||
assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"nothing should be marked updated: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.unchanged, scanned,
|
||||
"every doc must be Unchanged: {second:?}"
|
||||
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
fn force_reingest_bypasses_skip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
let scanned = first.scanned;
|
||||
|
||||
let second = ingest_with_config_opts(
|
||||
|
||||
@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
|
||||
// ingest_with_config_progress (no cancel) runs to completion.
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {
|
||||
|
||||
let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
|
||||
assert!(err.to_string().contains("unsupported extension"), "{err}");
|
||||
assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
|
||||
assert!(
|
||||
err.to_string().contains(".docx") || err.to_string().contains("docx"),
|
||||
"{err}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8,8 +8,7 @@ use common::TestEnv;
|
||||
#[test]
|
||||
fn ingest_then_list_inspects_round_trip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
// The fixture has 3 markdown files; first ingest should label them
|
||||
// all as New.
|
||||
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
}
|
||||
|
||||
// list_docs returns the 3 docs.
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3, "docs: {docs:?}");
|
||||
|
||||
// inspect_doc round-trips one of them.
|
||||
let any_doc_id = docs[0].doc_id.clone();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
|
||||
.unwrap();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
|
||||
assert_eq!(canonical.doc_id, any_doc_id);
|
||||
assert!(!canonical.blocks.is_empty(), "blocks empty");
|
||||
}
|
||||
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
fn ingest_idempotent_on_second_run() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let r1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(r1.new, 3);
|
||||
|
||||
let r2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
// Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
|
||||
// path: when checksum + parser/chunker/embedding versions all match,
|
||||
// the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
|
||||
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
|
||||
assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");
|
||||
|
||||
// list_docs still has 3 docs (no duplicates).
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_summary_only_drops_items() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert!(report.items.is_none(), "summary-only should null items");
|
||||
}
|
||||
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
|
||||
// of every run. `summary_only=true` writes `items_json=NULL`; the
|
||||
// counts MUST still be present.
|
||||
let env = TestEnv::lexical_only();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("kebab.sqlite");
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
|
||||
let (scanned, new_c, updated, skipped, errors, items_json): (
|
||||
i64,
|
||||
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
|
||||
// tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
|
||||
// tables under it).
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "lexical-only run must not error");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("lancedb");
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
|
||||
if lance_dir.exists() {
|
||||
// If the dir was created (e.g., by an earlier consumer touching
|
||||
// the path), it MUST contain no `.lance` tables.
|
||||
let mut had_lance_table = false;
|
||||
for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
|
||||
let entry = entry.unwrap();
|
||||
if entry
|
||||
.path()
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
== Some("lance")
|
||||
{
|
||||
if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
|
||||
had_lance_table = true;
|
||||
break;
|
||||
}
|
||||
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
|
||||
tags_any: vec!["rust".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let rust_docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
// intro.md and notes/cargo.md both tag "rust".
|
||||
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
|
||||
}
|
||||
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
|
||||
#[test]
|
||||
fn inspect_doc_not_found_returns_actionable_error() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bogus =
|
||||
kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
|
||||
let bogus = kebab_core::DocumentId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(
|
||||
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
|
||||
let bogus = kebab_core::ChunkId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
|
||||
.unwrap_err();
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("not found"), "got: {msg}");
|
||||
}
|
||||
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
|
||||
#[test]
|
||||
fn ingest_stamps_chunker_version_on_document() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert!(!docs.is_empty(), "no docs after ingest");
|
||||
|
||||
for doc_entry in &docs {
|
||||
let canonical =
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
|
||||
.unwrap();
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
|
||||
assert!(
|
||||
canonical.last_chunker_version.is_some(),
|
||||
"last_chunker_version must be stamped for doc {}: got {:?}",
|
||||
|
||||
169
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
169
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
@@ -0,0 +1,169 @@
|
||||
// crates/kebab-app/tests/ingest_log_smoke.rs
|
||||
//
|
||||
// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config_opts};
|
||||
use kebab_config::{Config, LoggingCfg};
|
||||
use kebab_core::SourceScope;
|
||||
use serde_json::Value;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
|
||||
let data_dir = workspace.parent().unwrap().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = workspace.parent().unwrap().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
};
|
||||
cfg
|
||||
}
|
||||
|
||||
/// AC-9 smoke test: after an ingest with logging enabled, the log directory
/// holds exactly one `ingest-*.ndjson` file, every line in it is JSON carrying
/// a recognised `kind`, and the final line is a `summary` with `scanned > 0`.
#[test]
fn ingest_log_smoke() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    // 1. Minimal corpus: one markdown file, plus the scanned-PDF fixture when
    //    it can be found relative to the current directory. No OCR settings
    //    are changed by this test.
    let markdown = "# Hello\n\nThis is a smoke test.\n";
    std::fs::write(workspace.join("hello.md"), markdown).unwrap();
    let pdf_src = PathBuf::from("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    if pdf_src.exists() {
        std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
    }

    // 2. Config with logging enabled.
    let cfg = minimal_config(&workspace, &log_dir);
    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    // 3. Run ingest.
    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // 4. Gather the `ingest-*.ndjson` entries of the log directory; entries
    //    that fail to read are ignored.
    let mut log_files = Vec::new();
    for entry in std::fs::read_dir(&log_dir).unwrap() {
        let Ok(entry) = entry else { continue };
        let os_name = entry.file_name();
        let name = os_name.to_string_lossy();
        if name.starts_with("ingest-") && name.ends_with(".ndjson") {
            log_files.push(entry);
        }
    }
    assert_eq!(
        log_files.len(),
        1,
        "expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
    );

    // 5. Every line must parse as JSON and carry a known `kind`.
    let body = std::fs::read_to_string(log_files[0].path()).unwrap();
    let lines: Vec<&str> = body.lines().collect();
    assert!(!lines.is_empty(), "log file should not be empty");

    let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
    for &line in &lines {
        let record: Value = match serde_json::from_str(line) {
            Ok(parsed) => parsed,
            Err(e) => panic!("line is not valid JSON: {e}\nline: {line}"),
        };
        let Some(kind) = record.get("kind").and_then(Value::as_str) else {
            panic!("line missing 'kind' field: {line}");
        };
        assert!(
            valid_kinds.contains(&kind),
            "unexpected kind '{kind}' in line: {line}"
        );
    }

    // 6. The closing line must be the summary record with scanned > 0.
    //    Indexing is safe: `lines` was asserted non-empty above.
    let last = lines[lines.len() - 1];
    let summary: Value = serde_json::from_str(last).unwrap();
    assert_eq!(
        summary.get("kind").and_then(Value::as_str),
        Some("summary"),
        "last line must be kind=summary, got: {last}"
    );
    let scanned = summary
        .get("scanned")
        .and_then(Value::as_u64)
        .unwrap_or(0);
    assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
}
|
||||
|
||||
/// AC-6: with `ingest_log_enabled = false`, an ingest must not create any
/// `ingest-*.ndjson` file.
///
/// The configuration is produced by `minimal_config` and then only the
/// logging switch is flipped, so this test differs from the enabled-log smoke
/// test in nothing but the flag under test.
#[test]
fn ingest_log_disabled_emits_no_file() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nDisabled log test.\n",
    )
    .unwrap();

    // Same setup as the enabled case (data/model dirs beside the workspace,
    // no embedding provider), with logging turned off. The log directory is
    // still configured so a stray file would be detectable.
    let mut cfg = minimal_config(&workspace, &log_dir);
    cfg.logging.ingest_log_enabled = false;

    let scope = SourceScope {
        root: workspace,
        exclude: vec![],
        ..Default::default()
    };

    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // log_dir should either not exist or contain 0 ingest-*.ndjson files.
    let log_file_count = if log_dir.exists() {
        std::fs::read_dir(&log_dir)
            .unwrap()
            .filter_map(Result::ok)
            .filter(|entry| {
                // Convert the file name once and test both ends of it.
                let os_name = entry.file_name();
                let name = os_name.to_string_lossy();
                name.starts_with("ingest-") && name.ends_with(".ndjson")
            })
            .count()
    } else {
        0
    };
    assert_eq!(
        log_file_count, 0,
        "no ingest-*.ndjson file should be created when disabled"
    );
}
|
||||
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
|
||||
//!
|
||||
//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
|
||||
//! Manual invoke:
|
||||
//! KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
//! cargo test -p kebab-app --test ingest_pdf_ocr_smoke -j 4 -- --ignored
|
||||
//!
|
||||
//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
|
||||
//! to verify the cancel wiring doesn't panic/deadlock.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Returns the Ollama endpoint used by the OCR smoke tests.
///
/// Reads `KEBAB_PDF_OCR_ENDPOINT`; when the variable is missing or cannot be
/// read as Unicode, falls back to `http://localhost:11434`.
fn ollama_endpoint() -> String {
    match std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        Ok(endpoint) => endpoint,
        Err(_) => String::from("http://localhost:11434"),
    }
}
|
||||
|
||||
fn make_ocr_env_real() -> TestEnv {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.models.embedding.provider = "none".to_string();
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let dest = env.workspace_root.join("scanned_page1.pdf");
|
||||
std::fs::copy(&src, &dest).expect("copy scanned_page1.pdf to workspace");
|
||||
|
||||
env
|
||||
}
|
||||
|
||||
/// § Acceptance §9 #1 — after an OCR-enabled ingest, the PDF's ingest item
/// reports `pdf_ocr_pages == 1`.
///
/// Despite "mock" in the name, the environment is built by
/// `make_ocr_env_real`, so the test is ignored unless run manually against a
/// live endpoint.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
    let env = make_ocr_env_real();

    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
    assert!(report.new >= 1, "at least one PDF ingested: {report:?}");

    // Find the PDF entry among the per-item results.
    let items = report.items.unwrap_or_default();
    let Some(pdf_item) = items.iter().find(|item| item.doc_path.0.ends_with(".pdf")) else {
        panic!("PDF item must appear in ingest report items: {items:?}");
    };

    // The OCR page count must be present and equal to the fixture's page count.
    let Some(ocr_pages) = pdf_item.pdf_ocr_pages else {
        panic!("pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}");
    };
    assert_eq!(ocr_pages, 1, "scanned_page1.pdf has exactly 1 page");
}
|
||||
|
||||
/// § Acceptance §9 #2 — text produced by OCR is indexed and can be found
/// through a lexical search.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ocr_text_indexed_and_searchable() {
    let env = make_ocr_env_real();

    kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");

    // "다음" is a Korean token that, per the original author, occurs
    // frequently in the ground-truth text of the fixture page, so the OCR
    // output is expected to contain it.
    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("다음"))
        .expect("search");

    assert!(
        !hits.is_empty(),
        "OCR-indexed text must surface in lexical search results"
    );
}
|
||||
|
||||
/// Cancel-wiring smoke test: runs a cancellable ingest with the cancel flag
/// already set and OCR pointed at a dummy endpoint (`127.0.0.1:1`).
///
/// The intent is that the pre-set flag stops the run before any OCR request is
/// made. The outcome is deliberately not inspected: the test only checks that
/// the call returns without panicking or hanging.
#[test]
fn ingest_with_cancel_aborts_mid_pdf() {
    let mut env = TestEnv::lexical_only();
    env.config.pdf.ocr.enabled = true;
    env.config.pdf.ocr.endpoint = Some(String::from("http://127.0.0.1:1"));

    // Place the scanned fixture from the sibling crate into the workspace.
    let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let target = env.workspace_root.join("scanned_page1.pdf");
    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");

    // Flag is set before the call, so cancellation is requested from the start.
    let cancel = Arc::new(AtomicBool::new(true));

    // Both Ok (pre-cancel exit) and Err (eager OCR engine failure) are
    // acceptable, so the result is discarded on purpose.
    let _ = kebab_app::ingest_with_config_cancellable(
        env.config.clone(),
        env.scope(),
        false,
        None,
        Some(cancel),
    );
}
|
||||
@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
|
||||
fn run_with_progress() -> Vec<IngestEvent> {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -116,13 +112,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
|
||||
// `ingest_with_config_progress(..., None)` must produce identical
|
||||
// reports modulo wall-clock duration.
|
||||
let env = TestEnv::lexical_only();
|
||||
let r_none = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
let r_none =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
|
||||
.unwrap();
|
||||
assert_eq!(r_none.scanned, 3);
|
||||
assert_eq!(r_none.new, 3);
|
||||
}
|
||||
@@ -134,12 +126,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
drop(rx);
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
}
|
||||
|
||||
/// v0.20.0 sub-item 1: verifies that `PdfOcrStarted` and `PdfOcrFinished`
/// events are emitted when a PDF asset is ingested with OCR enabled. Depends
/// on a real Ollama instance, so it is `#[ignore]`d by default.
///
/// The test body switches OCR on directly in the config and only reads
/// `KEBAB_PDF_OCR_ENDPOINT` from the environment.
///
/// Manual invoke:
/// ```text
/// KEBAB_PDF_OCR_ENABLED=true \
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
/// cargo test -p kebab-app --test ingest_progress \
///     pdf_ocr_progress_emits_started_finished_events -- --ignored
/// ```
#[test]
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
fn pdf_ocr_progress_emits_started_finished_events() {
    // Copy the F1 fixture (DCTDecode JPEG passthrough) into a workspace
    // directory under the temp dir.
    let tmpdir = tempfile::tempdir().expect("create tmpdir");
    let workspace = tmpdir.path().join("workspace");
    std::fs::create_dir_all(&workspace).expect("create workspace dir");
    let fixture_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let fixture_bytes = std::fs::read(&fixture_path).expect("F1 fixture present");
    std::fs::write(workspace.join("page1.pdf"), &fixture_bytes).expect("copy F1");

    let data_dir = tmpdir.path().join("data");
    std::fs::create_dir_all(&data_dir).expect("create data dir");

    // Default config with no embedding provider and OCR enabled; the endpoint
    // is overridden only when the environment variable is set.
    let mut config = kebab_config::Config::defaults();
    config.workspace.root = workspace.to_string_lossy().into_owned();
    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
    config.models.embedding.provider = "none".to_string();
    config.models.embedding.dimensions = 0;
    config.pdf.ocr.enabled = true;
    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        config.pdf.ocr.endpoint = Some(endpoint);
    }

    let scope = kebab_core::SourceScope {
        root: workspace,
        ..Default::default()
    };

    let (tx, rx) = mpsc::channel::<IngestEvent>();
    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
        .expect("ingest_with_config_progress");

    // The sender was moved into the call above, so draining the receiver ends
    // once every queued event has been read. Tally both event kinds in a
    // single pass.
    let (started_count, finished_count) =
        rx.iter()
            .fold((0usize, 0usize), |(started, finished), event| match event {
                IngestEvent::PdfOcrStarted { .. } => (started + 1, finished),
                IngestEvent::PdfOcrFinished { .. } => (started, finished + 1),
                _ => (started, finished),
            });

    assert!(
        started_count >= 1,
        "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
    );
    assert!(
        finished_count >= 1,
        "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
    );
    assert_eq!(
        started_count, finished_count,
        "Started 와 Finished 의 count 일치"
    );
}
|
||||
|
||||
@@ -29,12 +29,14 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
|
||||
"## Body content\n\nMore.",
|
||||
"Article X",
|
||||
Some("https://example.com/x"),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(report.new, 1, "{report:?}");
|
||||
|
||||
// _external/ contains exactly one .md file with frontmatter.
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1);
|
||||
@@ -50,16 +52,13 @@ fn ingest_stdin_without_source_uri() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = fresh_cfg(dir.path());
|
||||
|
||||
let report = kebab_app::ingest_stdin_with_config(
|
||||
cfg.clone(),
|
||||
"## Body",
|
||||
"Title",
|
||||
None,
|
||||
).unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
|
||||
assert_eq!(report.new, 1);
|
||||
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
|
||||
@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
|
||||
}
|
||||
kebab_app::init_workspace(true).expect("init_workspace");
|
||||
let cfg_path = kebab_config::Config::xdg_config_path();
|
||||
let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
|
||||
panic!("read config at {}: {e}", cfg_path.display())
|
||||
});
|
||||
let body = std::fs::read_to_string(&cfg_path)
|
||||
.unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
|
||||
assert!(
|
||||
body.contains("처리 가능한 형식"),
|
||||
"header lists supported types section: body=\n{body}"
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
|
||||
//! v0.20.0 sub-item 1 bugfix.
|
||||
//!
|
||||
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
|
||||
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
|
||||
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
|
||||
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
|
||||
|
||||
mod common;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_chunk::PdfPageV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
|
||||
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
|
||||
let fake_hash: String = hash_char.to_string().repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type: MediaType::Pdf,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_and_ocr(
|
||||
bytes: &[u8],
|
||||
path: &str,
|
||||
hash_char: char,
|
||||
engine: &dyn OcrEngine,
|
||||
) -> kebab_core::CanonicalDocument {
|
||||
let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
|
||||
canonical
|
||||
}
|
||||
|
||||
/// Chunks two OCR'd scanned PDFs and checks that no `chunk_id` is shared
/// between or within them.
#[test]
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
        .expect("F1 fixture missing");
    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
        .expect("F2 fixture missing");

    // Bug #3 trigger shape: a 10-char early segment, then ". ", then a 500-char tail.
    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
    // overlap_bytes = min(240, 750) = 240 / chars=80 → without the fix the second
    // chunk's actual_start collapses to prev_min=0, giving the same #c0 suffix and
    // therefore a chunk_id collision.
    let trigger_text = format!("{}. {}", "가".repeat(10), "나".repeat(500));

    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
    let f2_engine = MockOcrEngine::single(&trigger_text, false);

    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);

    let chunk_policy = ChunkPolicy {
        target_tokens: 500,
        overlap_tokens: 80,
        respect_markdown_headings: false,
        chunker_version: PdfPageV1Chunker.chunker_version(),
    };

    let f1_chunks = PdfPageV1Chunker
        .chunk(&f1_canonical, &chunk_policy)
        .unwrap();
    let f2_chunks = PdfPageV1Chunker
        .chunk(&f2_canonical, &chunk_policy)
        .unwrap();

    // Guard: with a single F2 chunk the collision scenario is never exercised.
    let f2_len = f2_chunks.len();
    assert!(
        f2_len >= 2,
        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
        f2_len
    );

    // Count every id while de-duplicating through a set; the two numbers
    // agree only when no id repeats.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut total = 0usize;
    for chunk in f1_chunks.iter().chain(f2_chunks.iter()) {
        seen.insert(chunk.chunk_id.0.as_str());
        total += 1;
    }
    assert_eq!(
        seen.len(),
        total,
        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
        seen.len(),
        total,
    );
}
|
||||
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,358 @@
|
||||
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_core::{
|
||||
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
|
||||
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── Fixture helpers ───────────────────────────────────────────────────────
|
||||
|
||||
/// Loads the F1 scanned-page fixture from the sibling `kebab-parse-pdf` crate.
///
/// The path is relative to the process working directory.
///
/// # Panics
///
/// Panics with "F1 fixture missing" if the file cannot be read.
fn f1_pdf_bytes() -> Vec<u8> {
    let fixture = Path::new("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    std::fs::read(fixture).expect("F1 fixture missing")
}
|
||||
|
||||
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
|
||||
let fake_hash = "0".repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash.clone()),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
|
||||
/// F1 (scanned) returns an empty-text Block::Paragraph per page.
|
||||
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
|
||||
let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
|
||||
}
|
||||
|
||||
/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
|
||||
fn canonical_with_empty_block() -> CanonicalDocument {
|
||||
extract_canonical_from_bytes(&f1_pdf_bytes())
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
|
||||
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let char_count = text.chars().count() as u32;
|
||||
tb.text = text.to_string();
|
||||
tb.inlines = vec![Inline::Text {
|
||||
text: text.to_string(),
|
||||
}];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
|
||||
fn canonical_with_mojibake_block() -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
|
||||
let char_count = pua.chars().count() as u32;
|
||||
tb.text = pua.clone();
|
||||
tb.inlines = vec![Inline::Text { text: pua }];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
fn default_opts(enabled: bool) -> PdfOcrOpts {
|
||||
PdfOcrOpts {
|
||||
enabled,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Test 1: F1 + enabled=true → the empty text-detect block is overwritten in place.
#[test]
fn f1_input_with_ocr_enabled_replaces_empty_block() {
    let bytes = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
    // Baseline thresholds, plus an explicit Korean language hint.
    let opts = PdfOcrOpts {
        lang_hint: Some(Lang("kor".into())),
        ..default_opts(true)
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    // `expect` gives a descriptive failure instead of an `is_some` assert
    // followed by a bare `unwrap`.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must contain a Block::Paragraph after OCR");
    assert_eq!(first_para.text, "MOCK_OCR_TEXT");
}
|
||||
|
||||
// Test 2: F3-style vector page (mock-filled canonical) + enabled=true → OCR is
// skipped because the existing text is good enough (needs_ocr=false).
#[test]
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
    let bytes = f1_pdf_bytes(); // reuse F1 bytes; the decision is based on canonical text
    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
    let mut canonical = canonical_with_filled_block(text);
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
    // Require the paragraph to exist: an `if let` here would let the test pass
    // without checking anything if the block had disappeared.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("text-detect Block::Paragraph must still be present");
    assert!(first_para.text.starts_with("충분한"), "원본 text 보존");
}
|
||||
|
||||
// Test 3: F1 + enabled=false → the helper is a no-op.
#[test]
fn f1_input_with_ocr_disabled_keeps_empty_block() {
    let pdf = f1_pdf_bytes();
    let mut document = canonical_with_empty_block();
    let mock = MockOcrEngine::single("IGNORED", false);
    let disabled = default_opts(false);

    let outcome = apply_ocr_to_pdf_pages(&mut document, &mock, &pdf, &disabled, |_| {}).unwrap();

    // Nothing was OCR'd and no time was recorded.
    assert_eq!(outcome.pages_ocrd, 0);
    assert_eq!(outcome.ms_total, 0);
}
|
||||
|
||||
// Test 4: mojibake canonical (PUA chars) + enabled=true → block is overwritten in place.
#[test]
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
    let bytes = f1_pdf_bytes(); // F1 bytes carry a DCTDecode image
    let mut canonical = canonical_with_mojibake_block();
    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
    // Same values as the literal used previously; shared helper keeps tests aligned.
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
    // Require the paragraph to exist so the replacement check can never be
    // skipped silently.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must contain a Block::Paragraph after OCR");
    assert_eq!(first_para.text, "OCR_MOJIBAKE_REPLACEMENT");
}
|
||||
|
||||
// Test 5: filled canonical + always_on=true → dual-block (one extra OCR block is pushed).
#[test]
fn f3_input_with_always_on_pushes_dual_blocks() {
    let bytes = f1_pdf_bytes();
    let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
    let mut canonical = canonical_with_filled_block(text);
    let blocks_before = canonical.blocks.len();
    let engine = MockOcrEngine::single("OCR_DUAL", false);
    let opts = PdfOcrOpts {
        always_on: true,
        ..default_opts(true)
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    assert_eq!(
        canonical.blocks.len(),
        blocks_before + 1,
        "always_on 시 새 Block::Paragraph push"
    );

    // Gather the text of every paragraph block, in document order.
    let mut texts: Vec<&str> = Vec::new();
    for block in &canonical.blocks {
        if let Block::Paragraph(tb) = block {
            texts.push(tb.text.as_str());
        }
    }
    assert!(texts.contains(&"OCR_DUAL"), "OCR block 포함");
    assert!(
        texts.iter().any(|t| t.starts_with("vector")),
        "원본 text-detect block 보존"
    );
}
|
||||
|
||||
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → page skipped + warning.
#[test]
fn f6_flatedecode_skipped_with_warning() {
    let pdf = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
        .expect("F6 fixture missing");
    // The canonical comes from F1 (empty page-1 block); only the bytes are F6.
    let mut document = canonical_with_empty_block();
    let mock = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let outcome = apply_ocr_to_pdf_pages(&mut document, &mock, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(
        outcome.pages_ocrd, 0,
        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
    );
    let has_warning = document
        .provenance
        .events
        .iter()
        .any(|e| e.kind == kebab_core::ProvenanceKind::Warning);
    assert!(has_warning, "FlateDecode skip 시 Warning event 발행");
}
|
||||
|
||||
// Test 7: F7 CCITTFax → page skipped + warning (verifier M-4 split).
#[test]
fn f7_ccittfax_skipped_with_warning() {
    let pdf =
        std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
    // The canonical comes from F1 (empty page-1 block); only the bytes are F7.
    let mut document = canonical_with_empty_block();
    let mock = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let outcome = apply_ocr_to_pdf_pages(&mut document, &mock, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(outcome.pages_ocrd, 0, "CCITTFax page 는 skip");
    let has_warning = document
        .provenance
        .events
        .iter()
        .any(|e| e.kind == kebab_core::ProvenanceKind::Warning);
    assert!(has_warning, "CCITTFax skip 시 Warning event 발행");
}
|
||||
|
||||
// Test 8: OCR engine failure → the page is skipped and a warning event is recorded.
#[test]
fn ocr_engine_failure_surfaces_as_warning() {
    let pdf = f1_pdf_bytes();
    let mut document = canonical_with_empty_block();
    // The `true` flag presumably makes the mock fail with "mock failure" — confirm in mock_ocr.
    let failing = MockOcrEngine::single("", true);
    let opts = default_opts(true);

    let outcome = apply_ocr_to_pdf_pages(&mut document, &failing, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(outcome.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
    // Look only at warning events, then for one whose note carries the engine error.
    let failure_reported = document
        .provenance
        .events
        .iter()
        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
        .any(|e| e.note.as_deref().unwrap_or("").contains("mock failure"));
    assert!(
        failure_reported,
        "OCR failure 의 error message 가 warning event 의 note 안"
    );
}
|
||||
|
||||
// Test 9: dual-block output for a one-page PDF.
//
// NOTE(review): despite the name, this test asserts only the paragraph count
// and that both blocks sit on page 1; the ordinals themselves are not checked.
#[test]
fn dual_block_ordinals_are_deterministic_and_unique() {
    let pdf = f1_pdf_bytes(); // 1-page PDF → page_count=1
    let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
    let mut document = canonical_with_filled_block(text);
    let mock = MockOcrEngine::single("DUAL", false);
    let opts = PdfOcrOpts {
        always_on: true,
        ..default_opts(true)
    };

    apply_ocr_to_pdf_pages(&mut document, &mock, &pdf, &opts, |_| {}).unwrap();

    // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
    let mut paragraph_spans = Vec::new();
    for block in &document.blocks {
        if let Block::Paragraph(tb) = block {
            paragraph_spans.push(&tb.common.source_span);
        }
    }
    assert_eq!(paragraph_spans.len(), 2, "dual-block: text-detect + OCR");

    let all_page_1 = paragraph_spans
        .iter()
        .all(|span| matches!(span, SourceSpan::Page { page: 1, .. }));
    assert!(all_page_1, "두 block 모두 page=1");
}
|
||||
|
||||
// Test 10: a cancel flag that is already set makes the helper return an error.
#[test]
fn cancel_handle_aborts_mid_pdf() {
    let pdf = f1_pdf_bytes();
    let mut document = canonical_with_empty_block();
    let mock = MockOcrEngine::single("IGNORED", false);
    let opts = PdfOcrOpts {
        // Pre-cancelled: the flag is true before the helper runs.
        cancel: Some(Arc::new(AtomicBool::new(true))),
        ..default_opts(true)
    };

    let err = apply_ocr_to_pdf_pages(&mut document, &mock, &pdf, &opts, |_| {})
        .expect_err("cancel=true 시 error 반환");
    let rendered = err.to_string();
    assert!(
        rendered.contains("cancelled mid-PDF"),
        "error message 가 'cancelled mid-PDF' 포함: {err}"
    );
}
|
||||
@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
operations: vec![
|
||||
Operation::new("BT", vec![]),
|
||||
Operation::new("Tf", vec!["F1".into(), 24.into()]),
|
||||
Operation::new(
|
||||
"Td",
|
||||
vec![Object::Integer(100), Object::Integer(700)],
|
||||
),
|
||||
Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
|
||||
Operation::new("Tj", vec![Object::string_literal(*text)]),
|
||||
Operation::new("ET", vec![]),
|
||||
],
|
||||
};
|
||||
let stream_data = content.encode().expect("content encode");
|
||||
let content_id =
|
||||
doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
page_dict.set("Contents", content_id);
|
||||
}
|
||||
let page_id = doc.add_object(page_dict);
|
||||
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
Object::Integer(842),
|
||||
],
|
||||
};
|
||||
doc.objects
|
||||
.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
write_pdf(&env.workspace_root, "three.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
|
||||
assert_eq!(report.errors, 0);
|
||||
let items = report.items.as_ref().expect("items present");
|
||||
@@ -157,23 +151,28 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
.find(|i| i.doc_path.0.ends_with("three.pdf"))
|
||||
.expect("PDF item present");
|
||||
assert_eq!(pdf_item.kind, IngestItemKind::New);
|
||||
assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
|
||||
assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
|
||||
assert_eq!(
|
||||
pdf_item.block_count,
|
||||
Some(3),
|
||||
"one Block::Paragraph per page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(3),
|
||||
"one chunk per non-empty page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
Some("pdf-text-v1")
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
Some("pdf-page-v1")
|
||||
Some("pdf-page-v1.1")
|
||||
);
|
||||
|
||||
// Inspect the stored doc to confirm SourceSpan::Page round-trip.
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for (i, block) in doc.blocks.iter().enumerate() {
|
||||
let want_page = (i as u32) + 1;
|
||||
@@ -202,8 +201,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
write_pdf(&env.workspace_root, "stable.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -214,8 +212,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
.unwrap();
|
||||
assert_eq!(item1.kind, IngestItemKind::New);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item2 = report2
|
||||
.items
|
||||
.unwrap()
|
||||
@@ -239,8 +236,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
std::fs::write(&path, &bytes_v1).unwrap();
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report_v1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let id_v1 = report_v1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -252,12 +248,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
.clone()
|
||||
.unwrap();
|
||||
|
||||
let bytes_v2 =
|
||||
build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
std::fs::write(&path, &bytes_v2).unwrap();
|
||||
|
||||
let report_v2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item_v2 = report_v2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -282,9 +276,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
|
||||
write_pdf(&env.workspace_root, "secret.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"encrypted PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -310,9 +306,11 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"corrupt PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -322,11 +320,8 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
|
||||
// Confirm the doc was NOT stored — list_docs returns nothing for
|
||||
// this path.
|
||||
let summaries = kebab_app::list_docs_with_config(
|
||||
cfg,
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(
|
||||
!summaries
|
||||
.iter()
|
||||
@@ -341,14 +336,15 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
#[test]
|
||||
fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 0,
|
||||
"scanned candidate is a Warning, not Error"
|
||||
);
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -365,14 +361,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(2),
|
||||
"pdf-page-v1 emits 0 chunks for the empty page; total = 2"
|
||||
"pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
|
||||
);
|
||||
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let warnings: Vec<_> = doc
|
||||
.provenance
|
||||
.events
|
||||
@@ -419,8 +411,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
|
||||
write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let total = report.new + report.updated + report.skipped + report.errors;
|
||||
assert_eq!(
|
||||
report.scanned, total,
|
||||
@@ -441,14 +432,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
let pages: Vec<String> = (1..=50)
|
||||
.map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
|
||||
.collect();
|
||||
let page_refs: Vec<Option<&str>> =
|
||||
pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let bytes = build_text_pdf(&page_refs);
|
||||
write_pdf(&env.workspace_root, "long.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0);
|
||||
let pdf_item = report
|
||||
.items
|
||||
@@ -466,8 +455,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
|
||||
// Round-trip: list_docs sees the long PDF.
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
|
||||
}
|
||||
|
||||
@@ -476,13 +464,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
#[test]
|
||||
fn inspect_doc_surfaces_page_spans() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -490,19 +476,12 @@ fn inspect_doc_surfaces_page_spans() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for block in &doc.blocks {
|
||||
match block {
|
||||
Block::Paragraph(p) => assert!(matches!(
|
||||
p.common.source_span,
|
||||
SourceSpan::Page { .. }
|
||||
)),
|
||||
Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
|
||||
other => panic!("expected Paragraph, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,19 +78,15 @@ fn reset_orphans_only_purges_out_of_scope_docs() {
|
||||
narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];
|
||||
|
||||
// Run orphans-only reset.
|
||||
let report = execute(ResetScope::OrphansOnly, &narrow_cfg)
|
||||
.expect("orphans-only reset must succeed");
|
||||
let report =
|
||||
execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");
|
||||
|
||||
assert_eq!(
|
||||
report.orphans_purged, 2,
|
||||
"expected 2 orphans purged (b.rs + c.rs): {report:?}"
|
||||
);
|
||||
|
||||
let mut purged: Vec<String> = report
|
||||
.purged_paths
|
||||
.iter()
|
||||
.map(|p| p.0.clone())
|
||||
.collect();
|
||||
let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
|
||||
purged.sort();
|
||||
assert_eq!(
|
||||
purged,
|
||||
|
||||
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
|
||||
|
||||
use kebab_app::schema_with_config;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
|
||||
SourceScope {
|
||||
root: workspace_root.to_path_buf(),
|
||||
include: vec![],
|
||||
exclude: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// With migrations applied but nothing ingested, both active arrays are empty
/// and the single `parser_version` field still reports the markdown parser.
#[test]
fn schema_models_active_arrays_empty_on_empty_corpus() {
    let tmp = tempfile::tempdir().unwrap();
    let kb_root = tmp.path().join("kb");
    std::fs::create_dir_all(&kb_root).unwrap();
    let cfg = minimal_config(tmp.path(), &kb_root);

    // Create the database schema, then release the store before querying.
    {
        let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
    }

    let schema = schema_with_config(&cfg).unwrap();
    assert!(
        schema.models.active_parsers.is_empty(),
        "empty corpus → no parsers"
    );
    assert!(
        schema.models.active_chunkers.is_empty(),
        "empty corpus → no chunkers"
    );
    // Backward compat: the existing single field keeps the markdown default.
    assert_eq!(schema.models.parser_version, kebab_parse_md::PARSER_VERSION);
}
|
||||
|
||||
/// After ingesting a single markdown file, `schema_with_config` must report
/// non-empty `active_parsers` and `active_chunkers`, and `active_parsers`
/// must already be in sorted order.
#[test]
|
||||
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
// One small markdown document is the whole corpus for this test.
std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
let scope = minimal_scope(&workspace);
|
||||
|
||||
kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
!s.models.active_parsers.is_empty(),
|
||||
"active_parsers populated after ingest"
|
||||
);
|
||||
assert!(
|
||||
!s.models.active_chunkers.is_empty(),
|
||||
"active_chunkers populated after ingest"
|
||||
);
|
||||
// active arrays must be sorted (ORDER BY in SQL).
// Only `active_parsers` is compared against a sorted copy here.
|
||||
let mut sorted = s.models.active_parsers.clone();
|
||||
sorted.sort();
|
||||
assert_eq!(
|
||||
s.models.active_parsers, sorted,
|
||||
"active_parsers must be sorted"
|
||||
);
|
||||
}
|
||||
@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
|
||||
schema.wire.schemas
|
||||
);
|
||||
assert!(schema.capabilities.json_mode);
|
||||
assert!(!schema.capabilities.streaming_ask);
|
||||
assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
|
||||
assert!(
|
||||
schema.capabilities.mcp_server,
|
||||
"mcp_server should be true after fb-30",
|
||||
|
||||
@@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() {
|
||||
|
||||
assert_eq!(resp.hits.len(), baseline.len());
|
||||
assert!(!resp.truncated);
|
||||
assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page");
|
||||
assert!(
|
||||
resp.next_cursor.is_none(),
|
||||
"k=5 against 1 doc → no next page"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() {
|
||||
fn cursor_paginates_to_next_page() {
|
||||
let env = common::TestEnv::new();
|
||||
for i in 0..6 {
|
||||
common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n"));
|
||||
common::ingest_md(
|
||||
&env,
|
||||
&format!("d{i}.md"),
|
||||
&format!("# T{i}\n\nrust topic {i}\n"),
|
||||
);
|
||||
}
|
||||
let app = env.app();
|
||||
|
||||
@@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() {
|
||||
page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
let p2_ids: std::collections::HashSet<_> =
|
||||
page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits");
|
||||
assert!(
|
||||
p1_ids.is_disjoint(&p2_ids),
|
||||
"page 2 must not repeat page 1 hits"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -75,11 +75,9 @@ fn lexical_multi_token_korean_query_hits() {
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(
|
||||
env.config.clone(),
|
||||
common::lexical_query("해시 충돌"),
|
||||
)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
@@ -113,11 +111,9 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(
|
||||
env.config.clone(),
|
||||
common::lexical_query("Rust 충돌은"),
|
||||
)
|
||||
.expect("search must succeed");
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
|
||||
@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
|
||||
fn lexical_search_empty_query_returns_empty() {
|
||||
let env = TestEnv::lexical_only();
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" "))
|
||||
.unwrap();
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap();
|
||||
assert!(hits.is_empty(), "blank query must short-circuit empty");
|
||||
}
|
||||
|
||||
@@ -107,17 +107,17 @@ fn search_uncached_returns_same_hits_as_cached() {
|
||||
#[test]
|
||||
fn first_ingest_bumps_corpus_revision() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let store_before =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store_before.run_migrations().unwrap();
|
||||
assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(
|
||||
report.new + report.updated > 0,
|
||||
"first ingest must commit ≥1 doc"
|
||||
);
|
||||
|
||||
let store_after =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() >= 1,
|
||||
"ingest commit must bump corpus_revision (got {})",
|
||||
|
||||
@@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"freshly-ingested doc must not be stale at default 30d threshold: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"threshold=0 disables staleness even for year-old docs: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,8 @@ use common::TestEnv;
|
||||
fn require_avx_or_panic() {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
assert!(std::is_x86_feature_detected!("avx"),
|
||||
assert!(
|
||||
std::is_x86_feature_detected!("avx"),
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
@@ -28,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -55,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
|
||||
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
|
||||
|
||||
let report = kebab_app::ingest_with_config(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
let items = report.items.as_ref().expect("items array populated");
|
||||
let docx_item = items
|
||||
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
vec!["unsupported media type: <no-ext>".to_string()],
|
||||
);
|
||||
assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
|
||||
assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
|
||||
assert_eq!(
|
||||
report.skipped_by_extension.get("<no-ext>").copied(),
|
||||
Some(1)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -44,8 +44,8 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
std::fs::write(dir_b.join("note.md"), content).unwrap();
|
||||
|
||||
// Ingest all files (fixture workspace + our two new twins).
|
||||
let report = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
|
||||
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
|
||||
|
||||
// Both twin paths must appear as New in the report.
|
||||
@@ -53,8 +53,7 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("src_a/note.md")
|
||||
|| i.doc_path.0.ends_with("src_b/note.md")
|
||||
i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
@@ -149,7 +148,10 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
// at either twin, making one twin's span fetch behave incorrectly.
|
||||
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
|
||||
assert_eq!(
|
||||
report2.errors, 0,
|
||||
"no ingest errors on second run; report={report2:?}"
|
||||
);
|
||||
|
||||
// Re-open app after second ingest and verify span still works on both.
|
||||
let app2 = env.app();
|
||||
|
||||
@@ -43,9 +43,7 @@ fn twin_files_second_ingest_is_unchanged() {
|
||||
let items = first.items.as_ref().expect("items must be present");
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("__init__.py")
|
||||
})
|
||||
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
@@ -63,8 +61,14 @@ fn twin_files_second_ingest_is_unchanged() {
|
||||
// Second ingest — same files, same content → both must be Unchanged.
|
||||
let second = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}");
|
||||
assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}");
|
||||
assert_eq!(
|
||||
second.errors, 0,
|
||||
"second ingest: no errors; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.new, 0,
|
||||
"second ingest: no new docs; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
|
||||
|
||||
@@ -39,17 +39,11 @@ impl Chunker for CodeCAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeCAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +62,12 @@ impl Chunker for CodeCAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +81,13 @@ impl Chunker for CodeCAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +95,7 @@ impl Chunker for CodeCAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +103,13 @@ impl Chunker for CodeCAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +188,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +211,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("c".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("c".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_c_ast_v1() {
|
||||
assert_eq!(CodeCAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-c-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeCAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-c-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +282,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-c-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +297,32 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} = {i};\n"))
|
||||
.collect::<String>();
|
||||
let code = format!("int big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +336,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeCAstV1Chunker"));
|
||||
@@ -304,11 +346,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
|
||||
let base: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +366,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeCAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeCAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeCppAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeCppAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeCppAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeCppAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeCppAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +213,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_cpp_ast_v1() {
|
||||
assert_eq!(CodeCppAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-cpp-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeCppAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-cpp-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +284,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +299,32 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} = {i};\n"))
|
||||
.collect::<String>();
|
||||
let code = format!("int big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +338,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeCppAstV1Chunker"));
|
||||
@@ -304,11 +348,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
|
||||
let base: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +368,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeCppAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeCppAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeGoAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeGoAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeGoAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeGoAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeGoAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeGoAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +213,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("go".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("go".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_go_ast_v1() {
|
||||
assert_eq!(CodeGoAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-go-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeGoAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-go-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "func parse() {\n\t// x\n}"),
|
||||
("Foo.double", 5, 7, "func double() int {\n\t//\n\treturn 0\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"func double() int {\n\t//\n\treturn 0\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +289,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-go-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +304,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tx{i} := {i}")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} := {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("func big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +344,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeGoAstV1Chunker"));
|
||||
@@ -304,11 +354,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
|
||||
let base: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +374,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeGoAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeGoAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodeJavaAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeJavaAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeJavaAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeJavaAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeJavaAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +213,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("java".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_java_ast_v1() {
|
||||
assert_eq!(CodeJavaAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-java-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeJavaAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-java-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +284,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-java-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +299,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tint x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tint x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("void big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +339,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeJavaAstV1Chunker"));
|
||||
@@ -304,11 +349,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
|
||||
let base: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +369,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeJavaAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeJavaAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeJsAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +213,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("javascript".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_js_ast_v1() {
|
||||
assert_eq!(CodeJsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-js-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeJsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-js-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "function parse() {\n // x\n}"),
|
||||
("Foo.double", 5, 7, "function double() {\n //\n return 0;\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"function double() {\n //\n return 0;\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +289,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-js-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +304,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" const x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("function big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +344,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeJsAstV1Chunker"));
|
||||
@@ -304,11 +354,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
|
||||
let base: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeJsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeJsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +374,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeJsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeJsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeKotlinAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +213,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("kotlin".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_kotlin_ast_v1() {
|
||||
assert_eq!(CodeKotlinAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-kotlin-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeKotlinAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-kotlin-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "fun parse() {\n\t// x\n}"),
|
||||
("Foo.double", 5, 7, "fun double(): Int {\n\t//\n\treturn 0\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"fun double(): Int {\n\t//\n\treturn 0\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +289,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-kotlin-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +304,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!("\tval x{i} = {i}")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tval x{i} = {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("fun big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +344,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeKotlinAstV1Chunker"));
|
||||
@@ -304,11 +354,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
|
||||
let base: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +374,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeKotlinAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeKotlinAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +213,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("python".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("python".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_python_ast_v1() {
|
||||
assert_eq!(CodePythonAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-python-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodePythonAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-python-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +284,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-python-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +299,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" x{i} = {i}")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" x{i} = {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("def big():\n{body}\n");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +339,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodePythonAstV1Chunker"));
|
||||
@@ -304,11 +349,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
|
||||
let base: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodePythonAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodePythonAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +369,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodePythonAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodePythonAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +213,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("rust".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_rust_ast_v1() {
|
||||
assert_eq!(CodeRustAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-rust-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeRustAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-rust-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +284,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-rust-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +299,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" let x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" let x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("pub fn big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +339,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeRustAstV1Chunker"));
|
||||
@@ -304,11 +349,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
|
||||
let base: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeRustAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeRustAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +369,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeRustAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeRustAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "code-text-paragraph-v1";
|
||||
|
||||
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeTsAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +213,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("typescript".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_ts_ast_v1() {
|
||||
assert_eq!(CodeTsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-ts-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeTsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-ts-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "function parse(): void {\n // x\n}"),
|
||||
("Foo.double", 5, 7, "function double(): number {\n //\n return 0;\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"function double(): number {\n //\n return 0;\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +289,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-ts-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +304,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" const x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("function big(): void {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort_unstable(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +344,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeTsAstV1Chunker"));
|
||||
@@ -304,11 +354,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
|
||||
let base: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeTsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeTsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +374,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeTsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeTsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "dockerfile-file-v1";
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";
|
||||
|
||||
@@ -49,19 +49,14 @@ impl Chunker for K8sManifestResourceV1Chunker {
|
||||
.get("apiVersion")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let kind = mapping
|
||||
.get("kind")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||
|
||||
// Skip non-k8s documents.
|
||||
if api.is_empty() || kind.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = mapping
|
||||
.get("metadata")
|
||||
.and_then(|v| v.as_mapping());
|
||||
let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
|
||||
let name = metadata
|
||||
.and_then(|m| m.get("name"))
|
||||
.and_then(|v| v.as_str())
|
||||
@@ -118,10 +113,7 @@ fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
|
||||
.enumerate()
|
||||
.filter_map(|(i, l)| {
|
||||
let trimmed = l.trim_end();
|
||||
if trimmed == "---"
|
||||
|| trimmed.starts_with("--- ")
|
||||
|| trimmed.starts_with("---\t")
|
||||
{
|
||||
if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") {
|
||||
Some(i)
|
||||
} else {
|
||||
None
|
||||
|
||||
@@ -23,14 +23,14 @@ mod code_js_ast_v1;
|
||||
mod code_kotlin_ast_v1;
|
||||
mod code_python_ast_v1;
|
||||
mod code_rust_ast_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
mod code_ts_ast_v1;
|
||||
pub mod dockerfile_file_v1;
|
||||
pub mod k8s_manifest_resource_v1;
|
||||
pub mod manifest_file_v1;
|
||||
mod md_heading_v1;
|
||||
mod pdf_page_v1;
|
||||
mod tier2_shared;
|
||||
pub mod k8s_manifest_resource_v1;
|
||||
pub mod dockerfile_file_v1;
|
||||
pub mod manifest_file_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
|
||||
pub use code_c_ast_v1::CodeCAstV1Chunker;
|
||||
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
|
||||
@@ -40,10 +40,10 @@ pub use code_js_ast_v1::CodeJsAstV1Chunker;
|
||||
pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
|
||||
pub use code_python_ast_v1::CodePythonAstV1Chunker;
|
||||
pub use code_rust_ast_v1::CodeRustAstV1Chunker;
|
||||
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
|
||||
pub use code_ts_ast_v1::CodeTsAstV1Chunker;
|
||||
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||
pub use manifest_file_v1::ManifestFileV1Chunker;
|
||||
pub use md_heading_v1::MdHeadingV1Chunker;
|
||||
pub use pdf_page_v1::PdfPageV1Chunker;
|
||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||
pub use manifest_file_v1::ManifestFileV1Chunker;
|
||||
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "manifest-file-v1";
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
//! `md-heading-v1` — heading-aware Markdown chunker.
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
|
||||
ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
|
||||
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
let policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
// `collect_overlap_seed` keeps seed ≤ target/2, so
|
||||
// a flush here never produces a chunk smaller than
|
||||
// the seed budget.
|
||||
let would_exceed = acc.text_tokens + next_tokens
|
||||
> policy.target_tokens
|
||||
let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
|
||||
&& acc.has_non_heading_content();
|
||||
if would_exceed {
|
||||
let overlap_seed = collect_overlap_seed(
|
||||
&acc,
|
||||
policy.overlap_tokens,
|
||||
policy.target_tokens,
|
||||
);
|
||||
flush(
|
||||
&mut acc,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&policy_hash,
|
||||
&mut out,
|
||||
);
|
||||
let overlap_seed =
|
||||
collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
|
||||
flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
|
||||
// Seed next accumulator with the prior chunk's
|
||||
// tail blocks (paragraph-level overlap). The
|
||||
// heading is *not* re-included here — it lives
|
||||
@@ -292,10 +278,11 @@ fn build_chunk(
|
||||
) -> Chunk {
|
||||
debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
|
||||
|
||||
let block_ids: Vec<BlockId> =
|
||||
blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> =
|
||||
blocks.iter().map(|b| common(b).source_span.clone()).collect();
|
||||
let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> = blocks
|
||||
.iter()
|
||||
.map(|b| common(b).source_span.clone())
|
||||
.collect();
|
||||
|
||||
// heading_path: pick the first non-Heading block's heading_path
|
||||
// (which already includes every parent heading per kb-normalize).
|
||||
@@ -339,12 +326,7 @@ fn build_chunk(
|
||||
text.len().div_ceil(BYTES_PER_TOKEN)
|
||||
};
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&doc.doc_id,
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
policy_hash,
|
||||
);
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
@@ -400,14 +382,8 @@ fn render_block_text(b: &Block) -> String {
|
||||
} else {
|
||||
i.alt.clone()
|
||||
};
|
||||
let ocr = i
|
||||
.ocr
|
||||
.as_ref()
|
||||
.map_or("", |o| o.joined.as_str());
|
||||
let cap = i
|
||||
.caption
|
||||
.as_ref()
|
||||
.map_or("", |c| c.text.as_str());
|
||||
let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
|
||||
let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
|
||||
[alt.as_str(), ocr, cap]
|
||||
.iter()
|
||||
.filter(|s| !s.is_empty())
|
||||
@@ -447,9 +423,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
|
||||
Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
|
||||
WorkspacePath, id_for_block,
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
|
||||
SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -492,12 +467,7 @@ mod tests {
|
||||
SourceSpan::Line { start, end }
|
||||
}
|
||||
|
||||
fn common_for(
|
||||
kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> CommonBlock {
|
||||
fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
|
||||
CommonBlock {
|
||||
block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
|
||||
heading_path: heading_path.to_vec(),
|
||||
@@ -532,12 +502,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Paragraph(TextBlock {
|
||||
common: common_for("paragraph", &hp, ordinal, span(line, line)),
|
||||
@@ -546,12 +511,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn code_block(
|
||||
code: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> Block {
|
||||
fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Code(CodeBlock {
|
||||
common: common_for("code", &hp, ordinal, s),
|
||||
@@ -578,12 +538,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn image_ref(
|
||||
alt: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::ImageRef(ImageRefBlock {
|
||||
common: common_for("imageref", &hp, ordinal, span(line, line)),
|
||||
|
||||
@@ -53,18 +53,21 @@
|
||||
//! one chunk per atomic block. PdfPageV1 cannot.
|
||||
//!
|
||||
//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
|
||||
//! variant `format!("{base_policy_hash}#c{char_start}")` into the
|
||||
//! recipe's `policy_hash` slot (so distinct chunks distinguish via
|
||||
//! different policy_hash inputs), while storing the unmodified
|
||||
//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
|
||||
//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
|
||||
//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
|
||||
//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
|
||||
//! segment boundary, strictly increasing across the returned chunks
|
||||
//! even when the overlap walk collapses `actual_start` to a previous
|
||||
//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in
|
||||
//! `Chunk.policy_hash` so the field still answers "what policy was
|
||||
//! active". v1.1 second-iteration patch — logged in
|
||||
//! `tasks/HOTFIXES.md` (2026-05-27).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "pdf-page-v1";
|
||||
const VERSION_LABEL: &str = "pdf-page-v1.1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
// Validate up front — every block must be a Paragraph carrying
|
||||
// SourceSpan::Page. A mixed document signals a routing bug in
|
||||
// the caller (e.g. running this chunker on Markdown) and is
|
||||
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
|
||||
),
|
||||
};
|
||||
if !matches!(common.source_span, SourceSpan::Page { .. }) {
|
||||
anyhow::bail!(
|
||||
"PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
|
||||
);
|
||||
anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let target_bytes = policy
|
||||
.target_tokens
|
||||
.saturating_mul(BYTES_PER_TOKEN)
|
||||
.max(1);
|
||||
let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
|
||||
// Clamp the overlap to half the target. Without this, a policy
|
||||
// with `overlap_tokens >= target_tokens` would make every chunk
|
||||
// fully re-emit the previous chunk's text — mirrors
|
||||
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (char_start, char_end, slice) in
|
||||
for (segment_start, char_start, char_end, slice) in
|
||||
chunk_page(&p.text, target_bytes, overlap_bytes)
|
||||
{
|
||||
// PDF chars-per-page comfortably fits in u32 (a single
|
||||
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
|
||||
// typography); silent `as u32` truncation would only
|
||||
// surface on corrupted input, where an explicit panic
|
||||
// is preferable to an off-by-2^32 span.
|
||||
let char_start_u32 = u32::try_from(char_start)
|
||||
.expect("page chars fit in u32");
|
||||
let char_end_u32 =
|
||||
u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
|
||||
let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let span = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(char_start_u32),
|
||||
char_end: Some(char_end_u32),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
|
||||
// Per-chunk policy_hash variant prevents chunk_id
|
||||
// collision when a page produces multiple chunks. See
|
||||
// module docs for rationale.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
|
||||
// v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
|
||||
// variant uses `segment_start` (pre-overlap boundary,
|
||||
// strictly increasing) instead of `char_start` (post-
|
||||
// overlap, may collapse to prev_min). See module docs +
|
||||
// spec §4.1 root cause + HOTFIXES.md 2026-05-27.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
|
||||
let chunk_id =
|
||||
id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
|
||||
let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
|
||||
@@ -198,18 +192,28 @@ impl Chunker for PdfPageV1Chunker {
|
||||
}
|
||||
|
||||
/// Split a single page's text into ordered chunks, each represented as
|
||||
/// `(char_start, char_end, text_slice)`. Char positions are within the
|
||||
/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
|
||||
/// `(segment_start, actual_start, chunk_end, text_slice)`.
|
||||
///
|
||||
/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
|
||||
/// across the returned vec. Use this for chunk_id uniqueness suffixes.
|
||||
/// - `actual_start` = post-overlap start char index. May collapse to a
|
||||
/// previous chunk's `actual_start` under aggressive overlap policy.
|
||||
/// Use this for `SourceSpan::Page::char_start`.
|
||||
/// - `chunk_end` = chunk's end char index (exclusive).
|
||||
///
|
||||
/// Returns an empty vector when `text` is empty or whitespace-only.
|
||||
fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
|
||||
fn chunk_page(
|
||||
text: &str,
|
||||
target_bytes: usize,
|
||||
overlap_bytes: usize,
|
||||
) -> Vec<(usize, usize, usize, String)> {
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
let n = chars.len();
|
||||
if n == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
if text.len() <= target_bytes {
|
||||
return vec![(0, n, text.to_string())];
|
||||
return vec![(0, 0, n, text.to_string())];
|
||||
}
|
||||
|
||||
// Build candidate boundary positions (char indices where a chunk
|
||||
@@ -222,8 +226,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
let c = chars[k];
|
||||
let nx = chars[k + 1];
|
||||
let is_paragraph_break = c == '\n' && nx == '\n';
|
||||
let is_sentence_end =
|
||||
matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
|
||||
bounds.push(k + 2);
|
||||
}
|
||||
@@ -235,11 +238,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
bounds.dedup();
|
||||
|
||||
// UTF-8 byte length of the slice between two char indices.
|
||||
let byte_len = |a: usize, b: usize| -> usize {
|
||||
chars[a..b].iter().map(|c| c.len_utf8()).sum()
|
||||
};
|
||||
let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };
|
||||
|
||||
let mut chunks: Vec<(usize, usize, String)> = Vec::new();
|
||||
let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
|
||||
let mut seg_idx: usize = 0;
|
||||
while seg_idx + 1 < bounds.len() {
|
||||
let start = bounds[seg_idx];
|
||||
@@ -264,7 +265,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
// have absorbed up to `overlap_bytes` of bytes, but never past
|
||||
// the previous chunk's start (no full re-emission).
|
||||
let actual_start = if let Some(prev) = chunks.last() {
|
||||
let prev_min = prev.0;
|
||||
// prev tuple shape = (segment_start, actual_start, chunk_end, slice).
|
||||
// overlap walk floor = previous chunk's actual_start (prev.1).
|
||||
let prev_min = prev.1;
|
||||
let mut a = start;
|
||||
let mut acc_o: usize = 0;
|
||||
while a > prev_min {
|
||||
@@ -281,7 +284,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
};
|
||||
|
||||
let slice: String = chars[actual_start..chunk_end].iter().collect();
|
||||
chunks.push((actual_start, chunk_end, slice));
|
||||
chunks.push((start, actual_start, chunk_end, slice));
|
||||
seg_idx = end_idx;
|
||||
}
|
||||
|
||||
@@ -390,7 +393,11 @@ mod tests {
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
match c.source_spans[0] {
|
||||
SourceSpan::Page { page, char_start, char_end } => {
|
||||
SourceSpan::Page {
|
||||
page,
|
||||
char_start,
|
||||
char_end,
|
||||
} => {
|
||||
assert_eq!(page, (i as u32) + 1);
|
||||
assert_eq!(char_start, Some(0));
|
||||
assert!(char_end.unwrap() > 0);
|
||||
@@ -435,11 +442,16 @@ mod tests {
|
||||
// N-1's char_end).
|
||||
for w in chunks.windows(2) {
|
||||
let prev_end = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_end: Some(e), .. } => e,
|
||||
SourceSpan::Page {
|
||||
char_end: Some(e), ..
|
||||
} => e,
|
||||
_ => panic!("missing char_end"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -653,11 +665,17 @@ mod tests {
|
||||
// overlap) is the failure mode.
|
||||
for w in chunks.windows(2) {
|
||||
let prev_start = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -674,6 +692,43 @@ mod tests {
|
||||
assert_eq!(ids.len(), total, "chunk_ids must remain unique");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
|
||||
// Trigger shape seen in Korean OCR text: 10 chars of "가" + ". " + 500 chars of "나".
|
||||
// → first segment [0, 12), second segment [12, n).
|
||||
// page_text byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500
|
||||
// → multi-chunk. overlap_bytes = min(240, 750) = 240, i.e. 80 three-byte chars
|
||||
// → the second chunk's actual_start collapses to prev_min=0 → same `#c0`.
|
||||
//
|
||||
// default_policy(500, 80) — target_tokens=500 → target_bytes=500*3=1500
|
||||
// (Korean text at 3 bytes/char), overlap_tokens=80 → overlap_bytes=min(240, 750)=240.
|
||||
// Reinforcement added for verifier round 1, item L-3.
|
||||
let early_seg = "가".repeat(10);
|
||||
let tail = "나".repeat(500);
|
||||
let page_text = format!("{early_seg}. {tail}");
|
||||
|
||||
let doc = make_pdf_doc(&[&page_text]);
|
||||
let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte
|
||||
let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();
|
||||
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"expected ≥2 chunks for {} byte page; got {}",
|
||||
page_text.len(),
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(
|
||||
ids.len(),
|
||||
total,
|
||||
"all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1_for_identical_policy() {
|
||||
// Cross-chunker policy fingerprint identity — important so a
|
||||
|
||||
@@ -113,7 +113,14 @@ pub(crate) fn build_chunk(
|
||||
symbol: Some(symbol.to_string()),
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key)
|
||||
build_chunk_from_span(
|
||||
doc,
|
||||
chunker_version,
|
||||
base_policy_hash,
|
||||
text,
|
||||
span,
|
||||
split_key,
|
||||
)
|
||||
}
|
||||
|
||||
/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -15,9 +15,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCppAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use kebab_parse_code::CppAstExtractor;
|
||||
use serde_json::Value;
|
||||
@@ -171,7 +171,9 @@ fn extract_cpp_fixture() -> CanonicalDocument {
|
||||
workspace_root: &root,
|
||||
config: &cfg,
|
||||
};
|
||||
CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap()
|
||||
CppAstExtractor::new()
|
||||
.extract(&ctx, src.as_bytes())
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -261,43 +263,61 @@ fn code_cpp_ast_extractor_snapshot() {
|
||||
let doc = extract_cpp_fixture();
|
||||
|
||||
// Verify the extractor emits all expected named units.
|
||||
let block_syms: Vec<Option<String>> = doc.blocks.iter().filter_map(|b| match b {
|
||||
Block::Code(c) => match &c.common.source_span {
|
||||
SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
|
||||
let block_syms: Vec<Option<String>> = doc
|
||||
.blocks
|
||||
.iter()
|
||||
.filter_map(|b| match b {
|
||||
Block::Code(c) => match &c.common.source_span {
|
||||
SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
|
||||
_ => None,
|
||||
},
|
||||
_ => None,
|
||||
},
|
||||
_ => None,
|
||||
}).collect();
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Must include namespace-qualified class and its methods
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
|
||||
"class unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
|
||||
"ctor unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
|
||||
"dtor unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
|
||||
"chunk_doc unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
|
||||
"operator() unit missing: {block_syms:?}"
|
||||
);
|
||||
// Template function (inside kebab::chunk namespace in the fixture)
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::identity")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::identity")),
|
||||
"identity template fn unit missing: {block_syms:?}"
|
||||
);
|
||||
// Free function in outer namespace
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("kebab::global_helper")),
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::global_helper")),
|
||||
"global_helper unit missing: {block_syms:?}"
|
||||
);
|
||||
// Global main
|
||||
@@ -312,14 +332,23 @@ fn code_cpp_ast_extractor_snapshot() {
|
||||
fn code_cpp_ast_extractor_chunks_deterministic() {
|
||||
let doc1 = extract_cpp_fixture();
|
||||
let doc2 = extract_cpp_fixture();
|
||||
assert_eq!(doc1.blocks, doc2.blocks, "extractor output non-deterministic");
|
||||
assert_eq!(
|
||||
doc1.blocks, doc2.blocks,
|
||||
"extractor output non-deterministic"
|
||||
);
|
||||
|
||||
let policy = fixed_policy();
|
||||
let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap();
|
||||
let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap();
|
||||
assert_eq!(
|
||||
chunks1.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
|
||||
chunks2.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
|
||||
chunks1
|
||||
.iter()
|
||||
.map(|c| c.chunk_id.0.clone())
|
||||
.collect::<Vec<_>>(),
|
||||
chunks2
|
||||
.iter()
|
||||
.map(|c| c.chunk_id.0.clone())
|
||||
.collect::<Vec<_>>(),
|
||||
"chunker output non-deterministic"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeGoAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJavaAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeKotlinAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodePythonAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeRustAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -124,7 +124,11 @@ fn dockerfile_emits_single_chunk() {
|
||||
Some("<dockerfile>"),
|
||||
"symbol must be '<dockerfile>'"
|
||||
);
|
||||
assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'");
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("dockerfile"),
|
||||
"lang must be 'dockerfile'"
|
||||
);
|
||||
}
|
||||
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
||||
}
|
||||
|
||||
@@ -110,13 +110,11 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() {
|
||||
|
||||
let symbols: Vec<&str> = chunks
|
||||
.iter()
|
||||
.map(|c| {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
symbol.as_deref().expect("symbol must be Some for k8s chunks")
|
||||
}
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
}
|
||||
.map(|c| match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => symbol
|
||||
.as_deref()
|
||||
.expect("symbol must be Some for k8s chunks"),
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -270,7 +268,11 @@ fn k8s_oversize_splits_into_line_windows_sharing_symbol() {
|
||||
let ranges: Vec<(u32, u32)> = chunks
|
||||
.iter()
|
||||
.map(|c| match &c.source_spans[0] {
|
||||
SourceSpan::Code { line_start, line_end, .. } => (*line_start, *line_end),
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => (*line_start, *line_end),
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::MdHeadingV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType,
|
||||
AssetId, AssetStorage, Checksum, ChunkPolicy, Chunker, ChunkerVersion, MediaType,
|
||||
ParserVersion, RawAsset, SourceUri, WorkspacePath,
|
||||
};
|
||||
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
|
||||
@@ -65,8 +65,7 @@ fn long_section_chunks_snapshot() {
|
||||
Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
|
||||
None => 1,
|
||||
};
|
||||
let (blocks, parse_warns) =
|
||||
parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
|
||||
let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
|
||||
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
|
||||
@@ -74,9 +73,8 @@ fn long_section_chunks_snapshot() {
|
||||
metadata.aliases.sort();
|
||||
metadata.tags.sort();
|
||||
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
|
||||
.expect("build_canonical_document");
|
||||
let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
|
||||
.expect("build_canonical_document");
|
||||
|
||||
// Pin policy so policy_hash and chunk_ids are reproducible.
|
||||
let policy = ChunkPolicy {
|
||||
@@ -102,8 +100,7 @@ fn long_section_chunks_snapshot() {
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value =
|
||||
serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
@@ -154,14 +151,8 @@ fn long_section_chunks_are_deterministic() {
|
||||
let mut metadata = metadata;
|
||||
metadata.aliases.sort();
|
||||
metadata.tags.sort();
|
||||
let doc = build_canonical_document(
|
||||
&asset,
|
||||
metadata,
|
||||
blocks,
|
||||
&parser_version,
|
||||
parse_warns,
|
||||
)
|
||||
.expect("build_canonical_document");
|
||||
let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
|
||||
.expect("build_canonical_document");
|
||||
let ids: Vec<String> = MdHeadingV1Chunker
|
||||
.chunk(&doc, &policy)
|
||||
.unwrap()
|
||||
|
||||
@@ -107,9 +107,7 @@ fn cargo_toml_single_chunk_with_toml_lang() {
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = manifest_doc("toml", &text);
|
||||
let chunks = ManifestFileV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
@@ -149,9 +147,7 @@ fn package_json_single_chunk_with_json_lang() {
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = manifest_doc("json", &text);
|
||||
let chunks = ManifestFileV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
@@ -191,9 +187,7 @@ fn pom_xml_single_chunk_with_xml_lang() {
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = manifest_doc("xml", &text);
|
||||
let chunks = ManifestFileV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
@@ -233,9 +227,7 @@ fn go_mod_single_chunk_with_go_mod_lang() {
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = manifest_doc("go-mod", &text);
|
||||
let chunks = ManifestFileV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
|
||||
@@ -156,7 +156,7 @@ enum Cmd {
|
||||
|
||||
/// p9-fb-36: filter by `assets.media_type` kind. Comma-separated.
|
||||
/// Aliases: `md` → `markdown`. Other accepted: `markdown`, `pdf`,
|
||||
/// `image`, `audio`, `other`. Unknown values match nothing.
|
||||
/// `image`, `audio`, `code`, `other`. Unknown values match nothing.
|
||||
#[arg(long, value_delimiter = ',')]
|
||||
media: Vec<String>,
|
||||
|
||||
@@ -179,7 +179,12 @@ enum Cmd {
|
||||
/// canonical). Repeatable or comma-separated.
|
||||
/// Examples: `rust`, `python`, `typescript`.
|
||||
/// Unknown values produce empty hits.
|
||||
#[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')]
|
||||
#[arg(
|
||||
long = "code-lang",
|
||||
value_name = "LANG",
|
||||
num_args = 1,
|
||||
value_delimiter = ','
|
||||
)]
|
||||
code_lang: Vec<String>,
|
||||
|
||||
/// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
|
||||
@@ -464,7 +469,9 @@ fn parse_bool_env(s: &str) -> Result<bool, String> {
|
||||
match s.to_ascii_lowercase().as_str() {
|
||||
"1" | "true" | "yes" | "on" => Ok(true),
|
||||
"0" | "false" | "no" | "off" => Ok(false),
|
||||
other => Err(format!("expected 1/0/true/false/yes/no/on/off, got {other:?}")),
|
||||
other => Err(format!(
|
||||
"expected 1/0/true/false/yes/no/on/off, got {other:?}"
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -551,8 +558,14 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
"created {}",
|
||||
kebab_config::Config::xdg_config_path().display()
|
||||
);
|
||||
println!("created {}", kebab_config::Config::xdg_data_dir().display());
|
||||
println!("created {}", kebab_config::Config::xdg_state_dir().display());
|
||||
println!(
|
||||
"created {}",
|
||||
kebab_config::Config::xdg_data_dir().display()
|
||||
);
|
||||
println!(
|
||||
"created {}",
|
||||
kebab_config::Config::xdg_state_dir().display()
|
||||
);
|
||||
println!("hint edit the config above, then `kebab ingest`");
|
||||
}
|
||||
Ok(())
|
||||
@@ -565,7 +578,9 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
} => {
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
let scope = kebab_core::SourceScope {
|
||||
root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
|
||||
root: root
|
||||
.clone()
|
||||
.unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
@@ -580,9 +595,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
.unwrap_or(false);
|
||||
let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env);
|
||||
let (tx, rx) = std::sync::mpsc::channel::<kebab_app::IngestEvent>();
|
||||
let display_handle = std::thread::spawn(move || {
|
||||
progress::ProgressDisplay::new(mode).run(rx)
|
||||
});
|
||||
let display_handle =
|
||||
std::thread::spawn(move || progress::ProgressDisplay::new(mode).run(rx));
|
||||
|
||||
// p9-fb-04: register a Ctrl-C handler that flips the same
|
||||
// AtomicBool the facade polls at each step boundary. The
|
||||
@@ -614,7 +628,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
|
||||
} else {
|
||||
let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
|
||||
let skipped_breakdown =
|
||||
kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
|
||||
let purged_suffix = if report.purged_deleted_files > 0 {
|
||||
format!(" purged {}", report.purged_deleted_files)
|
||||
} else {
|
||||
@@ -640,7 +655,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
let docs = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?);
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string(&wire::wire_doc_summaries(&docs))?
|
||||
);
|
||||
} else {
|
||||
for d in &docs {
|
||||
println!("{}\t{}", d.doc_id, d.doc_path.0);
|
||||
@@ -667,7 +685,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
let chunk_id: kebab_core::ChunkId = id.parse()?;
|
||||
let chunk = kebab_app::inspect_chunk_with_config(cfg, &chunk_id)?;
|
||||
println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?);
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
@@ -708,7 +729,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
};
|
||||
let result = kebab_app::fetch_with_config(cfg, query, opts)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_fetch_result(&result))?);
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string(&wire::wire_fetch_result(&result))?
|
||||
);
|
||||
} else {
|
||||
render_fetch_plain(&result);
|
||||
}
|
||||
@@ -752,30 +776,21 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let v: serde_json::Value =
|
||||
serde_json::from_str(&line).map_err(|e| {
|
||||
anyhow::Error::new(kebab_app::StructuredError(
|
||||
kebab_app::ErrorV1 {
|
||||
schema_version: kebab_app::ERROR_V1_ID
|
||||
.to_string(),
|
||||
code: "config_invalid".to_string(),
|
||||
message: format!(
|
||||
"stdin ndjson line {} parse error: {e}",
|
||||
lineno + 1
|
||||
),
|
||||
details: serde_json::Value::Null,
|
||||
hint: Some(
|
||||
"each line must be a JSON object with at least `query`"
|
||||
.to_string(),
|
||||
),
|
||||
},
|
||||
))
|
||||
})?;
|
||||
let v: serde_json::Value = serde_json::from_str(&line).map_err(|e| {
|
||||
anyhow::Error::new(kebab_app::StructuredError(kebab_app::ErrorV1 {
|
||||
schema_version: kebab_app::ERROR_V1_ID.to_string(),
|
||||
code: "config_invalid".to_string(),
|
||||
message: format!("stdin ndjson line {} parse error: {e}", lineno + 1),
|
||||
details: serde_json::Value::Null,
|
||||
hint: Some(
|
||||
"each line must be a JSON object with at least `query`".to_string(),
|
||||
),
|
||||
}))
|
||||
})?;
|
||||
raw_items.push(v);
|
||||
}
|
||||
|
||||
let (items, summary) =
|
||||
kebab_app::bulk_search_with_config(cfg, raw_items)?;
|
||||
let (items, summary) = kebab_app::bulk_search_with_config(cfg, raw_items)?;
|
||||
|
||||
if cli.json {
|
||||
let mut stdout = std::io::stdout().lock();
|
||||
@@ -799,11 +814,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
if let Some(err) = &item.error {
|
||||
writeln!(stdout, "error: {err}")?;
|
||||
} else if let Some(resp) = &item.response {
|
||||
writeln!(
|
||||
stdout,
|
||||
"{}",
|
||||
serde_json::to_string_pretty(resp)?
|
||||
)?;
|
||||
writeln!(stdout, "{}", serde_json::to_string_pretty(resp)?)?;
|
||||
}
|
||||
writeln!(stdout)?;
|
||||
}
|
||||
@@ -819,6 +830,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
|
||||
// p9-fb-42: bulk mode requires no query; single-query mode requires query.
|
||||
let query_text = match query.as_ref() {
|
||||
Some(q) if q.trim().is_empty() => {
|
||||
return Err(anyhow::Error::new(kebab_app::StructuredError(
|
||||
kebab_app::ErrorV1 {
|
||||
schema_version: kebab_app::ERROR_V1_ID.to_string(),
|
||||
code: "invalid_input".to_string(),
|
||||
message: "query is empty; provide a non-empty search term or use --bulk".into(),
|
||||
details: serde_json::Value::Null,
|
||||
hint: Some("e.g. `kebab search 'rust async'` or `kebab search --bulk < queries.ndjson`".into()),
|
||||
},
|
||||
)));
|
||||
}
|
||||
Some(q) => q.clone(),
|
||||
None => {
|
||||
return Err(anyhow::anyhow!("query is required unless --bulk is set"));
|
||||
@@ -832,8 +854,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
other => other.to_string(),
|
||||
}
|
||||
}
|
||||
let media_norm: Vec<String> =
|
||||
media.iter().map(|s| normalize_media_alias(s)).collect();
|
||||
let media_norm: Vec<String> = media.iter().map(|s| normalize_media_alias(s)).collect();
|
||||
|
||||
// p9-fb-36: parse --ingested-after as RFC3339; structured error on failure.
|
||||
let ingested_after_parsed: Option<time::OffsetDateTime> =
|
||||
@@ -845,8 +866,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
) {
|
||||
Ok(ts) => Some(ts),
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(
|
||||
kebab_app::StructuredError(kebab_app::ErrorV1 {
|
||||
return Err(anyhow::Error::new(kebab_app::StructuredError(
|
||||
kebab_app::ErrorV1 {
|
||||
schema_version: kebab_app::ERROR_V1_ID.to_string(),
|
||||
code: "config_invalid".to_string(),
|
||||
message: format!(
|
||||
@@ -856,8 +877,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
hint: Some(
|
||||
"expected format like 2026-04-01T00:00:00Z".to_string(),
|
||||
),
|
||||
}),
|
||||
));
|
||||
},
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -932,11 +953,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
};
|
||||
println!(
|
||||
"{:>2}. {:.4} {}{}{}",
|
||||
h.rank,
|
||||
h.retrieval.fusion_score,
|
||||
stale_tag,
|
||||
h.doc_path.0,
|
||||
heading,
|
||||
h.rank, h.retrieval.fusion_score, stale_tag, h.doc_path.0, heading,
|
||||
);
|
||||
}
|
||||
// p9-fb-34: truncation hint goes to stderr so it
|
||||
@@ -958,15 +975,33 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
if let Some(t) = &resp.trace {
|
||||
eprintln!();
|
||||
eprintln!("Trace:");
|
||||
eprintln!(" lexical ({} hits, {}ms):", t.lexical.len(), t.timing.lexical_ms);
|
||||
eprintln!(
|
||||
" lexical ({} hits, {}ms):",
|
||||
t.lexical.len(),
|
||||
t.timing.lexical_ms
|
||||
);
|
||||
for c in t.lexical.iter().take(3) {
|
||||
eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
|
||||
eprintln!(
|
||||
" rank={} score={:.4} chunk={}",
|
||||
c.rank, c.score, c.chunk_id.0
|
||||
);
|
||||
}
|
||||
eprintln!(" vector ({} hits, {}ms):", t.vector.len(), t.timing.vector_ms);
|
||||
eprintln!(
|
||||
" vector ({} hits, {}ms):",
|
||||
t.vector.len(),
|
||||
t.timing.vector_ms
|
||||
);
|
||||
for c in t.vector.iter().take(3) {
|
||||
eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
|
||||
eprintln!(
|
||||
" rank={} score={:.4} chunk={}",
|
||||
c.rank, c.score, c.chunk_id.0
|
||||
);
|
||||
}
|
||||
eprintln!(" fusion ({} inputs, {}ms)", t.rrf_inputs.len(), t.timing.fusion_ms);
|
||||
eprintln!(
|
||||
" fusion ({} inputs, {}ms)",
|
||||
t.rrf_inputs.len(),
|
||||
t.timing.fusion_ms
|
||||
);
|
||||
eprintln!(" total: {}ms", t.timing.total_ms);
|
||||
}
|
||||
}
|
||||
@@ -988,6 +1023,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
multi_hop,
|
||||
} => {
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
if query.trim().is_empty() {
|
||||
return Err(anyhow::Error::new(kebab_app::StructuredError(
|
||||
kebab_app::ErrorV1 {
|
||||
schema_version: kebab_app::ERROR_V1_ID.to_string(),
|
||||
code: "invalid_input".to_string(),
|
||||
message: "query is empty; provide a non-empty prompt".into(),
|
||||
details: serde_json::Value::Null,
|
||||
hint: Some("e.g. `kebab ask \"explain this code\"`".into()),
|
||||
},
|
||||
)));
|
||||
}
|
||||
if *stream {
|
||||
// p9-fb-33: streaming branch. Background thread runs
|
||||
// ask_with_config (which calls into the rag pipeline);
|
||||
@@ -1017,16 +1063,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
let cfg2 = cfg.clone();
|
||||
let q = query.clone();
|
||||
let session2 = session.clone();
|
||||
let handle = std::thread::spawn(
|
||||
move || -> anyhow::Result<kebab_core::Answer> {
|
||||
match session2.as_deref() {
|
||||
Some(sid) => kebab_app::ask_with_session_with_config(
|
||||
cfg2, sid, &q, opts,
|
||||
),
|
||||
None => kebab_app::ask_with_config(cfg2, &q, opts),
|
||||
}
|
||||
},
|
||||
);
|
||||
let handle = std::thread::spawn(move || -> anyhow::Result<kebab_core::Answer> {
|
||||
match session2.as_deref() {
|
||||
Some(sid) => kebab_app::ask_with_session_with_config(cfg2, sid, &q, opts),
|
||||
None => kebab_app::ask_with_config(cfg2, &q, opts),
|
||||
}
|
||||
});
|
||||
|
||||
// Drain receiver, write ndjson to stderr until
|
||||
// completion or BrokenPipe.
|
||||
@@ -1302,9 +1344,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
println!("{}", serde_json::to_string_pretty(&agg)?);
|
||||
} else {
|
||||
println!("run_id: {run_id}");
|
||||
println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries);
|
||||
println!("hit@1: {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0));
|
||||
println!("hit@5: {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0));
|
||||
println!(
|
||||
"queries: {} ({} failed)",
|
||||
agg.total_queries, agg.failed_queries
|
||||
);
|
||||
println!(
|
||||
"hit@1: {:.4}",
|
||||
agg.hit_at_k.get(&1).copied().unwrap_or(0.0)
|
||||
);
|
||||
println!(
|
||||
"hit@5: {:.4}",
|
||||
agg.hit_at_k.get(&5).copied().unwrap_or(0.0)
|
||||
);
|
||||
println!("MRR: {:.4}", agg.mrr);
|
||||
}
|
||||
Ok(())
|
||||
@@ -1354,8 +1405,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
} else {
|
||||
println!(
|
||||
"ingest-file: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
|
||||
report.scanned, report.new, report.updated,
|
||||
report.unchanged, report.skipped, report.errors
|
||||
report.scanned,
|
||||
report.new,
|
||||
report.updated,
|
||||
report.unchanged,
|
||||
report.skipped,
|
||||
report.errors
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
@@ -1368,20 +1423,20 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
.read_to_string(&mut body)
|
||||
.context("kebab ingest-stdin: read stdin")?;
|
||||
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
|
||||
let report = kebab_app::ingest_stdin_with_config(
|
||||
cfg,
|
||||
&body,
|
||||
title,
|
||||
source_uri.as_deref(),
|
||||
)?;
|
||||
let report =
|
||||
kebab_app::ingest_stdin_with_config(cfg, &body, title, source_uri.as_deref())?;
|
||||
if cli.json {
|
||||
let v = wire::wire_ingest(&report);
|
||||
println!("{}", serde_json::to_string(&v)?);
|
||||
} else {
|
||||
println!(
|
||||
"ingest-stdin: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
|
||||
report.scanned, report.new, report.updated,
|
||||
report.unchanged, report.skipped, report.errors
|
||||
report.scanned,
|
||||
report.new,
|
||||
report.updated,
|
||||
report.unchanged,
|
||||
report.skipped,
|
||||
report.errors
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
@@ -1410,10 +1465,7 @@ fn render_ask_plain_citations(
|
||||
writeln!(w)?;
|
||||
writeln!(w, "근거:")?;
|
||||
for (idx, c) in ans.citations.iter().enumerate() {
|
||||
let marker = c
|
||||
.marker
|
||||
.clone()
|
||||
.unwrap_or_else(|| format!("{}", idx + 1));
|
||||
let marker = c.marker.clone().unwrap_or_else(|| format!("{}", idx + 1));
|
||||
// p9-fb-32: `[stale]` prefix on the URI for citations whose
|
||||
// `stale: true`. Yellow on TTY, plain otherwise — mirrors the
|
||||
// search-plain renderer in `Cmd::Search`.
|
||||
@@ -1474,7 +1526,10 @@ fn print_schema_text(s: &kebab_app::SchemaV1) {
|
||||
println!(" parser_version {}", s.models.parser_version);
|
||||
println!(" chunker_version {}", s.models.chunker_version);
|
||||
println!(" embedding_version {}", s.models.embedding_version);
|
||||
println!(" prompt_template_version {}", s.models.prompt_template_version);
|
||||
println!(
|
||||
" prompt_template_version {}",
|
||||
s.models.prompt_template_version
|
||||
);
|
||||
println!(" index_version {}", s.models.index_version);
|
||||
println!(" corpus_revision {}", s.models.corpus_revision);
|
||||
println!();
|
||||
@@ -1523,9 +1578,7 @@ fn confirm_destructive(
|
||||
/// Confirm prompt for `--orphans-only`: shows the orphan count + a
|
||||
/// sample of up to 5 paths so the user knows what will be purged before
|
||||
/// committing. No filesystem paths are removed — only store records.
|
||||
fn confirm_orphans_only(
|
||||
orphan_paths: &[kebab_core::WorkspacePath],
|
||||
) -> anyhow::Result<bool> {
|
||||
fn confirm_orphans_only(orphan_paths: &[kebab_core::WorkspacePath]) -> anyhow::Result<bool> {
|
||||
use std::io::Write;
|
||||
let n = orphan_paths.len();
|
||||
let mut out = std::io::stderr().lock();
|
||||
@@ -1538,11 +1591,7 @@ fn confirm_orphans_only(
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
let sample: Vec<&str> = orphan_paths
|
||||
.iter()
|
||||
.take(5)
|
||||
.map(|p| p.0.as_str())
|
||||
.collect();
|
||||
let sample: Vec<&str> = orphan_paths.iter().take(5).map(|p| p.0.as_str()).collect();
|
||||
let sample_str = sample.join(", ");
|
||||
let ellipsis = if n > 5 { ", …" } else { "" };
|
||||
|
||||
@@ -1571,19 +1620,28 @@ fn render_fetch_plain(r: &kebab_core::FetchResult) {
|
||||
if !r.context_before.is_empty() {
|
||||
println!("\n=== before ===");
|
||||
for c in &r.context_before {
|
||||
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
|
||||
let heading = c
|
||||
.heading_path
|
||||
.last()
|
||||
.map_or("", std::string::String::as_str);
|
||||
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
|
||||
}
|
||||
}
|
||||
if let Some(c) = &r.chunk {
|
||||
println!("\n=== target ===");
|
||||
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
|
||||
let heading = c
|
||||
.heading_path
|
||||
.last()
|
||||
.map_or("", std::string::String::as_str);
|
||||
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
|
||||
}
|
||||
if !r.context_after.is_empty() {
|
||||
println!("\n=== after ===");
|
||||
for c in &r.context_after {
|
||||
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
|
||||
let heading = c
|
||||
.heading_path
|
||||
.last()
|
||||
.map_or("", std::string::String::as_str);
|
||||
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
|
||||
}
|
||||
}
|
||||
@@ -1615,8 +1673,8 @@ mod tests {
|
||||
//! against a synthetic `Answer` instead.
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef,
|
||||
PromptTemplateVersion, SearchMode, TokenUsage, TraceId, WorkspacePath,
|
||||
Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, PromptTemplateVersion,
|
||||
SearchMode, TokenUsage, TraceId, WorkspacePath,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -1712,4 +1770,3 @@ mod tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -124,11 +124,9 @@ impl ProgressDisplay {
|
||||
bar.set_length(u64::from(*total));
|
||||
bar.set_position(0);
|
||||
bar.set_style(
|
||||
ProgressStyle::with_template(
|
||||
"ingest [{bar:30}] {pos}/{len} {wide_msg}",
|
||||
)
|
||||
.unwrap()
|
||||
.progress_chars("=> "),
|
||||
ProgressStyle::with_template("ingest [{bar:30}] {pos}/{len} {wide_msg}")
|
||||
.unwrap()
|
||||
.progress_chars("=> "),
|
||||
);
|
||||
bar.set_message("");
|
||||
}
|
||||
@@ -170,11 +168,7 @@ impl ProgressDisplay {
|
||||
let _ = writeln!(
|
||||
err,
|
||||
"ingest: complete (scanned={} new={} updated={} skipped={} errors={})",
|
||||
counts.scanned,
|
||||
counts.new,
|
||||
counts.updated,
|
||||
counts.skipped,
|
||||
counts.errors,
|
||||
counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -193,14 +187,42 @@ impl ProgressDisplay {
|
||||
let _ = writeln!(
|
||||
err,
|
||||
"ingest: aborted (scanned={} new={} updated={} skipped={} errors={})",
|
||||
counts.scanned,
|
||||
counts.new,
|
||||
counts.updated,
|
||||
counts.skipped,
|
||||
counts.errors,
|
||||
counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
|
||||
);
|
||||
}
|
||||
}
|
||||
// v0.20.0 sub-item 1: per-page PDF OCR events — sub-progress lines
|
||||
// under AssetStarted for scanned PDF. spec §4.6.1 line 1085-1086.
|
||||
// skipped=true 시 (DCTDecode 부재 또는 engine fail) skip line.
|
||||
IngestEvent::PdfOcrStarted { page } => {
|
||||
if !quiet {
|
||||
let mut err = std::io::stderr().lock();
|
||||
let _ = writeln!(err, " 📷 OCR page {page}...");
|
||||
}
|
||||
}
|
||||
IngestEvent::PdfOcrFinished {
|
||||
page,
|
||||
ms,
|
||||
chars,
|
||||
ocr_engine,
|
||||
skipped,
|
||||
..
|
||||
} => {
|
||||
if !quiet {
|
||||
let mut err = std::io::stderr().lock();
|
||||
if *skipped {
|
||||
let _ = writeln!(
|
||||
err,
|
||||
" ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)"
|
||||
);
|
||||
} else {
|
||||
let _ = writeln!(
|
||||
err,
|
||||
" ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -231,7 +253,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn from_flags_json_takes_priority_over_tty() {
|
||||
assert_eq!(ProgressMode::from_flags(true, false, false), ProgressMode::Json);
|
||||
assert_eq!(
|
||||
ProgressMode::from_flags(true, false, false),
|
||||
ProgressMode::Json
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -114,10 +114,7 @@ pub fn wire_answer(a: &Answer) -> Value {
|
||||
/// The timestamp is added at emit time (caller fills `ts`), since the
|
||||
/// pipeline doesn't carry one in the in-process enum — mirrors the
|
||||
/// `wire_ingest_progress` pattern (§2 ingest_progress.v1).
|
||||
pub fn wire_answer_event(
|
||||
ev: &kebab_app::StreamEvent,
|
||||
ts: time::OffsetDateTime,
|
||||
) -> Value {
|
||||
pub fn wire_answer_event(ev: &kebab_app::StreamEvent, ts: time::OffsetDateTime) -> Value {
|
||||
let mut v = serde_json::to_value(ev).expect("StreamEvent serializes");
|
||||
let ts_str = ts
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
@@ -161,9 +158,7 @@ pub fn wire_reset(r: &kebab_app::ResetReport) -> Value {
|
||||
/// wall-clock — the emit site is the only place that knows the moment
|
||||
/// of emission, so the timestamp is stamped here rather than carried
|
||||
/// on the event itself.
|
||||
pub fn wire_ingest_progress(
|
||||
event: &kebab_app::IngestEvent,
|
||||
) -> anyhow::Result<Value> {
|
||||
pub fn wire_ingest_progress(event: &kebab_app::IngestEvent) -> anyhow::Result<Value> {
|
||||
let mut v = serde_json::to_value(event)?;
|
||||
if let Value::Object(ref mut map) = v {
|
||||
map.insert(
|
||||
@@ -305,15 +300,15 @@ mod tests {
|
||||
let v = wire_search_response(&r);
|
||||
assert_eq!(schema_of(&v), Some("search_response.v1"));
|
||||
assert!(v.get("hits").and_then(|h| h.as_array()).is_some());
|
||||
assert_eq!(
|
||||
v.get("hits").and_then(|h| h.as_array()).unwrap().len(),
|
||||
0
|
||||
);
|
||||
assert_eq!(v.get("hits").and_then(|h| h.as_array()).unwrap().len(), 0);
|
||||
assert_eq!(
|
||||
v.get("next_cursor").and_then(|c| c.as_str()),
|
||||
Some("opaque-cursor-abc")
|
||||
);
|
||||
assert_eq!(v.get("truncated").and_then(serde_json::Value::as_bool), Some(true));
|
||||
assert_eq!(
|
||||
v.get("truncated").and_then(serde_json::Value::as_bool),
|
||||
Some(true)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -322,23 +317,36 @@ mod tests {
|
||||
let schema = SchemaV1 {
|
||||
schema_version: "schema.v1".to_string(),
|
||||
kebab_version: "0.2.1".to_string(),
|
||||
wire: WireBlock { schemas: vec!["answer.v1".to_string()] },
|
||||
wire: WireBlock {
|
||||
schemas: vec!["answer.v1".to_string()],
|
||||
},
|
||||
capabilities: Capabilities {
|
||||
json_mode: true, ingest_progress: true, ingest_cancellation: true,
|
||||
rag_multi_turn: true, search_cache: true, incremental_ingest: true,
|
||||
streaming_ask: false, http_daemon: false, mcp_server: false,
|
||||
single_file_ingest: false, bulk_search: true,
|
||||
json_mode: true,
|
||||
ingest_progress: true,
|
||||
ingest_cancellation: true,
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
http_daemon: false,
|
||||
mcp_server: false,
|
||||
single_file_ingest: false,
|
||||
bulk_search: true,
|
||||
},
|
||||
models: Models {
|
||||
parser_version: "x".to_string(),
|
||||
chunker_version: "y".to_string(),
|
||||
active_parsers: vec![],
|
||||
active_chunkers: vec![],
|
||||
embedding_version: "z".to_string(),
|
||||
prompt_template_version: "w".to_string(),
|
||||
index_version: "v".to_string(),
|
||||
corpus_revision: 7,
|
||||
},
|
||||
stats: Stats {
|
||||
doc_count: 1, chunk_count: 2, asset_count: 1,
|
||||
doc_count: 1,
|
||||
chunk_count: 2,
|
||||
asset_count: 1,
|
||||
last_ingest_at: None,
|
||||
media_breakdown: Default::default(),
|
||||
lang_breakdown: Default::default(),
|
||||
@@ -350,7 +358,10 @@ mod tests {
|
||||
};
|
||||
let v = wire_schema(&schema);
|
||||
assert_eq!(schema_of(&v), Some("schema.v1"));
|
||||
assert_eq!(v.get("kebab_version").and_then(Value::as_str), Some("0.2.1"));
|
||||
assert_eq!(
|
||||
v.get("kebab_version").and_then(Value::as_str),
|
||||
Some("0.2.1")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -365,7 +376,10 @@ mod tests {
|
||||
};
|
||||
let v = wire_error_v1(&err);
|
||||
assert_eq!(schema_of(&v), Some("error.v1"));
|
||||
assert_eq!(v.get("code").and_then(Value::as_str), Some("config_invalid"));
|
||||
assert_eq!(
|
||||
v.get("code").and_then(Value::as_str),
|
||||
Some("config_invalid")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -391,8 +405,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn search_response_with_trace_serializes_trace_field() {
|
||||
use kebab_core::{SearchTrace, TraceCandidate, TraceFusionInput,
|
||||
TraceTiming, ChunkId, DocumentId, WorkspacePath};
|
||||
use kebab_core::{
|
||||
ChunkId, DocumentId, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming,
|
||||
WorkspacePath,
|
||||
};
|
||||
let r = kebab_app::SearchResponse {
|
||||
hits: vec![],
|
||||
next_cursor: None,
|
||||
@@ -412,7 +428,12 @@ mod tests {
|
||||
vector_rank: None,
|
||||
fusion_score: 0.0,
|
||||
}],
|
||||
timing: TraceTiming { lexical_ms: 5, vector_ms: 0, fusion_ms: 1, total_ms: 7 },
|
||||
timing: TraceTiming {
|
||||
lexical_ms: 5,
|
||||
vector_ms: 0,
|
||||
fusion_ms: 1,
|
||||
total_ms: 7,
|
||||
},
|
||||
}),
|
||||
hint: None,
|
||||
};
|
||||
|
||||
64
crates/kebab-cli/tests/cli_config_not_found.rs
Normal file
64
crates/kebab-cli/tests/cli_config_not_found.rs
Normal file
@@ -0,0 +1,64 @@
|
||||
//! Integration tests for Bug #10: explicit --config <path> that does not exist
|
||||
//! must fail with exit≠0 and error.v1 code=config_not_found (not silently fall
|
||||
//! back to XDG defaults).
|
||||
|
||||
use serde_json::Value;
|
||||
use std::process::Command;
|
||||
|
||||
fn kebab_bin() -> String {
|
||||
env!("CARGO_BIN_EXE_kebab").to_string()
|
||||
}
|
||||
|
||||
fn parse_error_v1(stderr: &str) -> Value {
|
||||
let last = stderr
|
||||
.lines()
|
||||
.last()
|
||||
.expect("expected error.v1 ndjson on stderr");
|
||||
serde_json::from_str(last)
|
||||
.unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}"))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_config_path_emits_error_v1_with_nonzero_exit() {
|
||||
let absent = "/tmp/__kebab_bugfix3_absolute_nonexistent.toml";
|
||||
assert!(!std::path::Path::new(absent).exists());
|
||||
|
||||
let out = Command::new(kebab_bin())
|
||||
.args(["search", "rust", "--config", absent, "--json"])
|
||||
.output()
|
||||
.expect("spawn kebab");
|
||||
|
||||
assert_ne!(
|
||||
out.status.code(),
|
||||
Some(0),
|
||||
"exit must be nonzero on missing --config"
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
let v = parse_error_v1(&stderr);
|
||||
assert_eq!(v["schema_version"], "error.v1");
|
||||
assert_eq!(v["code"], "config_not_found");
|
||||
assert!(v["hint"].is_string(), "hint must be present");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_relative_config_path_emits_config_not_found() {
|
||||
// Bug #10 spec §6 R-1: relative path も cwd-relative で cover.
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let out = Command::new(kebab_bin())
|
||||
.args([
|
||||
"search",
|
||||
"rust",
|
||||
"--config",
|
||||
"nonexistent-rel.toml",
|
||||
"--json",
|
||||
])
|
||||
.current_dir(tmp.path())
|
||||
.output()
|
||||
.expect("spawn kebab");
|
||||
|
||||
assert_ne!(out.status.code(), Some(0));
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
let v = parse_error_v1(&stderr);
|
||||
assert_eq!(v["schema_version"], "error.v1");
|
||||
assert_eq!(v["code"], "config_not_found");
|
||||
}
|
||||
50
crates/kebab-cli/tests/cli_empty_query.rs
Normal file
50
crates/kebab-cli/tests/cli_empty_query.rs
Normal file
@@ -0,0 +1,50 @@
|
||||
//! Integration tests for Bug #14: empty or whitespace-only query must emit
|
||||
//! error.v1 code=invalid_input and exit nonzero (not silent 0-hit return).
|
||||
|
||||
use serde_json::Value;
|
||||
use std::process::Command;
|
||||
|
||||
fn kebab_bin() -> String {
|
||||
env!("CARGO_BIN_EXE_kebab").to_string()
|
||||
}
|
||||
|
||||
fn parse_error_v1(stderr: &str) -> Value {
|
||||
let last = stderr
|
||||
.lines()
|
||||
.last()
|
||||
.expect("expected error.v1 ndjson on stderr");
|
||||
serde_json::from_str(last)
|
||||
.unwrap_or_else(|e| panic!("expected ndjson on stderr: {e}\nstderr={stderr}"))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_empty_query_emits_invalid_input() {
|
||||
for q in ["", " "] {
|
||||
let out = Command::new(kebab_bin())
|
||||
.args(["search", q, "--json"])
|
||||
.output()
|
||||
.expect("spawn kebab");
|
||||
assert_ne!(
|
||||
out.status.code(),
|
||||
Some(0),
|
||||
"empty/whitespace query must fail (q={q:?})"
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
let v = parse_error_v1(&stderr);
|
||||
assert_eq!(v["schema_version"], "error.v1", "stderr={stderr}");
|
||||
assert_eq!(v["code"], "invalid_input", "stderr={stderr}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ask_empty_query_emits_invalid_input() {
|
||||
let out = Command::new(kebab_bin())
|
||||
.args(["ask", "", "--json"])
|
||||
.output()
|
||||
.expect("spawn kebab");
|
||||
assert_ne!(out.status.code(), Some(0));
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
let v = parse_error_v1(&stderr);
|
||||
assert_eq!(v["schema_version"], "error.v1");
|
||||
assert_eq!(v["code"], "invalid_input");
|
||||
}
|
||||
@@ -2,11 +2,10 @@
|
||||
//! on stderr while non-json mode emits the legacy `error:` text prefix.
|
||||
//!
|
||||
//! The `config_invalid` code is triggered by supplying an *existing* but
|
||||
//! malformed TOML file via `--config`. Note: supplying a *non-existent*
|
||||
//! path does NOT trigger this error — Config::load silently falls back to
|
||||
//! defaults when the specified config file is absent (by design, so that
|
||||
//! `kebab doctor` runs before `kebab init` is ever called). A file that
|
||||
//! exists but fails TOML parsing is the reliable path to `config_invalid`.
|
||||
//! malformed TOML file via `--config`. A file that exists but fails TOML
|
||||
//! parsing is the reliable path to `config_invalid`. Supplying a path that
|
||||
//! does not exist emits `config_not_found` instead (Bug #10 fix, v0.20.0
|
||||
//! bugfix3); see `cli_config_not_found.rs` for those tests.
|
||||
|
||||
use std::process::Command;
|
||||
|
||||
@@ -37,12 +36,7 @@ fn json_mode_emits_error_v1_on_config_invalid() {
|
||||
std::fs::write(&bad_config, b"this is not { valid toml !!!").unwrap();
|
||||
|
||||
let mut cmd = Command::new(kebab_bin());
|
||||
cmd.args([
|
||||
"--json",
|
||||
"--config",
|
||||
bad_config.to_str().unwrap(),
|
||||
"ingest",
|
||||
]);
|
||||
cmd.args(["--json", "--config", bad_config.to_str().unwrap(), "ingest"]);
|
||||
for (k, v) in xdg_envs(tmp.path()) {
|
||||
cmd.env(k, v);
|
||||
}
|
||||
@@ -56,7 +50,10 @@ fn json_mode_emits_error_v1_on_config_invalid() {
|
||||
assert_eq!(exit_code, 2, "expected exit code 2, got {exit_code}");
|
||||
|
||||
let stderr = String::from_utf8(out.stderr).unwrap();
|
||||
let first_line = stderr.lines().next().expect("stderr must have at least one line");
|
||||
let first_line = stderr
|
||||
.lines()
|
||||
.next()
|
||||
.expect("stderr must have at least one line");
|
||||
let v: serde_json::Value =
|
||||
serde_json::from_str(first_line).expect("stderr first line must be valid JSON");
|
||||
|
||||
|
||||
17
crates/kebab-cli/tests/cli_help_smoke.rs
Normal file
17
crates/kebab-cli/tests/cli_help_smoke.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
// crates/kebab-cli/tests/cli_help_smoke.rs
|
||||
//
|
||||
// Regression pin — `kebab search --help` 의 `--media` value list 가
|
||||
// `code` 를 노출. Bug #7 (v0.20.0 bugfix round 2 spec §4.4).
|
||||
|
||||
#[test]
|
||||
fn search_help_lists_code_in_media_values() {
|
||||
let out = std::process::Command::new(env!("CARGO_BIN_EXE_kebab"))
|
||||
.args(["search", "--help"])
|
||||
.output()
|
||||
.expect("kebab search --help");
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
assert!(
|
||||
stdout.contains("`code`"),
|
||||
"search --help must list 'code' as accepted --media value; stdout = {stdout}"
|
||||
);
|
||||
}
|
||||
@@ -72,21 +72,34 @@ max_context_tokens = 8000
|
||||
workspace = workspace.display(),
|
||||
data = data.display(),
|
||||
),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let src = dir.path().join("doc.md");
|
||||
fs::write(&src, "# A\n\nbody.").unwrap();
|
||||
|
||||
let bin = env!("CARGO_BIN_EXE_kebab");
|
||||
let out = Command::new(bin)
|
||||
.args(["--json", "--config", cfg_path.to_str().unwrap(), "ingest-file"])
|
||||
.args([
|
||||
"--json",
|
||||
"--config",
|
||||
cfg_path.to_str().unwrap(),
|
||||
"ingest-file",
|
||||
])
|
||||
.arg(&src)
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"stderr: {}",
|
||||
String::from_utf8_lossy(&out.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
|
||||
assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
|
||||
assert_eq!(
|
||||
v.get("schema_version").and_then(|s| s.as_str()),
|
||||
Some("ingest_report.v1")
|
||||
);
|
||||
assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
|
||||
}
|
||||
|
||||
@@ -73,13 +73,18 @@ max_context_tokens = 8000
|
||||
workspace = workspace.display(),
|
||||
data = data.display(),
|
||||
),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let bin = env!("CARGO_BIN_EXE_kebab");
|
||||
let mut child = Command::new(bin)
|
||||
.args([
|
||||
"--json", "--config", cfg_path.to_str().unwrap(),
|
||||
"ingest-stdin", "--title", "X",
|
||||
"--json",
|
||||
"--config",
|
||||
cfg_path.to_str().unwrap(),
|
||||
"ingest-stdin",
|
||||
"--title",
|
||||
"X",
|
||||
])
|
||||
.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
@@ -91,10 +96,17 @@ max_context_tokens = 8000
|
||||
stdin.write_all(b"## Body\n\nbody text.\n").unwrap();
|
||||
}
|
||||
let out = child.wait_with_output().unwrap();
|
||||
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"stderr: {}",
|
||||
String::from_utf8_lossy(&out.stderr)
|
||||
);
|
||||
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
|
||||
assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
|
||||
assert_eq!(
|
||||
v.get("schema_version").and_then(|s| s.as_str()),
|
||||
Some("ingest_report.v1")
|
||||
);
|
||||
assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
|
||||
}
|
||||
|
||||
@@ -112,7 +112,13 @@ fn kebab_readonly_env_blocks_ingest() {
|
||||
fn readonly_json_mode_emits_error_v1() {
|
||||
let (tmp, ws) = fixture_workspace();
|
||||
let out = Command::new(kebab_bin())
|
||||
.args(["--readonly", "--json", "ingest", "--root", ws.to_str().unwrap()])
|
||||
.args([
|
||||
"--readonly",
|
||||
"--json",
|
||||
"ingest",
|
||||
"--root",
|
||||
ws.to_str().unwrap(),
|
||||
])
|
||||
.envs(xdg_envs(tmp.path()))
|
||||
.output()
|
||||
.unwrap();
|
||||
@@ -164,12 +170,22 @@ fn quiet_flag_suppresses_progress_stderr() {
|
||||
fn quiet_with_json_stdout_has_report_stderr_is_empty() {
|
||||
let (tmp, ws) = fixture_workspace();
|
||||
let out = Command::new(kebab_bin())
|
||||
.args(["--quiet", "--json", "ingest", "--root", ws.to_str().unwrap()])
|
||||
.args([
|
||||
"--quiet",
|
||||
"--json",
|
||||
"ingest",
|
||||
"--root",
|
||||
ws.to_str().unwrap(),
|
||||
])
|
||||
.envs(xdg_envs(tmp.path()))
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"stderr: {}",
|
||||
String::from_utf8_lossy(&out.stderr)
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
assert!(stderr.is_empty(), "expected empty stderr, got: {stderr}");
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
|
||||
@@ -90,12 +90,7 @@ fn ingest_human_non_tty_emits_progress_lines_to_stderr() {
|
||||
// target is `hidden` and progress lines go to stderr instead.
|
||||
let (tmp, ws) = fixture_workspace();
|
||||
let mut cmd = Command::new(kebab_bin());
|
||||
cmd.args([
|
||||
"ingest",
|
||||
"--root",
|
||||
ws.to_str().unwrap(),
|
||||
"--summary-only",
|
||||
]);
|
||||
cmd.args(["ingest", "--root", ws.to_str().unwrap(), "--summary-only"]);
|
||||
for (k, v) in xdg_envs(tmp.path()) {
|
||||
cmd.env(k, v);
|
||||
}
|
||||
@@ -155,8 +150,14 @@ fn ingest_json_progress_lines_carry_kind_and_ts() {
|
||||
saw_completed = true;
|
||||
// Counts mirror the report.
|
||||
let counts = v.get("counts").unwrap();
|
||||
assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(2));
|
||||
assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
|
||||
assert_eq!(
|
||||
counts.get("scanned").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
assert_eq!(
|
||||
counts.get("new").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
}
|
||||
}
|
||||
assert!(saw_scan_started, "missing scan_started event");
|
||||
|
||||
@@ -50,9 +50,18 @@ fn reset_data_only_yes_removes_data_dir_and_keeps_config() {
|
||||
);
|
||||
|
||||
assert!(!xdg_data.join("kebab").exists(), "data dir should be gone");
|
||||
assert!(!xdg_cache.join("kebab").exists(), "cache dir should be gone");
|
||||
assert!(!xdg_state.join("kebab").exists(), "state dir should be gone");
|
||||
assert!(xdg_cfg.join("kebab/marker").exists(), "config dir preserved");
|
||||
assert!(
|
||||
!xdg_cache.join("kebab").exists(),
|
||||
"cache dir should be gone"
|
||||
);
|
||||
assert!(
|
||||
!xdg_state.join("kebab").exists(),
|
||||
"state dir should be gone"
|
||||
);
|
||||
assert!(
|
||||
xdg_cfg.join("kebab/marker").exists(),
|
||||
"config dir preserved"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -101,7 +110,11 @@ fn reset_data_only_yes_json_emits_reset_report_v1() {
|
||||
.env("XDG_STATE_HOME", tmp.path().join("state"))
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
|
||||
assert!(
|
||||
out.status.success(),
|
||||
"stderr: {}",
|
||||
String::from_utf8_lossy(&out.stderr)
|
||||
);
|
||||
|
||||
let v: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -32,10 +32,9 @@ fn schema_path(name: &str) -> PathBuf {
|
||||
}
|
||||
|
||||
fn parse_schema(name: &str) -> serde_json::Value {
|
||||
let text = std::fs::read_to_string(schema_path(name))
|
||||
.unwrap_or_else(|e| panic!("read {name}: {e}"));
|
||||
serde_json::from_str(&text)
|
||||
.unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
|
||||
let text =
|
||||
std::fs::read_to_string(schema_path(name)).unwrap_or_else(|e| panic!("read {name}: {e}"));
|
||||
serde_json::from_str(&text).unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -41,8 +41,7 @@ fn relax_score_gate(cfg: &Path) {
|
||||
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
|
||||
fn stream_emits_ndjson_events_on_stderr() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let (cfg, workspace, _data) =
|
||||
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
relax_score_gate(&cfg);
|
||||
fs::write(
|
||||
workspace.join("a.md"),
|
||||
@@ -93,12 +92,8 @@ fn stream_emits_ndjson_events_on_stderr() {
|
||||
// stdout: last line is answer.v1 (backwards compat with the
|
||||
// non-streaming path — same wire shape, just emitted after the
|
||||
// ndjson event stream rather than instead of it).
|
||||
let final_line = stdout
|
||||
.lines()
|
||||
.last()
|
||||
.expect("stdout has at least one line");
|
||||
let answer: Value =
|
||||
serde_json::from_str(final_line).expect("stdout final line = answer.v1");
|
||||
let final_line = stdout.lines().last().expect("stdout has at least one line");
|
||||
let answer: Value = serde_json::from_str(final_line).expect("stdout final line = answer.v1");
|
||||
assert_eq!(answer["schema_version"], "answer.v1");
|
||||
}
|
||||
|
||||
@@ -109,8 +104,7 @@ fn non_stream_path_unchanged() {
|
||||
// emits a single `answer.v1` line on stdout — fb-33 must not
|
||||
// perturb the existing wire surface.
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let (cfg, workspace, _data) =
|
||||
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
relax_score_gate(&cfg);
|
||||
fs::write(
|
||||
workspace.join("a.md"),
|
||||
@@ -140,8 +134,7 @@ fn stream_cancels_when_stderr_closes() {
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let (cfg, workspace, _data) =
|
||||
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
relax_score_gate(&cfg);
|
||||
fs::write(
|
||||
workspace.join("a.md"),
|
||||
@@ -198,15 +191,10 @@ fn stream_cancels_when_stderr_closes() {
|
||||
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
|
||||
fn stream_score_gate_refusal_emits_only_retrieval_done() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let (cfg, workspace, _data) =
|
||||
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
|
||||
// Intentionally NO relax_score_gate — keep the default 0.30
|
||||
// so the thin-doc + unrelated-query combo trips refusal.
|
||||
fs::write(
|
||||
workspace.join("a.md"),
|
||||
"# Title\n\nrust is a language.\n",
|
||||
)
|
||||
.unwrap();
|
||||
fs::write(workspace.join("a.md"), "# Title\n\nrust is a language.\n").unwrap();
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, stderr) =
|
||||
@@ -230,12 +218,8 @@ fn stream_score_gate_refusal_emits_only_retrieval_done() {
|
||||
);
|
||||
|
||||
// Stdout still has answer.v1 with grounded=false.
|
||||
let final_line = stdout
|
||||
.lines()
|
||||
.last()
|
||||
.expect("stdout has at least one line");
|
||||
let answer: Value =
|
||||
serde_json::from_str(final_line).expect("answer.v1");
|
||||
let final_line = stdout.lines().last().expect("stdout has at least one line");
|
||||
let answer: Value = serde_json::from_str(final_line).expect("answer.v1");
|
||||
assert_eq!(answer["schema_version"], "answer.v1");
|
||||
assert_eq!(answer["grounded"], false);
|
||||
}
|
||||
|
||||
@@ -21,7 +21,11 @@ fn cargo_bin() -> &'static str {
|
||||
env!("CARGO_BIN_EXE_kebab")
|
||||
}
|
||||
|
||||
fn run_bulk_with_stdin(cfg: &std::path::Path, stdin_body: &str, json: bool) -> std::process::Output {
|
||||
fn run_bulk_with_stdin(
|
||||
cfg: &std::path::Path,
|
||||
stdin_body: &str,
|
||||
json: bool,
|
||||
) -> std::process::Output {
|
||||
let mut cmd = Command::new(cargo_bin());
|
||||
cmd.arg("--config").arg(cfg).arg("search").arg("--bulk");
|
||||
if json {
|
||||
@@ -94,7 +98,10 @@ fn empty_stdin_returns_empty_results_with_zero_summary() {
|
||||
let out = run_bulk_with_stdin(&cfg, "", true);
|
||||
assert!(out.status.success());
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
assert!(stdout.trim().is_empty(), "expected empty stdout, got: {stdout}");
|
||||
assert!(
|
||||
stdout.trim().is_empty(),
|
||||
"expected empty stdout, got: {stdout}"
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&out.stderr);
|
||||
assert!(stderr.contains("bulk_summary: total=0 succeeded=0 failed=0"));
|
||||
}
|
||||
|
||||
@@ -19,7 +19,10 @@ fn line_variant_serialization_unchanged() {
|
||||
assert_eq!(v["end"], 2);
|
||||
assert_eq!(v["section"], "§14");
|
||||
// Must not bleed Code-variant keys.
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(
|
||||
v.get("line_start").is_none(),
|
||||
"line_start must be absent: {v}"
|
||||
);
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
assert!(v.get("code").is_none(), "code must be absent: {v}");
|
||||
}
|
||||
@@ -48,7 +51,10 @@ fn page_variant_serialization_unchanged() {
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "page");
|
||||
assert_eq!(v["page"], 13);
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(
|
||||
v.get("line_start").is_none(),
|
||||
"line_start must be absent: {v}"
|
||||
);
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
}
|
||||
|
||||
@@ -67,7 +73,10 @@ fn region_variant_serialization_unchanged() {
|
||||
assert_eq!(v["y"], 20);
|
||||
assert_eq!(v["w"], 100);
|
||||
assert_eq!(v["h"], 200);
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(
|
||||
v.get("line_start").is_none(),
|
||||
"line_start must be absent: {v}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -79,7 +88,10 @@ fn caption_variant_serialization_unchanged() {
|
||||
let v = serde_json::to_value(&c).unwrap();
|
||||
assert_eq!(v["kind"], "caption");
|
||||
assert_eq!(v["model"], "qwen2.5-vl:7b");
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(
|
||||
v.get("line_start").is_none(),
|
||||
"line_start must be absent: {v}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -95,6 +107,9 @@ fn time_variant_serialization_unchanged() {
|
||||
assert_eq!(v["start_ms"], 1000);
|
||||
assert_eq!(v["end_ms"], 5000);
|
||||
assert_eq!(v["speaker"], "Alice");
|
||||
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
|
||||
assert!(
|
||||
v.get("line_start").is_none(),
|
||||
"line_start must be absent: {v}"
|
||||
);
|
||||
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
|
||||
}
|
||||
|
||||
@@ -24,10 +24,8 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// Find chunk_id via search.
|
||||
let (search_stdout, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "--k", "1", "apples"],
|
||||
);
|
||||
let (search_stdout, _) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "apples"]);
|
||||
let search: Value = serde_json::from_str(search_stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
|
||||
let chunk_id = search["hits"][0]["chunk_id"]
|
||||
@@ -35,10 +33,7 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
|
||||
.expect("chunk_id on first hit")
|
||||
.to_string();
|
||||
|
||||
let (stdout, _) = common::run_fetch_with_args(
|
||||
&cfg,
|
||||
&["--json", "chunk", &chunk_id],
|
||||
);
|
||||
let (stdout, _) = common::run_fetch_with_args(&cfg, &["--json", "chunk", &chunk_id]);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
|
||||
assert_eq!(v["schema_version"], "fetch_result.v1");
|
||||
@@ -59,10 +54,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// Find doc_id via search.
|
||||
let (search_stdout, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "--k", "1", "Lorem"],
|
||||
);
|
||||
let (search_stdout, _) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "Lorem"]);
|
||||
let search: Value = serde_json::from_str(search_stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
|
||||
let doc_id = search["hits"][0]["doc_id"]
|
||||
@@ -70,10 +63,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
|
||||
.expect("doc_id on first hit")
|
||||
.to_string();
|
||||
|
||||
let (stdout, _) = common::run_fetch_with_args(
|
||||
&cfg,
|
||||
&["--json", "doc", &doc_id, "--max-tokens", "20"],
|
||||
);
|
||||
let (stdout, _) =
|
||||
common::run_fetch_with_args(&cfg, &["--json", "doc", &doc_id, "--max-tokens", "20"]);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
|
||||
assert_eq!(v["kind"], "doc");
|
||||
|
||||
@@ -32,12 +32,9 @@ fn search_with_doc_id_filter_returns_only_target_doc() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// First, search without a doc-id filter to find what doc_ids exist.
|
||||
let (stdout, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "rust"],
|
||||
);
|
||||
let resp: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let (stdout, _) = common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
|
||||
let resp: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let hits = resp["hits"].as_array().expect("hits array");
|
||||
assert!(
|
||||
hits.len() >= 2,
|
||||
@@ -147,15 +144,19 @@ fn search_with_media_filter_md_alias_normalizes_to_markdown() {
|
||||
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
|
||||
|
||||
// Only a markdown file — the `md` alias should match it.
|
||||
fs::write(workspace.join("notes.md"), "# Notes\n\nrust async programming\n").unwrap();
|
||||
fs::write(
|
||||
workspace.join("notes.md"),
|
||||
"# Notes\n\nrust async programming\n",
|
||||
)
|
||||
.unwrap();
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "--media", "md", "rust"],
|
||||
);
|
||||
let resp: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let resp: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let hits = resp["hits"].as_array().expect("hits array");
|
||||
|
||||
assert!(
|
||||
@@ -189,10 +190,8 @@ fn search_with_tag_filter_matches_frontmatter_tags() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// Without filter — both docs must produce hits.
|
||||
let (unfiltered, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "rust"],
|
||||
);
|
||||
let (unfiltered, _) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
|
||||
let uresp: Value = serde_json::from_str(unfiltered.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
|
||||
let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
|
||||
@@ -254,10 +253,8 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// Without filter: all three docs produce hits.
|
||||
let (unfiltered, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "rust"],
|
||||
);
|
||||
let (unfiltered, _) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
|
||||
let uresp: Value = serde_json::from_str(unfiltered.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
|
||||
let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
|
||||
@@ -270,10 +267,7 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
|
||||
let (filtered, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&[
|
||||
"--json", "--mode", "lexical",
|
||||
"--tag", "rust",
|
||||
"--tag", "async",
|
||||
"rust",
|
||||
"--json", "--mode", "lexical", "--tag", "rust", "--tag", "async", "rust",
|
||||
],
|
||||
);
|
||||
let fresp: Value = serde_json::from_str(filtered.trim())
|
||||
@@ -301,6 +295,12 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
|
||||
.collect();
|
||||
let has_a = paths.iter().any(|p| p.ends_with("a.md"));
|
||||
let has_b = paths.iter().any(|p| p.ends_with("b.md"));
|
||||
assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}");
|
||||
assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}");
|
||||
assert!(
|
||||
has_a,
|
||||
"--tag rust must include a.md (rust-tagged): paths={paths:?}"
|
||||
);
|
||||
assert!(
|
||||
has_b,
|
||||
"--tag async must include b.md (async-tagged): paths={paths:?}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! inject spurious keys into the existing markdown corpus wire shape.
|
||||
|
||||
use kebab_core::{
|
||||
Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
|
||||
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
|
||||
SearchHit, WorkspacePath,
|
||||
};
|
||||
|
||||
|
||||
@@ -23,12 +23,10 @@ fn search_json_emits_search_response_v1_wrapper() {
|
||||
fs::write(workspace.join("a.md"), "# T\n\napples are red.\n").unwrap();
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "apples"],
|
||||
);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let (stdout, _stderr) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "apples"]);
|
||||
let v: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
assert_eq!(v["schema_version"], "search_response.v1");
|
||||
assert!(v["hits"].is_array(), "hits must be array, got {v}");
|
||||
assert!(
|
||||
@@ -67,8 +65,8 @@ fn search_json_truncates_with_max_tokens() {
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "--max-tokens", "30", "rust"],
|
||||
);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let v: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
assert_eq!(
|
||||
v["truncated"], true,
|
||||
"30-token cap must trip truncation: {v}"
|
||||
@@ -88,10 +86,8 @@ fn search_json_cursor_paginates() {
|
||||
}
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (page1, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "--k", "2", "rust"],
|
||||
);
|
||||
let (page1, _) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "2", "rust"]);
|
||||
let v1: Value = serde_json::from_str(page1.trim())
|
||||
.unwrap_or_else(|e| panic!("page1 not JSON: {page1:?}: {e}"));
|
||||
let cursor = v1["next_cursor"]
|
||||
@@ -101,14 +97,7 @@ fn search_json_cursor_paginates() {
|
||||
let (page2, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&[
|
||||
"--json",
|
||||
"--mode",
|
||||
"lexical",
|
||||
"--k",
|
||||
"2",
|
||||
"--cursor",
|
||||
cursor,
|
||||
"rust",
|
||||
"--json", "--mode", "lexical", "--k", "2", "--cursor", cursor, "rust",
|
||||
],
|
||||
);
|
||||
let v2: Value = serde_json::from_str(page2.trim())
|
||||
@@ -118,23 +107,13 @@ fn search_json_cursor_paginates() {
|
||||
.as_array()
|
||||
.expect("page1 hits array")
|
||||
.iter()
|
||||
.map(|h| {
|
||||
h["chunk_id"]
|
||||
.as_str()
|
||||
.expect("chunk_id string")
|
||||
.to_string()
|
||||
})
|
||||
.map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
|
||||
.collect();
|
||||
let p2_ids: Vec<String> = v2["hits"]
|
||||
.as_array()
|
||||
.expect("page2 hits array")
|
||||
.iter()
|
||||
.map(|h| {
|
||||
h["chunk_id"]
|
||||
.as_str()
|
||||
.expect("chunk_id string")
|
||||
.to_string()
|
||||
})
|
||||
.map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
|
||||
.collect();
|
||||
assert!(
|
||||
!p2_ids.is_empty(),
|
||||
@@ -161,10 +140,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
// Get a valid cursor first.
|
||||
let (page1_stdout, _) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--mode", "lexical", "--json", "--k", "1", "apples"],
|
||||
);
|
||||
let (page1_stdout, _) =
|
||||
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "--k", "1", "apples"]);
|
||||
let v1: Value = serde_json::from_str(page1_stdout.trim()).expect("json");
|
||||
let cursor = v1["next_cursor"]
|
||||
.as_str()
|
||||
@@ -181,16 +158,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
|
||||
let cfg_str = cfg.to_str().expect("utf8");
|
||||
let out = std::process::Command::new(exe)
|
||||
.args([
|
||||
"--config",
|
||||
cfg_str,
|
||||
"--json",
|
||||
"search",
|
||||
"--mode",
|
||||
"lexical",
|
||||
"--json",
|
||||
"--cursor",
|
||||
&cursor,
|
||||
"apples",
|
||||
"--config", cfg_str, "--json", "search", "--mode", "lexical", "--json", "--cursor",
|
||||
&cursor, "apples",
|
||||
])
|
||||
.output()
|
||||
.expect("kebab search --cursor");
|
||||
@@ -234,10 +203,8 @@ fn search_plain_emits_truncated_hint_to_stderr() {
|
||||
}
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (_stdout, stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--mode", "lexical", "--max-tokens", "30", "rust"],
|
||||
);
|
||||
let (_stdout, stderr) =
|
||||
common::run_search_with_args(&cfg, &["--mode", "lexical", "--max-tokens", "30", "rust"]);
|
||||
assert!(
|
||||
stderr.contains("[truncated;"),
|
||||
"stderr must carry truncated hint: {stderr:?}"
|
||||
@@ -254,10 +221,7 @@ fn search_plain_emits_short_query_hint_to_stderr() {
|
||||
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (_stdout, stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--mode", "lexical", "ab"],
|
||||
);
|
||||
let (_stdout, stderr) = common::run_search_with_args(&cfg, &["--mode", "lexical", "ab"]);
|
||||
assert!(
|
||||
stderr.contains("[hint]"),
|
||||
"stderr must carry short-query hint: {stderr:?}"
|
||||
@@ -278,18 +242,18 @@ fn search_json_emits_hint_field_for_short_query() {
|
||||
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "ab"],
|
||||
);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let (stdout, _stderr) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "ab"]);
|
||||
let v: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
assert!(
|
||||
v["hits"].as_array().unwrap().is_empty(),
|
||||
"empty hits expected for short query in empty KB: {v}"
|
||||
);
|
||||
assert_eq!(
|
||||
v["hint"].as_str().expect("hint field set on short empty result"),
|
||||
v["hint"]
|
||||
.as_str()
|
||||
.expect("hint field set on short empty result"),
|
||||
"3자 이상 키워드 권장 (trigram tokenizer 제약)",
|
||||
"hint must carry the standard advisory: {v}"
|
||||
);
|
||||
@@ -305,12 +269,10 @@ fn search_json_omits_hint_field_when_query_is_long_enough() {
|
||||
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--json", "--mode", "lexical", "abc"],
|
||||
);
|
||||
let v: Value = serde_json::from_str(stdout.trim())
|
||||
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
let (stdout, _stderr) =
|
||||
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "abc"]);
|
||||
let v: Value =
|
||||
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
|
||||
assert!(
|
||||
v.get("hint").is_none(),
|
||||
"hint must be absent for ≥3-char queries: {v}"
|
||||
|
||||
@@ -16,10 +16,8 @@ fn lexical_mode_hits_carry_bm25_score_kind() {
|
||||
doc_with_term(&workspace);
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--mode", "lexical", "--json", "rust"],
|
||||
);
|
||||
let (stdout, _stderr) =
|
||||
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
|
||||
let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
|
||||
let hits = v["hits"].as_array().expect("hits array");
|
||||
assert!(!hits.is_empty(), "expected at least 1 hit");
|
||||
@@ -40,10 +38,8 @@ fn old_wire_reader_compat_score_kind_optional_field() {
|
||||
doc_with_term(&workspace);
|
||||
common::ingest(&cfg, &workspace);
|
||||
|
||||
let (stdout, _stderr) = common::run_search_with_args(
|
||||
&cfg,
|
||||
&["--mode", "lexical", "--json", "rust"],
|
||||
);
|
||||
let (stdout, _stderr) =
|
||||
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
|
||||
let v: Value = serde_json::from_str(stdout.trim()).unwrap();
|
||||
let hit = &v["hits"][0];
|
||||
assert!(hit.get("score_kind").is_some(), "score_kind always emitted");
|
||||
|
||||
@@ -59,15 +59,14 @@ fn search_json_includes_indexed_at_and_stale() {
|
||||
.get("hits")
|
||||
.and_then(|h| h.as_array())
|
||||
.unwrap_or_else(|| panic!("expected hits array, got {stdout}"));
|
||||
let first = arr.first().unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
|
||||
let first = arr
|
||||
.first()
|
||||
.unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
|
||||
assert!(
|
||||
first.get("indexed_at").is_some(),
|
||||
"missing indexed_at in {first}"
|
||||
);
|
||||
assert!(
|
||||
first.get("stale").is_some(),
|
||||
"missing stale in {first}"
|
||||
);
|
||||
assert!(first.get("stale").is_some(), "missing stale in {first}");
|
||||
assert_eq!(
|
||||
first["stale"], false,
|
||||
"freshly ingested doc must not be stale at default 30d threshold"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user