Merge pull request 'feat(pdf): scanned PDF OCR via qwen2.5vl:3b vision LLM (v0.20.0 sub-item 1)' (#189) from feat/pdf-scanned-ocr into main

Reviewed-on: #189
This commit was merged in pull request #189.
This commit is contained in:
2026-05-28 04:37:41 +00:00
280 changed files with 18330 additions and 3897 deletions

47
Cargo.lock generated
View File

@@ -4127,7 +4127,7 @@ dependencies = [
[[package]]
name = "kebab-app"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -4166,12 +4166,13 @@ dependencies = [
"tracing-appender",
"tracing-subscriber",
"unicode-normalization",
"uuid",
"wiremock",
]
[[package]]
name = "kebab-chunk"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4187,7 +4188,7 @@ dependencies = [
[[package]]
name = "kebab-cli"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"clap",
@@ -4208,7 +4209,7 @@ dependencies = [
[[package]]
name = "kebab-config"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"dirs 5.0.1",
@@ -4223,7 +4224,7 @@ dependencies = [
[[package]]
name = "kebab-core"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4237,7 +4238,7 @@ dependencies = [
[[package]]
name = "kebab-embed"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4251,7 +4252,7 @@ dependencies = [
[[package]]
name = "kebab-embed-local"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"fastembed",
@@ -4264,7 +4265,7 @@ dependencies = [
[[package]]
name = "kebab-eval"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4283,7 +4284,7 @@ dependencies = [
[[package]]
name = "kebab-llm"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -4292,7 +4293,7 @@ dependencies = [
[[package]]
name = "kebab-llm-local"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"kebab-config",
@@ -4309,7 +4310,7 @@ dependencies = [
[[package]]
name = "kebab-mcp"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"kebab-app",
@@ -4327,7 +4328,7 @@ dependencies = [
[[package]]
name = "kebab-nli"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"hf-hub",
@@ -4342,7 +4343,7 @@ dependencies = [
[[package]]
name = "kebab-parse-code"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"gix",
@@ -4365,7 +4366,7 @@ dependencies = [
[[package]]
name = "kebab-parse-image"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"ab_glyph",
"anyhow",
@@ -4389,7 +4390,7 @@ dependencies = [
[[package]]
name = "kebab-parse-md"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"kebab-core",
@@ -4406,20 +4407,22 @@ dependencies = [
[[package]]
name = "kebab-parse-pdf"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
"kebab-core",
"kebab-parse-image",
"lopdf",
"serde_json",
"strsim",
"time",
"tracing",
]
[[package]]
name = "kebab-rag"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4441,7 +4444,7 @@ dependencies = [
[[package]]
name = "kebab-search"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"globset",
@@ -4460,7 +4463,7 @@ dependencies = [
[[package]]
name = "kebab-source-fs"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4478,7 +4481,7 @@ dependencies = [
[[package]]
name = "kebab-store-sqlite"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"blake3",
@@ -4498,7 +4501,7 @@ dependencies = [
[[package]]
name = "kebab-store-vector"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"arrow",
@@ -4522,7 +4525,7 @@ dependencies = [
[[package]]
name = "kebab-tui"
version = "0.19.0"
version = "0.20.0"
dependencies = [
"anyhow",
"crossterm",

View File

@@ -30,7 +30,7 @@ edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
version = "0.19.0"
version = "0.20.0" # v0.20.0 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b) — CLAUDE.md §Release 사용자 도그푸딩 트리거
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
# intentional allow-list. The allowed lints are either cosmetic (doc style),
@@ -141,6 +141,7 @@ proptest = "1"
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
lru = "0.12"
lopdf = "0.32"
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
# in its default set (which also pulls `hf-hub` for first-run model
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release

View File

@@ -17,7 +17,7 @@ P0P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체
| **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
| **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
| **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
| **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
| **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** |
@@ -32,6 +32,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
- **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
- **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 에 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
- **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).

View File

@@ -192,7 +192,7 @@ flowchart TB
## Configuration
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절.
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[pdf.ocr]`, `[search]`, `[rag]`, `[ui]` 절.
- `[models.embedding]` —
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
@@ -211,6 +211,29 @@ flowchart TB
config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.
### `[pdf.ocr]` — scanned PDF OCR (v0.20.0+)
embedded text 가 없는 scanned PDF (책 스캔, 영수증, 카메라 page 등) 의 OCR 활성화. **default off (opt-in)** — OCR 한 page 당 ~45-100s (qwen2.5vl:3b on CPU) 의 cost 때문에 책 / 논문 archive 등 명시적 KB 에만 활성화.
```toml
[pdf.ocr]
enabled = false # opt-in: 책 / 논문 archive KB 에서 true
always_on = false # true 시 vector PDF page 도 dual-block OCR (confidence boost)
engine = "ollama-vision"
model = "qwen2.5vl:3b" # PoC alnum 94.79% page1 / 81.56% 받침 (vs gemma4:e4b 의 27%)
# endpoint = "http://localhost:11434" # 미명시 시 models.llm.endpoint fallback
languages = ["eng", "kor"]
max_pixels = 2048
request_timeout_secs = 600
valid_ratio_threshold = 0.5 # text-detect threshold — mojibake / scanned 판정 boundary
min_char_count = 20
lang_hint = "kor"
```
env override: `KEBAB_PDF_OCR_*` 11 변수 (예: `KEBAB_PDF_OCR_ENABLED=true kebab ingest`).
**v0.20 업그레이드 후 주의**: v0.19 에서 scanned PDF 가 빈 block + warning 으로 indexed 된 경우, 업그레이드만으로는 OCR 이 자동으로 재실행되지 않음 (parser_version `"pdf-text-v1"` 보존). 명시적 재처리: `kebab ingest --force-reingest`.
## 외부 AI 통합
`--json` 출력 + frozen wire schema v1 가 stable contract. 통합 옵션:

View File

@@ -35,6 +35,7 @@ kebab-parse-image = { path = "../kebab-parse-image" }
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
lopdf = { workspace = true }
# p10-1A-2: Rust AST extractor lives here. App threads it into the
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
@@ -44,6 +45,7 @@ blake3 = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
uuid = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
tracing-appender = "0.2"
@@ -75,7 +77,7 @@ image = { version = "0.25", default-features = false, features =
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
# to the same major (0.32) so byte output is identical between the two
# fixture surfaces.
lopdf = "0.32"
lopdf = { workspace = true }
# error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
# reqwest::Error (private constructor) — built from a connect-refused call.
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }

View File

@@ -46,9 +46,8 @@ use kebab_core::{
use kebab_embed_local::FastembedEmbedder;
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_code::{
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor,
JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor,
TypescriptAstExtractor,
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor,
};
use kebab_parse_image::ImageExtractor;
use kebab_parse_pdf::PdfTextExtractor;
@@ -242,15 +241,15 @@ impl App {
// kebab-nli construction. Failure (`?`) surfaces as a user-
// facing error at App boot — never a panic in the pipeline's
// `expect("verifier must be Some when nli_threshold > 0.0")`.
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> =
if config.rag.nli_threshold > 0.0 {
let v = kebab_nli::OnnxNliVerifier::new(&config).context(
"kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)",
)?;
Some(Arc::new(v))
} else {
None
};
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> = if config.rag.nli_threshold
> 0.0
{
let v = kebab_nli::OnnxNliVerifier::new(&config)
.context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?;
Some(Arc::new(v))
} else {
None
};
Ok(Self {
config,
sqlite: Arc::new(sqlite),
@@ -350,7 +349,9 @@ impl App {
// so other in-flight searches can use the cache concurrently.
drop(guard);
let hits = self.search_uncached(query)?;
let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
let mut guard = cache
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);
guard.put(key, hits.clone());
Ok(hits)
}
@@ -430,11 +431,7 @@ impl App {
///
/// `SearchResponse.next_cursor` and `truncated` are independent
/// signals — see `SearchResponse` doc for details.
pub fn search_with_opts(
&self,
query: SearchQuery,
opts: SearchOpts,
) -> Result<SearchResponse> {
pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result<SearchResponse> {
use crate::cursor;
let corpus_revision = self.sqlite.corpus_revision().to_string();
@@ -519,8 +516,7 @@ impl App {
// Apply offset + k_effective truncation (mirrors non-trace path).
let drop_n = offset.min(traced_hits.len());
traced_hits.drain(..drop_n);
let mut hits: Vec<SearchHit> =
traced_hits.into_iter().take(k_effective).collect();
let mut hits: Vec<SearchHit> = traced_hits.into_iter().take(k_effective).collect();
// Snippet truncation if opts.snippet_chars set (mirror non-trace path).
if opts.snippet_chars.is_some() {
@@ -551,8 +547,7 @@ impl App {
// Skip offset.
let drop_n = offset.min(all_hits.len());
all_hits.drain(..drop_n);
let mut hits: Vec<SearchHit> =
all_hits.into_iter().take(k_effective).collect();
let mut hits: Vec<SearchHit> = all_hits.into_iter().take(k_effective).collect();
// Apply snippet_chars override if shorter than what the
// retriever returned (retriever already honored
@@ -573,15 +568,11 @@ impl App {
// Step 1: shorten snippets progressively to a 60-char floor.
const SNIPPET_FLOOR: usize = 60;
let mut current_snippet_cap = snippet_chars;
while estimate_chars(&hits) > max_chars
&& current_snippet_cap > SNIPPET_FLOOR
{
current_snippet_cap =
(current_snippet_cap / 2).max(SNIPPET_FLOOR);
while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR {
current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR);
for h in &mut hits {
if h.snippet.chars().count() > current_snippet_cap {
h.snippet =
trim_to_chars(&h.snippet, current_snippet_cap);
h.snippet = trim_to_chars(&h.snippet, current_snippet_cap);
truncated = true;
}
}
@@ -651,8 +642,7 @@ impl App {
retriever: Arc<dyn Retriever>,
llm: Arc<dyn LanguageModel>,
) -> RagPipeline {
let pipeline =
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
match &self.pipeline_verifier {
Some(v) => pipeline.with_verifier(v.clone()),
None => pipeline,
@@ -723,12 +713,7 @@ impl App {
/// returns; on persistence error, the answer is still returned
/// (don't lose the user's compute) but the error is logged so
/// the operator notices.
pub fn ask_with_session(
&self,
session_id: &str,
query: &str,
opts: AskOpts,
) -> Result<Answer> {
pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result<Answer> {
use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow};
use std::time::{SystemTime, UNIX_EPOCH};
@@ -766,13 +751,8 @@ impl App {
let retriever = self.build_retriever(opts.mode)?;
let llm = self.llm()?;
let pipeline = self.build_pipeline(retriever, llm);
let answer = pipeline.ask_with_history(
query,
history,
session_id.to_string(),
next_index,
opts,
)?;
let answer =
pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?;
// Auto-create the session header on first use. Title from
// the first question (≤40 chars after trim).
@@ -813,7 +793,8 @@ impl App {
turn_index: next_index,
question: query.to_string(),
answer: answer.answer.clone(),
citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()),
citations_json: serde_json::to_string(&answer.citations)
.unwrap_or_else(|_| "[]".to_string()),
created_at: now_unix,
};
if let Err(e) = self.sqlite.append_turn(&turn_row) {
@@ -848,8 +829,7 @@ impl App {
return Ok(Some(e.clone()));
}
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
FastembedEmbedder::new(&self.config)
.context("kb-app: load FastembedEmbedder")?,
FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?,
);
// `set` returns Err if another thread won the race; in that case
// the loser still returns the (now-cached) winner via `get()`.
@@ -925,7 +905,9 @@ impl App {
/// clear` admin command). No-op when the cache is disabled.
pub fn clear_search_cache(&self) {
if let Some(cache) = self.search_cache.as_ref() {
let mut guard = cache.lock().unwrap_or_else(std::sync::PoisonError::into_inner);
let mut guard = cache
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);
guard.clear();
}
}
@@ -946,8 +928,8 @@ impl App {
/// git tree) correctly keep `repo: None` — `Metadata.repo` is already
/// `None` for those, so the assignment is a no-op.
fn backfill_repo(&self, hits: &mut [SearchHit]) {
use std::collections::HashMap;
use kebab_core::DocumentId;
use std::collections::HashMap;
// doc_id → Option<String> where None means "not found / no repo"
let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
@@ -956,26 +938,24 @@ impl App {
if hit.repo.is_some() {
continue;
}
let repo_val = cache
.entry(hit.doc_id.clone())
.or_insert_with(|| {
// Deliberately non-aborting: a failed store lookup for
// one hit must not abort the whole search response. Log
// the error so it's observable rather than silently
// dropped (review #140 round 1).
match self.sqlite.get_document(&hit.doc_id) {
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
Err(e) => {
tracing::warn!(
target: "kebab-app",
doc_id = %hit.doc_id,
error = %e,
"backfill_repo: get_document failed; leaving hit.repo = None"
);
None
}
let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| {
// Deliberately non-aborting: a failed store lookup for
// one hit must not abort the whole search response. Log
// the error so it's observable rather than silently
// dropped (review #140 round 1).
match self.sqlite.get_document(&hit.doc_id) {
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
Err(e) => {
tracing::warn!(
target: "kebab-app",
doc_id = %hit.doc_id,
error = %e,
"backfill_repo: get_document failed; leaving hit.repo = None"
);
None
}
});
}
});
if let Some(r) = repo_val {
hit.repo = Some(r.clone());
}
@@ -986,10 +966,7 @@ impl App {
/// "switch to --mode lexical" error when embeddings are disabled.
fn require_embeddings(
&self,
) -> Result<(
Arc<dyn Embedder + Send + Sync>,
Arc<LanceVectorStore>,
)> {
) -> Result<(Arc<dyn Embedder + Send + Sync>, Arc<LanceVectorStore>)> {
let emb = self.embedder()?.ok_or_else(|| {
anyhow!(
"embeddings disabled (config.models.embedding.provider == \"none\" \
@@ -1278,8 +1255,8 @@ mod tests_extractor_dispatch {
MediaType::Code("kotlin".into()),
MediaType::Code("c".into()),
MediaType::Code("cpp".into()),
MediaType::Code("yaml".into()), // registry NOT cover
MediaType::Code("shell".into()), // registry NOT cover
MediaType::Code("yaml".into()), // registry NOT cover
MediaType::Code("shell".into()), // registry NOT cover
MediaType::Audio(AudioType::Wav), // registry NOT cover
];
for sample in &samples {

View File

@@ -215,7 +215,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
.and_then(serde_json::Value::as_u64)
.map(|n| n as usize),
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
trace: obj.get("trace").and_then(serde_json::Value::as_bool).unwrap_or(false),
trace: obj
.get("trace")
.and_then(serde_json::Value::as_bool)
.unwrap_or(false),
};
Ok((

View File

@@ -10,6 +10,6 @@
pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
pub use kebab_llm_local::LlmError;
pub use kebab_config::ConfigInvalid;
pub use kebab_store_sqlite::NotIndexed;

View File

@@ -9,7 +9,7 @@
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};
// p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
// and surfaced through `StructuredError` (an anyhow-friendly wrapper
@@ -65,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
hint: Some("check `--config <path>` and TOML syntax".to_string()),
};
}
if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
return ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "config_not_found".to_string(),
message: s.to_string(),
details: json!({
"path": s.path.to_string_lossy(),
}),
hint: Some(
"verify --config <path>; pass an existing toml file or omit --config to use XDG default"
.to_string(),
),
};
}
if let Some(s) = err.downcast_ref::<NotIndexed>() {
return ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
@@ -158,7 +172,10 @@ mod tests {
});
let v1 = classify(&err, false);
assert_eq!(v1.code, "config_invalid");
assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
assert_eq!(
v1.details.get("path").and_then(|p| p.as_str()),
Some("/tmp/x.toml")
);
assert!(v1.hint.is_some());
}
@@ -182,7 +199,8 @@ mod tests {
// the resulting LlmError::Unreachable maps to "model_unreachable".
let client = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_millis(500))
.build().unwrap();
.build()
.unwrap();
let err = client.get("http://127.0.0.1:1").send().unwrap_err();
let llm = LlmError::Unreachable {
endpoint: "http://127.0.0.1:1".to_string(),
@@ -198,7 +216,10 @@ mod tests {
let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
let v1 = classify(&anyhow::Error::new(llm), false);
assert_eq!(v1.code, "model_not_pulled");
assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
assert_eq!(
v1.details.get("model").and_then(|p| p.as_str()),
Some("gemma4:e4b")
);
}
#[test]
@@ -235,7 +256,10 @@ mod tests {
// (single source of truth). classify must not pattern-match on
// anyhow string contents — that would create two sources of
// truth. The bare anyhow string falls through to "generic".
assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string");
assert_ne!(
v1.code, "stale_cursor",
"classify must not produce stale_cursor from bare anyhow string"
);
}
#[test]

View File

@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
} else {
String::new()
};
let already = existing
.lines()
.any(|line| line.trim() == KEBABIGNORE_LINE);
let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
if already {
return Ok(());
}
@@ -57,11 +55,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
/// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
/// destination file already exists with the expected hash, the existing
/// file is reused (no second write). Returns the destination path.
pub fn copy_to_external(
external_dir: &Path,
bytes: &[u8],
ext: &str,
) -> Result<PathBuf> {
pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
let hash = blake3::hash(bytes);
let hex = hash.to_hex();
let prefix = &hex.as_str()[..12];
@@ -82,11 +76,7 @@ pub fn copy_to_external(
/// Internal `yaml_quote` always uses double-quoted YAML form with backslash
/// escapes for `"` / `\` / control chars — agent-supplied titles with
/// special characters are safe.
pub fn inject_frontmatter(
body: &str,
title: &str,
source_uri: Option<&str>,
) -> Result<String> {
pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
let head = body.trim_start();
if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
anyhow::bail!(

View File

@@ -50,14 +50,14 @@ impl App {
fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
.ok_or_else(|| {
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "chunk_not_found".to_string(),
message: format!("chunk_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "chunk_not_found".to_string(),
message: format!("chunk_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
let doc_id = target.doc_id.clone();
let doc =
@@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
.ok_or_else(|| {
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "doc_not_found".to_string(),
message: format!("doc_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "doc_not_found".to_string(),
message: format!("doc_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
let mut text = fmt_canonical_to_markdown(&doc);
let mut truncated = false;
@@ -176,14 +176,14 @@ fn fetch_span(
) -> Result<FetchResult> {
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
.ok_or_else(|| {
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "doc_not_found".to_string(),
message: format!("doc_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
anyhow::Error::new(StructuredError(ErrorV1 {
schema_version: ERROR_V1_ID.to_string(),
code: "doc_not_found".to_string(),
message: format!("doc_id '{}' not found", id.0),
details: serde_json::Value::Null,
hint: None,
}))
})?;
// Reject line-incompatible media types (PDF / audio). `SourceType`
// (markdown / note / paper / reference / inbox) is the *user-facing*

View File

@@ -0,0 +1,328 @@
//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
//!
//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
//! `config.logging.ingest_log_dir`. Records are appended line by line; the
//! last record is always `kind="summary"`. `IngestLogWriter::open` returns
//! `Ok(None)` when `ingest_log_enabled = false` so callers need not branch.
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
/// Append-only writer for one ingest run's ndjson log file.
///
/// Constructed through `IngestLogWriter::open`. Output goes through a
/// `BufWriter`, so records are not guaranteed to be on disk until `flush`
/// is called; the `Drop` impl only makes a best-effort flush and discards
/// any error.
pub struct IngestLogWriter {
/// Buffered handle to the log file created by `open`.
file: BufWriter<File>,
/// Full path of the log file (`<log_dir>/ingest-{run_id}.ndjson`).
path: PathBuf,
/// Identifier of this run, produced by `generate_run_id`; also embedded in the file name.
run_id: String,
/// Wall-clock time captured when the writer was opened.
started_at: SystemTime,
}
impl IngestLogWriter {
/// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
if !cfg.ingest_log_enabled {
return Ok(None);
}
let run_id = generate_run_id();
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
std::fs::create_dir_all(&log_dir)?;
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
let file = BufWriter::new(File::create(&path)?);
Ok(Some(Self {
file,
path,
run_id,
started_at: SystemTime::now(),
}))
}
pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
serde_json::to_writer(&mut self.file, event)?;
writeln!(self.file)?;
Ok(())
}
pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
serde_json::to_writer(&mut self.file, summary)?;
writeln!(self.file)?;
Ok(())
}
pub fn flush(&mut self) -> anyhow::Result<()> {
self.file.flush()?;
Ok(())
}
pub fn run_id(&self) -> &str {
&self.run_id
}
pub fn path(&self) -> &Path {
&self.path
}
pub fn started_at(&self) -> SystemTime {
self.started_at
}
}
impl Drop for IngestLogWriter {
    /// Best-effort flush when the writer goes out of scope.
    fn drop(&mut self) {
        // A destructor cannot report failure, so a flush error is discarded here.
        self.file.flush().ok();
    }
}
/// Build the identifier for one ingest run.
///
/// Shape: compact ISO 8601 UTC timestamp, a dash, then the last 8 hex
/// characters of a freshly generated uuid v7 — e.g. `20260528T013000Z-abc123de`.
/// uuid is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
fn generate_run_id() -> String {
    use time::macros::format_description;

    let stamp_format = format_description!("[year][month][day]T[hour][minute][second]Z");
    let stamp = match time::OffsetDateTime::now_utc().format(stamp_format) {
        Ok(text) => text,
        // A formatting failure falls back to the epoch instead of failing the run.
        Err(_) => "19700101T000000Z".to_string(),
    };
    let hex = uuid::Uuid::now_v7().simple().to_string();
    let (_, tail) = hex.split_at(hex.len() - 8);
    format!("{stamp}-{tail}")
}
/// Resolve the `{state_dir}` placeholder in a configured log directory to the
/// XDG state dir (spec §6 R-3).
///
/// A path without the placeholder is returned unchanged. When the placeholder
/// is present the path goes through a lossy UTF-8 conversion before
/// substitution.
///
/// NOTE(review): tilde/env expansion is said to be delegated to
/// `kebab_config::expand_path`, but this function does not call it — confirm
/// that the config layer applies it before the path reaches this point.
fn expand_log_dir(path: &Path) -> PathBuf {
    let raw = path.to_string_lossy();
    if !raw.contains("{state_dir}") {
        return path.to_path_buf();
    }
    let state_dir = kebab_config::Config::xdg_state_dir();
    let substituted = raw.replace("{state_dir}", &state_dir.to_string_lossy());
    PathBuf::from(substituted)
}
/// Current UTC time rendered as an RFC 3339 string, for the `ts` field of log records.
///
/// Falls back to the Unix epoch literal if formatting fails.
#[allow(dead_code)]
pub(crate) fn now_ts() -> String {
    let now = time::OffsetDateTime::now_utc();
    match now.format(&Rfc3339) {
        Ok(stamp) => stamp,
        Err(_) => "1970-01-01T00:00:00Z".to_string(),
    }
}
/// Ingest event record (ndjson line). `kind` is the discriminator.
///
/// Serialized as an internally tagged object: the variant name, converted to
/// snake_case, is stored under the `"kind"` key (`ocr`, `parse_error`, `skip`,
/// `error`) next to the variant's own fields. String fields borrow from the
/// caller for lifetime `'a`, so building an event does not copy them.
#[derive(Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum LogEvent<'a> {
/// Outcome of one OCR attempt on a single page.
Ocr {
// Timestamp of the record — presumably RFC 3339 as produced by `now_ts`; confirm at call sites.
ts: String,
// Path of the document the page belongs to.
doc_path: &'a str,
// Page number — NOTE(review): 0- or 1-based is not visible here; confirm with the producer.
page: u32,
// Size / dimensions of the page image handed to OCR; `None` when unknown.
image_byte_size: Option<u64>,
image_width: Option<u32>,
image_height: Option<u32>,
// Presumably elapsed OCR time in milliseconds (matches the `ocr_*_ms` summary fields) — confirm.
ms: u64,
// Presumably the number of characters OCR produced — confirm.
chars: u32,
// Whether the OCR attempt succeeded.
success: bool,
// Optional reason string; presumably set on failure — confirm.
reason: Option<&'a str>,
// Name of the OCR engine that handled the page.
ocr_engine: &'a str,
},
/// A document that could not be parsed.
ParseError {
ts: String,
doc_path: &'a str,
reason: &'a str,
message: &'a str,
},
/// A document that was skipped, with the reason and optional extra detail.
Skip {
ts: String,
doc_path: &'a str,
reason: &'a str,
detail: Option<&'a str>,
},
/// An error not tied to a specific document path (no `doc_path` field).
Error {
ts: String,
code: &'a str,
message: &'a str,
},
}
/// Final summary record — always the last line of the log file.
/// Explicit `kind` field serializes to `"kind": "summary"`.
///
/// Unlike `LogEvent`, the discriminator here is an ordinary `String` field;
/// `IngestSummary::new` sets it to `"summary"`.
#[derive(Serialize, Deserialize)]
pub struct IngestSummary {
/// Record discriminator; `"summary"` when built through `new`.
pub kind: String,
/// Timestamp string of the summary record.
pub ts: String,
/// Identifier of the ingest run this summary belongs to.
pub run_id: String,
/// Count of scanned items for the run.
pub scanned: u32,
/// Count of new items for the run.
pub new: u32,
/// Count of errors for the run.
pub errors: u32,
/// Count of OCR pages for the run.
pub ocr_pages: u32,
/// Count of OCR failures for the run.
pub ocr_failures: u32,
/// p50 of the OCR millisecond samples; `None` when there were no samples.
pub ocr_p50_ms: Option<u64>,
/// p90 of the OCR millisecond samples; `None` when there were no samples.
pub ocr_p90_ms: Option<u64>,
/// Largest OCR millisecond sample; `None` when there were no samples.
pub ocr_max_ms: Option<u64>,
/// Duration of the run in milliseconds.
pub duration_ms: u64,
}
impl IngestSummary {
    /// Assembles a summary record from the run's aggregate counters.
    ///
    /// `kind` is fixed to `"summary"`. The raw `ocr_ms_samples` are not stored;
    /// they are reduced through `percentiles` into the `ocr_p50_ms`,
    /// `ocr_p90_ms` and `ocr_max_ms` fields (all `None` for an empty slice).
    /// Every other argument is copied into the field of the same name.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        ts: String,
        run_id: String,
        scanned: u32,
        new: u32,
        errors: u32,
        ocr_pages: u32,
        ocr_failures: u32,
        ocr_ms_samples: &[u64],
        duration_ms: u64,
    ) -> Self {
        let (ocr_p50_ms, ocr_p90_ms, ocr_max_ms) = percentiles(ocr_ms_samples);
        let kind = String::from("summary");

        Self {
            kind,
            ts,
            run_id,
            scanned,
            new,
            errors,
            ocr_pages,
            ocr_failures,
            ocr_p50_ms,
            ocr_p90_ms,
            ocr_max_ms,
            duration_ms,
        }
    }
}
/// Computes `(p50, p90, max)` over `samples` without mutating the input.
///
/// The samples are copied and sorted ascending; the percentile at `pct` is the
/// element at index `len * pct / 100` (integer division), and `max` is the
/// last element. An empty slice yields `(None, None, None)`.
pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>) {
    let mut ordered: Vec<u64> = samples.to_vec();
    ordered.sort_unstable();
    let count = ordered.len();

    // For a non-empty slice `count * pct / 100 < count` whenever `pct < 100`,
    // so the lookup always lands on an element.
    let at = |pct: usize| ordered.get(count * pct / 100).copied();

    match ordered.last().copied() {
        None => (None, None, None),
        Some(peak) => (at(50), at(90), Some(peak)),
    }
}
// Unit tests for the ingest log writer: run-id format, `{state_dir}`
// expansion, the disabled-writer path, ndjson line output, and flush-on-drop.
#[cfg(test)]
mod tests {
use super::*;
use kebab_config::LoggingCfg;
use tempfile::TempDir;
// The run id must be a 16-char timestamp prefix, a dash, and 8 hex chars.
#[test]
fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
let id = generate_run_id();
// Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
assert_eq!(id.len(), 25, "run_id len should be 25: {id}");
let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");
assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");
assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");
assert!(
suffix.chars().all(|c| c.is_ascii_hexdigit()),
"suffix should be hex: {suffix}"
);
}
// `{state_dir}/logs` must expand to `<xdg state dir>/logs` with no placeholder left.
#[test]
fn expand_log_dir_substitutes_state_dir_placeholder() {
let input = PathBuf::from("{state_dir}/logs");
let expanded = expand_log_dir(&input);
let expected = kebab_config::Config::xdg_state_dir().join("logs");
assert_eq!(expanded, expected);
assert!(!expanded.to_string_lossy().contains("{state_dir}"));
}
// With `ingest_log_enabled: false`, `open` succeeds but yields no writer.
#[test]
fn writer_disabled_returns_none() {
let cfg = LoggingCfg {
ingest_log_enabled: false,
ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
};
let result = IngestLogWriter::open(&cfg).expect("open should not error");
assert!(result.is_none(), "disabled writer should return None");
}
// Three events written then flushed must produce three JSON-object lines,
// each containing a `"kind"` key.
#[test]
fn writer_writes_one_event_per_line_with_kind_discriminator() {
let tmp = TempDir::new().unwrap();
let cfg = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: tmp.path().to_path_buf(),
};
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
// Copy the path out before writing so the file can be read back afterwards.
let path = writer.path().to_path_buf();
writer
.write_event(&LogEvent::Skip {
ts: now_ts(),
doc_path: "a.zip",
reason: "builtin_blacklist",
detail: Some(".zip extension"),
})
.unwrap();
writer
.write_event(&LogEvent::Error {
ts: now_ts(),
code: "ingest_fatal",
message: "something bad",
})
.unwrap();
writer
.write_event(&LogEvent::ParseError {
ts: now_ts(),
doc_path: "weird.pdf",
reason: "lopdf_error",
message: "unexpected EOF",
})
.unwrap();
writer.flush().unwrap();
let contents = std::fs::read_to_string(&path).unwrap();
let lines: Vec<&str> = contents.lines().collect();
assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
for line in &lines {
assert!(
line.starts_with('{'),
"each line should be JSON object: {line}"
);
assert!(
line.contains("\"kind\""),
"each line should have 'kind': {line}"
);
}
}
// Dropping the writer without calling `flush` must still leave the event on disk.
#[test]
fn drop_flushes_pending_buffer() {
let tmp = TempDir::new().unwrap();
let cfg = LoggingCfg {
ingest_log_enabled: true,
ingest_log_dir: tmp.path().to_path_buf(),
};
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
let path = writer.path().to_path_buf();
writer
.write_event(&LogEvent::Error {
ts: now_ts(),
code: "test",
message: "drop flush test",
})
.unwrap();
// Drop without explicit flush — Drop impl should flush BufWriter.
drop(writer);
let contents = std::fs::read_to_string(&path).unwrap();
assert!(
contents.lines().count() >= 1,
"file should have at least 1 line after drop"
);
}
}

View File

@@ -46,10 +46,13 @@ pub struct AggregateCounts {
/// Ordering invariant per design §2.4a:
///
/// ```text
/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
/// < (Completed | Aborted)
/// ScanStarted < ScanCompleted
/// < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)*
/// < (Completed | Aborted)
/// ```
///
/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1).
///
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
/// in §2.4a) are reserved for a future iteration and are not emitted
/// by this task; the spec calls them out as "임의 위치" (optional).
@@ -85,6 +88,30 @@ pub enum IngestEvent {
/// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
/// task never produces `Aborted`.
Aborted { counts: AggregateCounts },
/// Emitted when OCR starts for a PDF page. v0.20.0 sub-item 1.
PdfOcrStarted { page: u32 },
/// Emitted when OCR finishes for a PDF page. v0.20.0 sub-item 1.
/// `skipped` = `true` means OCR was not performed (no DCTDecode image, or the engine failed).
/// `chars = 0` alone cannot distinguish a skip from a 0-char OCR result; the `skipped` field makes it explicit.
PdfOcrFinished {
page: u32,
ms: u64,
chars: u32,
ocr_engine: String,
skipped: bool,
/// v0.20.x ingest log: raster image byte size (additive minor, optional).
#[serde(skip_serializing_if = "Option::is_none")]
image_byte_size: Option<u64>,
/// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
#[serde(skip_serializing_if = "Option::is_none")]
image_width: Option<u32>,
/// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
#[serde(skip_serializing_if = "Option::is_none")]
image_height: Option<u32>,
/// v0.20.x ingest log: OCR failure reason (additive minor, optional).
#[serde(skip_serializing_if = "Option::is_none")]
failure_reason: Option<String>,
},
}
/// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
@@ -118,10 +145,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
/// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
/// is silently absorbed — the ingest hot path must not stall on a slow
/// consumer. Logged at `trace` for diagnostics.
pub(crate) fn emit(
progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
event: IngestEvent,
) {
pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
if let Some(tx) = progress {
if tx.send(event).is_err() {
tracing::trace!(
@@ -165,7 +189,10 @@ mod tests {
media: "markdown".into(),
};
let v = serde_json::to_value(&ev).unwrap();
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
assert_eq!(
v.get("kind").and_then(|s| s.as_str()),
Some("asset_started")
);
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
@@ -184,8 +211,14 @@ mod tests {
let v = serde_json::to_value(&ev).unwrap();
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
let counts = v.get("counts").unwrap();
assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(5));
assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
assert_eq!(
counts.get("scanned").and_then(serde_json::Value::as_u64),
Some(5)
);
assert_eq!(
counts.get("new").and_then(serde_json::Value::as_u64),
Some(2)
);
}
#[test]

View File

@@ -34,21 +34,25 @@
//! still allowing the cross-crate calls.
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use anyhow::{Context, anyhow};
use serde::{Deserialize, Serialize};
use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker};
use kebab_chunk::{
CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker,
CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker,
CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker,
K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker,
};
use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, ExtractContext, IngestReport, Lang, LanguageModel, MediaType,
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
SourceUri, VectorRecord, VectorStore,
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, Chunker, ChunkerVersion,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind,
ExtractContext, IngestReport, Lang, LanguageModel, MediaType, ParserVersion, RawAsset,
SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
use kebab_parse_image::{OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_image::{OcrEngine, OllamaVisionOcr, apply_caption, apply_ocr};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
use kebab_source_fs::FsSourceConnector;
@@ -60,20 +64,26 @@ pub mod error_signal;
pub mod error_wire;
pub mod external;
pub mod fetch;
pub mod ingest_log;
pub mod ingest_progress;
pub mod logging;
pub mod pdf_ocr_apply;
pub mod reset;
pub mod schema;
mod staleness;
pub use app::{App, SearchResponse, short_query_hint};
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
pub use reset::{ResetReport, ResetScope, enumerate_orphans};
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
pub use fetch::fetch_with_config;
#[doc(hidden)]
pub use bulk::{BULK_QUERIES_MAX, bulk_search_with_config};
pub use schema::{Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config};
pub use error_wire::{ERROR_V1_ID, ErrorV1, StructuredError, classify};
pub use fetch::fetch_with_config;
pub use ingest_log::{IngestLogWriter, IngestSummary, LogEvent};
pub use ingest_progress::{AggregateCounts, IngestEvent, render_skipped_breakdown};
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
pub use reset::{ResetReport, ResetScope, enumerate_orphans};
pub use schema::{
Capabilities, Models, SCHEMA_V1_ID, SchemaV1, Stats, WireBlock, schema_with_config,
};
pub use staleness::{compute_stale, mark_stale_in_place};
/// p9-fb-25: sentinel for files without an extension in
@@ -293,6 +303,24 @@ pub fn ingest_with_config_opts(
let app = App::open_with_config(config)?;
// v0.20.x Hook 1: init per-run log writer (None when disabled or on open failure).
let log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>> =
match crate::ingest_log::IngestLogWriter::open(&app.config.logging) {
Ok(Some(w)) => Some(Arc::new(Mutex::new(w))),
Ok(None) => None,
Err(e) => {
tracing::warn!(
target: "kebab-app",
error = %e,
"ingest_log: failed to open log file; logging disabled for this run"
);
None
}
};
let ocr_ms_samples: Arc<Mutex<Vec<u64>>> = Arc::new(Mutex::new(Vec::new()));
let ocr_pages_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
let ocr_failures_cnt: Arc<Mutex<u32>> = Arc::new(Mutex::new(0u32));
// Walk the workspace.
crate::ingest_progress::emit(
progress,
@@ -300,8 +328,8 @@ pub fn ingest_with_config_opts(
root: scope.root.to_string_lossy().into_owned(),
},
);
let connector = FsSourceConnector::new(&app.config)
.context("kb-app::ingest: build FsSourceConnector")?;
let connector =
FsSourceConnector::new(&app.config).context("kb-app::ingest: build FsSourceConnector")?;
let (assets, fs_skips) = connector
.scan_with_skips(&scope)
.context("kb-app::ingest: scan workspace")?;
@@ -312,6 +340,20 @@ pub fn ingest_with_config_opts(
},
);
// v0.20.x Hook 4: emit skip events from scan into log writer.
if let Some(ref lw) = log_writer {
for ev in &fs_skips.events {
if let Ok(mut w) = lw.lock() {
let _ = w.write_event(&crate::ingest_log::LogEvent::Skip {
ts: crate::ingest_log::now_ts(),
doc_path: &ev.doc_path,
reason: ev.reason,
detail: ev.detail.as_deref(),
});
}
}
}
// Embedder + vector store: build once at the top so the cold-start
// cost is paid once even when the workspace has 1000 markdown files.
let embedder = app.embedder()?;
@@ -336,18 +378,14 @@ pub fn ingest_with_config_opts(
// endpoint) aborts ingest fail-fast — better than silently disabling
// OCR/caption mid-run.
let ocr_engine: Option<OllamaVisionOcr> = if app.config.image.ocr.enabled {
Some(
OllamaVisionOcr::new(&app.config)
.context("kb-app::ingest: build OllamaVisionOcr")?,
)
Some(OllamaVisionOcr::new(&app.config).context("kb-app::ingest: build OllamaVisionOcr")?)
} else {
None
};
let caption_llm: Option<Box<dyn LanguageModel>> = if app.config.image.caption.enabled {
Some(Box::new(
OllamaLanguageModel::new(&app.config)
.context("kb-app::ingest: build OllamaLanguageModel for caption")?,
))
Some(Box::new(OllamaLanguageModel::new(&app.config).context(
"kb-app::ingest: build OllamaLanguageModel for caption",
)?))
} else {
None
};
@@ -356,6 +394,29 @@ pub fn ingest_with_config_opts(
caption_llm: caption_llm.as_deref(),
};
// p10 / v0.20 sub-item 1: PDF OCR engine eager init (H-5 resolution).
// Mirrors the image OCR pattern: built once per ingest run; construction is fallible, so a failure aborts the run (fail-fast).
let pdf_ocr_engine: Option<OllamaVisionOcr> =
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
let cfg = &app.config.pdf.ocr;
let endpoint = match cfg.endpoint.as_deref() {
Some(s) if !s.is_empty() => s.to_string(),
_ => app.config.models.llm.endpoint.clone(),
};
Some(
OllamaVisionOcr::from_parts(
endpoint,
cfg.model.clone(),
cfg.languages.clone(),
cfg.max_pixels,
cfg.request_timeout_secs,
)
.context("kb-app::ingest: build OllamaVisionOcr (pdf)")?,
)
} else {
None
};
// Pre-load every existing doc_id so we can label `IngestItem.kind`
// as `New` vs `Updated` correctly. `list_documents` returns one
// row per `(workspace_path, asset_id)` — index by the deterministic
@@ -381,10 +442,8 @@ pub fn ingest_with_config_opts(
// current walker scope (config narrowing / include-glob change) is
// NOT purged — we leave it in place to protect against accidental
// data loss via config edits.
let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> = assets
.iter()
.map(|a| a.workspace_path.clone())
.collect();
let scanned_paths: std::collections::HashSet<kebab_core::WorkspacePath> =
assets.iter().map(|a| a.workspace_path.clone()).collect();
let purged_deleted_files = sweep_deleted_files(
&app,
&scanned_paths,
@@ -447,6 +506,13 @@ pub fn ingest_with_config_opts(
&existing_doc_ids,
&image_pipeline,
force_reingest,
pdf_ocr_engine.as_ref(),
progress,
opts.cancel.as_ref(),
log_writer.clone(),
ocr_ms_samples.clone(),
ocr_pages_cnt.clone(),
ocr_failures_cnt.clone(),
);
let item = match item {
@@ -458,6 +524,16 @@ pub fn ingest_with_config_opts(
error = %e,
"kb-app::ingest: per-file fatal"
);
// v0.20.x Hook 3: write per-asset error to log writer.
if let Some(ref lw) = log_writer {
if let Ok(mut w) = lw.lock() {
let _ = w.write_event(&crate::ingest_log::LogEvent::Error {
ts: crate::ingest_log::now_ts(),
code: "ingest_asset_error",
message: &format!("{e:#}"),
});
}
}
// Note: `error_count += 1` happens below in the
// `match item.kind { Error => ... }` arm — incrementing
// here too would double-count (a regression first
@@ -475,6 +551,8 @@ pub fn ingest_with_config_opts(
parser_version: None,
chunker_version: None,
warnings: Vec::new(),
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: Some(format!("{e:#}")),
}
}
@@ -581,8 +659,7 @@ pub fn ingest_with_config_opts(
}
}
let duration_ms = u32::try_from(started_instant.elapsed().as_millis())
.unwrap_or(u32::MAX);
let duration_ms = u32::try_from(started_instant.elapsed().as_millis()).unwrap_or(u32::MAX);
let finished_at = time::OffsetDateTime::now_utc();
// Record the ingest_runs row with aggregate counts.
@@ -682,6 +759,29 @@ pub fn ingest_with_config_opts(
}
}
// v0.20.x Hook 1 exit: write summary record + flush log writer.
if let Some(ref lw) = log_writer {
if let Ok(mut w) = lw.lock() {
let run_id = w.run_id().to_string();
let ms_samples = ocr_ms_samples.lock().map(|v| v.clone()).unwrap_or_default();
let pages = ocr_pages_cnt.lock().map(|v| *v).unwrap_or(0);
let failures = ocr_failures_cnt.lock().map(|v| *v).unwrap_or(0);
let summary = crate::ingest_log::IngestSummary::new(
crate::ingest_log::now_ts(),
run_id,
scanned_count,
new_count,
error_count,
pages,
failures,
&ms_samples,
started_instant.elapsed().as_millis() as u64,
);
let _ = w.write_summary(&summary);
let _ = w.flush();
}
}
Ok(IngestReport {
scope,
scanned: scanned_count,
@@ -840,8 +940,8 @@ fn try_skip_unchanged(
if stored_is_tier3_fallback {
// Embedder version still must match.
let embedder_match = existing_doc.last_embedding_version.as_ref()
== current_embedding_version;
let embedder_match =
existing_doc.last_embedding_version.as_ref() == current_embedding_version;
if !embedder_match {
return Ok(None);
}
@@ -863,6 +963,8 @@ fn try_skip_unchanged(
parser_version: Some(existing_doc.parser_version.clone()),
chunker_version: existing_doc.last_chunker_version.clone(),
warnings: Vec::new(),
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
}));
}
@@ -883,23 +985,17 @@ fn try_skip_unchanged(
// sentinel removes every doc at this path (the new doc_id is
// not yet known here — it's computed downstream from the new
// PARSER_VERSION).
purge_workspace_path_for_parser_bump(app, asset).with_context(|| {
format!(
"parser-bump orphan purge at {}",
asset.workspace_path.0
)
})?;
purge_workspace_path_for_parser_bump(app, asset)
.with_context(|| format!("parser-bump orphan purge at {}", asset.workspace_path.0))?;
return Ok(None);
}
// 3. Chunker unchanged.
let chunker_match = existing_doc.last_chunker_version.as_ref()
== Some(current_chunker_version);
let chunker_match = existing_doc.last_chunker_version.as_ref() == Some(current_chunker_version);
if !chunker_match {
return Ok(None);
}
// 4. Embedder unchanged.
let embedder_match = existing_doc.last_embedding_version.as_ref()
== current_embedding_version;
let embedder_match = existing_doc.last_embedding_version.as_ref() == current_embedding_version;
if !embedder_match {
return Ok(None);
}
@@ -921,6 +1017,8 @@ fn try_skip_unchanged(
parser_version: Some(existing_doc.parser_version.clone()),
chunker_version: existing_doc.last_chunker_version.clone(),
warnings: Vec::new(),
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
}))
}
@@ -933,7 +1031,8 @@ fn try_skip_unchanged(
fn ext_for_skip_warning(path: &str) -> String {
std::path::Path::new(path)
.extension()
.and_then(|s| s.to_str()).map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
.and_then(|s| s.to_str())
.map_or_else(|| NO_EXT_SENTINEL.to_string(), str::to_ascii_lowercase)
}
/// p9-fb-25: render the `IngestItem.warnings` line for a Skipped
@@ -963,6 +1062,13 @@ fn ingest_one_asset(
existing_doc_ids: &std::collections::HashSet<String>,
image_pipeline: &ImagePipeline<'_>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
ocr_pages_cnt: Arc<Mutex<u32>>,
ocr_failures_cnt: Arc<Mutex<u32>>,
) -> anyhow::Result<kebab_core::IngestItem> {
tracing::debug!(
target: "kebab-app::ingest",
@@ -998,14 +1104,37 @@ fn ingest_one_asset(
vector_store,
existing_doc_ids,
force_reingest,
pdf_ocr_engine,
progress,
cancel,
log_writer,
ocr_ms_samples,
ocr_pages_cnt,
ocr_failures_cnt,
);
}
// p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added.
MediaType::Code(lang)
if matches!(lang.as_str(),
"rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin"
| "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
| "shell" | "c" | "cpp") =>
if matches!(
lang.as_str(),
"rust"
| "python"
| "typescript"
| "javascript"
| "go"
| "java"
| "kotlin"
| "yaml"
| "dockerfile"
| "toml"
| "json"
| "xml"
| "groovy"
| "go-mod"
| "shell"
| "c"
| "cpp"
) =>
{
return ingest_one_code_asset(
app,
@@ -1032,6 +1161,8 @@ fn ingest_one_asset(
parser_version: None,
chunker_version: None,
warnings: vec![unsupported_media_warning(&asset.workspace_path.0)],
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
});
}
@@ -1051,6 +1182,8 @@ fn ingest_one_asset(
parser_version: None,
chunker_version: None,
warnings: vec!["kb:// URI not yet supported".to_string()],
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
});
}
@@ -1081,16 +1214,17 @@ fn ingest_one_asset(
// Frontmatter — `parse_frontmatter` returns Ok even on malformed
// frontmatter (warnings are surfaced through the `Vec<Warning>`).
let (metadata, fm_span, fm_warns) = parse_frontmatter(&bytes, &body_hints)
.context("kb-parse-md::parse_frontmatter")?;
let (metadata, fm_span, fm_warns) =
parse_frontmatter(&bytes, &body_hints).context("kb-parse-md::parse_frontmatter")?;
let body_offset_lines = match fm_span {
Some(span) => count_lines_in(&bytes[..span.end]),
None => 0,
};
let (parsed_blocks, blk_warns) = parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
.context("kb-parse-md::parse_blocks")?;
let (parsed_blocks, blk_warns) =
parse_blocks(&bytes[fm_span_end(fm_span)..], body_offset_lines)
.context("kb-parse-md::parse_blocks")?;
let mut all_warnings = Vec::with_capacity(fm_warns.len() + blk_warns.len());
all_warnings.extend(fm_warns);
@@ -1103,14 +1237,9 @@ fn ingest_one_asset(
.map(|w| format!("{:?}: {}", w.kind, w.note))
.collect();
let mut canonical = build_canonical_document(
asset,
metadata,
parsed_blocks,
parser_version,
all_warnings,
)
.context("kb-parse-md::build_canonical_document")?;
let mut canonical =
build_canonical_document(asset, metadata, parsed_blocks, parser_version, all_warnings)
.context("kb-parse-md::build_canonical_document")?;
let chunks = MdHeadingV1Chunker
.chunk(&canonical, chunk_policy)
@@ -1177,9 +1306,7 @@ fn ingest_one_asset(
dimensions,
})
.collect();
vec_store
.upsert(&records)
.context("VectorStore::upsert")?;
vec_store.upsert(&records).context("VectorStore::upsert")?;
}
}
@@ -1200,6 +1327,8 @@ fn ingest_one_asset(
parser_version: Some(parser_version.clone()),
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
warnings: warning_notes,
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
})
}
@@ -1242,9 +1371,9 @@ fn ingest_one_image_asset(
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: vec![
"kb:// URI not yet supported".to_string(),
],
warnings: vec!["kb:// URI not yet supported".to_string()],
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
});
}
@@ -1354,17 +1483,19 @@ fn ingest_one_image_asset(
"image document missing leading ImageRef block — OCR/caption skipped (first block: {:?})",
other.map(|b| std::mem::discriminant(b))
);
canonical.provenance.events.push(kebab_core::ProvenanceEvent {
at: now,
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(
"image document missing leading ImageRef block — OCR/caption skipped"
.to_string(),
),
});
warning_notes
.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
canonical
.provenance
.events
.push(kebab_core::ProvenanceEvent {
at: now,
agent: "kb-app".to_string(),
kind: kebab_core::ProvenanceKind::Warning,
note: Some(
"image document missing leading ImageRef block — OCR/caption skipped"
.to_string(),
),
});
warning_notes.push("ImageDispatchAnomaly: missing ImageRef block".to_string());
}
}
@@ -1455,6 +1586,8 @@ fn ingest_one_image_asset(
parser_version: Some(canonical.parser_version.clone()),
chunker_version: Some(MdHeadingV1Chunker.chunker_version()),
warnings: warning_notes,
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
})
}
@@ -1510,10 +1643,7 @@ fn record_image_analysis_failure(
/// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` /
/// `chunks` / `embedding_records`). The `assets` row stays — same
/// bytes, same asset_id, only the derived `doc_id` changed.
fn purge_workspace_path_for_parser_bump(
app: &App,
asset: &RawAsset,
) -> anyhow::Result<()> {
fn purge_workspace_path_for_parser_bump(app: &App, asset: &RawAsset) -> anyhow::Result<()> {
let path = &asset.workspace_path.0;
let stale = app
.sqlite
@@ -1648,21 +1778,19 @@ fn sweep_deleted_files(
}
// File is truly absent → purge.
let chunk_ids = match kebab_store_sqlite::purge_deleted_workspace_path(
&app.sqlite,
&stored_path,
) {
Ok(ids) => ids,
Err(e) => {
tracing::warn!(
target: "kebab-app",
path = %stored_path.0,
error = %e,
"sweep_deleted_files: purge failed; skipping this path"
);
continue;
}
};
let chunk_ids =
match kebab_store_sqlite::purge_deleted_workspace_path(&app.sqlite, &stored_path) {
Ok(ids) => ids,
Err(e) => {
tracing::warn!(
target: "kebab-app",
path = %stored_path.0,
error = %e,
"sweep_deleted_files: purge failed; skipping this path"
);
continue;
}
};
// Purge associated vectors (best-effort; partial failure
// acceptable — orphan vectors get cleaned by `kebab reset
@@ -1725,6 +1853,13 @@ fn ingest_one_pdf_asset(
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
pdf_ocr_engine: Option<&OllamaVisionOcr>,
progress: Option<&std::sync::mpsc::Sender<crate::ingest_progress::IngestEvent>>,
cancel: Option<&std::sync::Arc<std::sync::atomic::AtomicBool>>,
log_writer: Option<Arc<Mutex<crate::ingest_log::IngestLogWriter>>>,
ocr_ms_samples: Arc<Mutex<Vec<u64>>>,
ocr_pages_cnt: Arc<Mutex<u32>>,
ocr_failures_cnt: Arc<Mutex<u32>>,
) -> anyhow::Result<kebab_core::IngestItem> {
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
@@ -1739,9 +1874,9 @@ fn ingest_one_pdf_asset(
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: vec![
"kb:// URI not yet supported".to_string(),
],
warnings: vec!["kb:// URI not yet supported".to_string()],
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
});
}
@@ -1778,6 +1913,105 @@ fn ingest_one_pdf_asset(
.extract_for(&asset.media_type, &ctx, &bytes)
.context("kb-app::extract_for (pdf)")?;
// v0.20 sub-item 1: post-extract OCR enrichment (preserves the PR #187 registry
// dispatch invariant — `extract_for` remains the normal entry point).
let (pdf_ocr_pages, pdf_ocr_ms_total): (Option<u32>, Option<u64>) =
if app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on {
match pdf_ocr_engine {
Some(engine) => {
let ocr_opts = crate::pdf_ocr_apply::PdfOcrOpts {
enabled: app.config.pdf.ocr.enabled || app.config.pdf.ocr.always_on,
always_on: app.config.pdf.ocr.always_on,
valid_ratio_threshold: app.config.pdf.ocr.valid_ratio_threshold,
min_char_count: app.config.pdf.ocr.min_char_count,
lang_hint: app.config.pdf.ocr.lang_hint.clone().map(kebab_core::Lang),
cancel: cancel.cloned(),
};
// v0.20.x Hook 2: pre-clone Arcs for capture by OCR closure.
let lw_for_ocr = log_writer.clone();
let samples_for_ocr = ocr_ms_samples.clone();
let pages_for_ocr = ocr_pages_cnt.clone();
let failures_for_ocr = ocr_failures_cnt.clone();
let doc_path_for_log = asset.workspace_path.0.clone();
let summary = crate::pdf_ocr_apply::apply_ocr_to_pdf_pages(
&mut canonical,
engine,
&bytes,
&ocr_opts,
|p| match p {
crate::pdf_ocr_apply::PdfOcrProgress::Started { page } => {
if let Some(sender) = progress {
let _ = sender.send(
crate::ingest_progress::IngestEvent::PdfOcrStarted { page },
);
}
}
crate::pdf_ocr_apply::PdfOcrProgress::Finished {
page,
ms,
chars,
skipped,
image_byte_size,
image_width,
image_height,
ref failure_reason,
} => {
if let Some(sender) = progress {
let _ = sender.send(
crate::ingest_progress::IngestEvent::PdfOcrFinished {
page,
ms,
chars,
ocr_engine: engine.engine_name().to_string(),
skipped,
image_byte_size,
image_width,
image_height,
failure_reason: failure_reason.clone(),
},
);
}
// v0.20.x Hook 2: write OCR event to log writer.
let success = !skipped && failure_reason.is_none();
if let Some(ref lw) = lw_for_ocr {
if let Ok(mut w) = lw.lock() {
let _ = w.write_event(&crate::ingest_log::LogEvent::Ocr {
ts: crate::ingest_log::now_ts(),
doc_path: &doc_path_for_log,
page,
image_byte_size,
image_width,
image_height,
ms,
chars,
success,
reason: failure_reason.as_deref(),
ocr_engine: engine.engine_name(),
});
}
}
if let Ok(mut p) = pages_for_ocr.lock() {
*p += 1;
}
if success {
if let Ok(mut s) = samples_for_ocr.lock() {
s.push(ms);
}
} else if let Ok(mut f) = failures_for_ocr.lock() {
*f += 1;
}
}
},
)?;
(Some(summary.pages_ocrd), Some(summary.ms_total))
}
None => (Some(0), Some(0)),
}
} else {
(None, None)
};
// Per-medium chunker selection: PDF docs always use pdf-page-v1
// regardless of `config.chunking.chunker_version`. The chunker
// validates every block carries `SourceSpan::Page`; failure here
@@ -1818,9 +2052,7 @@ fn ingest_one_pdf_asset(
kind: EmbeddingKind::Document,
})
.collect();
let vectors = emb
.embed(&inputs)
.context("Embedder::embed (pdf chunks)")?;
let vectors = emb.embed(&inputs).context("Embedder::embed (pdf chunks)")?;
let model_id = emb.model_id();
let model_version = emb.model_version();
let dimensions = emb.dimensions();
@@ -1879,6 +2111,8 @@ fn ingest_one_pdf_asset(
parser_version: Some(canonical.parser_version.clone()),
chunker_version: Some(chunker.chunker_version()),
warnings,
pdf_ocr_pages,
pdf_ocr_ms_total,
error: None,
})
}
@@ -1902,7 +2136,7 @@ fn ingest_one_code_asset(
vector_store: Option<&Arc<kebab_store_vector::LanceVectorStore>>,
existing_doc_ids: &std::collections::HashSet<String>,
force_reingest: bool,
code_lang: &str, // <-- NEW (p10-1b Task D)
code_lang: &str, // <-- NEW (p10-1b Task D)
) -> anyhow::Result<kebab_core::IngestItem> {
let path = match &asset.source_uri {
SourceUri::File(p) => p.clone(),
@@ -1917,9 +2151,9 @@ fn ingest_one_code_asset(
chunk_count: None,
parser_version: None,
chunker_version: None,
warnings: vec![
"kb:// URI not yet supported".to_string(),
],
warnings: vec!["kb:// URI not yet supported".to_string()],
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
});
}
@@ -1927,43 +2161,43 @@ fn ingest_one_code_asset(
// p10-1b Task D/G/J: parser_version per-lang.
let parser_version = match code_lang {
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
"python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
"rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()),
"python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()),
"typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()),
"javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()),
"go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()),
"java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()),
"kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()),
// p10-2: Tier 2 has no parse step — sentinel "none-v1".
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
=> ParserVersion("none-v1".to_string()),
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => {
ParserVersion("none-v1".to_string())
}
// p10-3: shell direct routes to Tier 3 (no parse step).
"shell" => ParserVersion("none-v1".to_string()),
// p10-1D: C + C++ AST extractors.
"c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
"c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()),
"cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()),
other => anyhow::bail!("unsupported code_lang: {other}"),
};
// p10-1b Task D/G/J/L: chunker_version per-lang.
let mut chunker_version = match code_lang {
"rust" => CodeRustAstV1Chunker.chunker_version(),
"python" => CodePythonAstV1Chunker.chunker_version(),
"rust" => CodeRustAstV1Chunker.chunker_version(),
"python" => CodePythonAstV1Chunker.chunker_version(),
"typescript" => CodeTsAstV1Chunker.chunker_version(),
"javascript" => CodeJsAstV1Chunker.chunker_version(),
"go" => CodeGoAstV1Chunker.chunker_version(),
"java" => CodeJavaAstV1Chunker.chunker_version(),
"kotlin" => CodeKotlinAstV1Chunker.chunker_version(),
"kotlin" => CodeKotlinAstV1Chunker.chunker_version(),
// p10-2 Tier 2:
"yaml" => K8sManifestResourceV1Chunker.chunker_version(),
"yaml" => K8sManifestResourceV1Chunker.chunker_version(),
"dockerfile" => DockerfileFileV1Chunker.chunker_version(),
"toml" | "json" | "xml" | "groovy" | "go-mod"
=> ManifestFileV1Chunker.chunker_version(),
"toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker.chunker_version(),
// p10-3:
"shell" => CodeTextParagraphV1Chunker.chunker_version(),
"shell" => CodeTextParagraphV1Chunker.chunker_version(),
// p10-1D: C + C++ AST chunkers.
"c" => CodeCAstV1Chunker.chunker_version(),
"cpp" => CodeCppAstV1Chunker.chunker_version(),
"c" => CodeCAstV1Chunker.chunker_version(),
"cpp" => CodeCppAstV1Chunker.chunker_version(),
other => anyhow::bail!("unreachable chunker_version: {other}"),
};
@@ -2026,8 +2260,12 @@ fn ingest_one_code_asset(
// Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate.
let mut canonical = match canonical_result {
Ok(d) => d,
Err(e) if code_lang == "shell"
|| matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") =>
Err(e)
if code_lang == "shell"
|| matches!(
code_lang,
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod"
) =>
{
return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang");
}
@@ -2051,7 +2289,10 @@ fn ingest_one_code_asset(
// Tier 2 langs already have "none-v1" parser_version normally, so exclude them
// from the extract_fell_back guard with the !matches! exclusion.
let extract_fell_back = canonical.parser_version.0 == "none-v1"
&& !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell");
&& !matches!(
code_lang,
"yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"
);
let chunks_result: anyhow::Result<Vec<Chunk>> = if extract_fell_back {
// Tier 1 lang whose extractor errored — go straight to Tier 3 chunker.
@@ -2110,7 +2351,7 @@ fn ingest_one_code_asset(
// "shell" direct path is already Tier 3 — don't retry-double-up.
let chunks: Vec<Chunk> = match chunks_result {
Ok(v) if !v.is_empty() => v,
other if code_lang == "shell" => other?, // shell propagates directly
other if code_lang == "shell" => other?, // shell propagates directly
Ok(_empty) => {
tracing::warn!(
workspace_path = %asset.workspace_path.0,
@@ -2134,7 +2375,9 @@ fn ingest_one_code_asset(
canonical.parser_version = ParserVersion("none-v1".to_string());
CodeTextParagraphV1Chunker
.chunk(&canonical, chunk_policy)
.context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")?
.context(
"kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)",
)?
}
};
@@ -2226,6 +2469,8 @@ fn ingest_one_code_asset(
parser_version: Some(canonical.parser_version.clone()),
chunker_version: Some(chunker_version),
warnings,
pdf_ocr_pages: None,
pdf_ocr_ms_total: None,
error: None,
})
}
@@ -2260,13 +2505,7 @@ fn synthesize_tier2_document(
symbol: Some("<file>".to_string()),
lang: Some(code_lang.to_string()),
};
let block_id: BlockId = id_for_block(
&doc_id,
"code",
&[],
0,
&span,
);
let block_id: BlockId = id_for_block(&doc_id, "code", &[], 0, &span);
let block = kebab_core::Block::Code(CodeBlock {
common: CommonBlock {
block_id,
@@ -2312,7 +2551,9 @@ fn synthesize_tier2_document(
};
let title = {
let fname = asset.workspace_path.0
let fname = asset
.workspace_path
.0
.rsplit('/')
.next()
.unwrap_or(&asset.workspace_path.0);
@@ -2558,7 +2799,9 @@ pub fn ask_with_session_with_config(
/// `data_dir_writable` check probes the resolved `storage.data_dir`
/// from that config (so `--config` users see their custom paths
/// reflected in the report rather than the XDG defaults).
pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow::Result<DoctorReport> {
pub fn doctor_with_config_path(
config_path: Option<&std::path::Path>,
) -> anyhow::Result<DoctorReport> {
tracing::debug!("doctor() invoked");
let mut checks = Vec::new();
@@ -2576,11 +2819,7 @@ pub fn doctor_with_config_path(config_path: Option<&std::path::Path>) -> anyhow:
} else if config_path.is_some() {
// Explicit `--config <path>` that doesn't exist is a hard error
// — defaults would silently mask the user's intent.
(
false,
format!("{} (not found)", cfg_path.display()),
None,
)
(false, format!("{} (not found)", cfg_path.display()), None)
} else {
// No `--config` and no XDG file: defaults are always loadable.
(true, format!("{} (defaults)", cfg_path.display()), None)
@@ -2666,16 +2905,18 @@ pub fn ingest_file_with_config(
path: &std::path::Path,
) -> anyhow::Result<IngestReport> {
if !path.exists() {
anyhow::bail!("ingest-file: source path does not exist: {}", path.display());
anyhow::bail!(
"ingest-file: source path does not exist: {}",
path.display()
);
}
if !path.is_file() {
anyhow::bail!("ingest-file: not a regular file: {}", path.display());
}
let ext_raw = path
.extension()
.and_then(|e| e.to_str())
.ok_or_else(|| anyhow::anyhow!("ingest-file: source has no extension: {}", path.display()))?;
let ext_raw = path.extension().and_then(|e| e.to_str()).ok_or_else(|| {
anyhow::anyhow!("ingest-file: source has no extension: {}", path.display())
})?;
let ext = ext_raw.to_lowercase();
const SUPPORTED_EXTS: &[&str] = &["md", "pdf", "png", "jpg", "jpeg"];
@@ -2752,11 +2993,7 @@ pub fn ingest_stdin_with_config(
let external_dir = crate::external::ensure_external_dir(&workspace_root)?;
crate::external::ensure_kebabignore_entry(&workspace_root)?;
let dest = crate::external::copy_to_external(
&external_dir,
wrapped.as_bytes(),
"md",
)?;
let dest = crate::external::copy_to_external(&external_dir, wrapped.as_bytes(), "md")?;
ingest_file_with_config(config, &dest)
}
@@ -2764,7 +3001,10 @@ pub fn ingest_stdin_with_config(
/// Returns true if `source_path` matches any `.kebabignore` pattern
/// rooted at `workspace_root`. Used by `ingest_file_with_config` to
/// emit a stderr warn before bypassing the ignore.
fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::path::Path) -> bool {
fn check_kebabignore_match(
workspace_root: &std::path::Path,
source_path: &std::path::Path,
) -> bool {
let kebabignore = workspace_root.join(".kebabignore");
if !kebabignore.exists() {
return false;
@@ -2785,5 +3025,7 @@ fn check_kebabignore_match(workspace_root: &std::path::Path, source_path: &std::
Ok(m) => m,
Err(_) => return false,
};
matcher.matched(source_path, source_path.is_dir()).is_ignore()
matcher
.matched(source_path, source_path.is_dir())
.is_ignore()
}

View File

@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
let (nb, guard) = tracing_appender::non_blocking(file_appender);
let env_filter = match level {
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
LogLevel::Default => {
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
}
LogLevel::Verbose => EnvFilter::new("info"),
LogLevel::Debug => EnvFilter::new("debug"),
};

View File

@@ -0,0 +1,323 @@
// crates/kebab-app/src/pdf_ocr_apply.rs
//
// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가
// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠.
// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의
// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는
// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는
// 새 Block::Paragraph 를 push (always_on dual-block).
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::time::Instant;
use anyhow::{Context, Result};
use kebab_core::{
Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
SourceSpan, TextBlock, id_for_block,
};
use kebab_parse_image::OcrEngine;
use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
use lopdf::Document as LopdfDocument;
use time::OffsetDateTime;
use tracing::warn;
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
pub struct PdfOcrOpts {
/// Master switch. `false` short-circuits to
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` before the PDF is
/// re-parsed with lopdf.
pub enabled: bool,
/// `true` → every page is sent to OCR; pages whose extracted text already
/// passes both thresholds get an additional `Block::Paragraph` appended
/// (dual-block path), pages that fail a threshold are rewritten in place.
/// `false` → only pages that fall below `min_char_count` or
/// `valid_ratio_threshold` are sent to OCR (in-place rewrite).
pub always_on: bool,
/// A page is selected for OCR when `compute_valid_char_ratio` of its
/// text-detect block is strictly below this value.
/// Documented range 0.0..=1.0 and default `0.5` come from the config
/// spec — NOTE(review): neither is enforced in this file.
pub valid_ratio_threshold: f32,
/// A page is selected for OCR when the char count of its text-detect
/// block is strictly below this value. Empty pages (cover, blank
/// separator) therefore qualify, and are then skipped with a warning if
/// no DCTDecode image can be extracted for them.
/// Default `20` per the config spec — NOTE(review): not enforced here.
pub min_char_count: u32,
/// Language hint forwarded to the OCR engine (e.g. `Lang("kor".into())`).
/// `None` → no hint is passed to the engine.
pub lang_hint: Option<Lang>,
/// Optional cancellation handle, checked at the start of each page loop
/// iteration. When it reads `true` the run aborts with a
/// `cancelled mid-PDF` error. See plan §6 E4, verifier LOW L-1
/// resolution and spec §4.8 line 1159.
pub cancel: Option<Arc<AtomicBool>>,
}
/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's
/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2).
///
/// `Default` is the "nothing was OCR'd" summary (both counters zero).
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct PdfOcrSummary {
    /// Number of pages for which the OCR engine call succeeded. Pages that
    /// were skipped (thresholds passed, no extractable image, engine error)
    /// are not counted.
    pub pages_ocrd: u32,
    /// Cumulative wall-clock duration of the successful OCR engine calls, in
    /// milliseconds. Accumulated with `saturating_add`, so it clamps at
    /// `u64::MAX` instead of wrapping.
    pub ms_total: u64,
}
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
/// classifies each page via `text_quality::compute_valid_char_ratio` +
/// `min_char_count`, and either:
/// - skips (vector PDF + sufficient text + `always_on=false`),
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
/// (scanned/mojibake page), or
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
/// vector page).
///
/// Errors:
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
/// - lopdf re-parse failure → `Err(...)`.
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
/// event push + `emit_progress(Finished { skipped: true })` + continue
/// (no `Err` propagation).
///
/// See spec §4.1 + §4.4 for the full pipeline.
pub fn apply_ocr_to_pdf_pages<F>(
canonical: &mut CanonicalDocument,
engine: &dyn OcrEngine,
pdf_bytes: &[u8],
opts: &PdfOcrOpts,
mut emit_progress: F,
) -> Result<PdfOcrSummary>
where
F: FnMut(PdfOcrProgress),
{
if !opts.enabled {
return Ok(PdfOcrSummary {
pages_ocrd: 0,
ms_total: 0,
});
}
let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
.context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
let page_count = pdf_doc.get_pages().len() as u32;
let mut new_events: Vec<ProvenanceEvent> = Vec::new();
let mut ocr_blocks: Vec<Block> = Vec::new();
let mut pages_ocrd: u32 = 0;
let mut ms_total: u64 = 0;
// canonical.blocks 의 page → block index map (text-detect block 의 in-place
// mutate 또는 dual-block push 결정용).
// PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를
// 생성 (§1.4) — 그 invariant 사용.
for page_num in 1..=page_count {
if let Some(cancel) = &opts.cancel {
if cancel.load(std::sync::atomic::Ordering::Relaxed) {
anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
}
}
let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
let text = match &canonical.blocks[text_block_idx] {
Block::Paragraph(tb) => tb.text.clone(),
_ => String::new(),
};
let chars = text.chars().count() as u32;
let valid_ratio = compute_valid_char_ratio(&text);
let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
// 결정 matrix:
// always_on=true → 모든 page OCR (dual-block).
// always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
// needs_ocr=false → skip.
let do_ocr = opts.always_on || needs_ocr;
if !do_ocr {
continue;
}
emit_progress(PdfOcrProgress::Started { page: page_num });
let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
b
} else {
let note = format!(
"page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
);
warn!(target: "kebab-app", "{}", note);
new_events.push(ProvenanceEvent {
at: OffsetDateTime::now_utc(),
agent: "kb-parse-pdf".to_string(),
kind: ProvenanceKind::Warning,
note: Some(note),
});
emit_progress(PdfOcrProgress::Finished {
page: page_num,
ms: 0,
chars: 0,
skipped: true,
image_byte_size: None,
image_width: None,
image_height: None,
failure_reason: None,
});
continue;
};
let start = Instant::now();
let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
Ok(t) => t,
Err(e) => {
// OCR failure: warning event + skip (text-detect block 그대로).
let note = format!(
"page={} OCR failed engine={} version={} err={}",
page_num,
engine.engine_name(),
engine.engine_version(),
e
);
warn!(target: "kebab-app", "{}", note);
new_events.push(ProvenanceEvent {
at: OffsetDateTime::now_utc(),
agent: "kb-parse-pdf".to_string(),
kind: ProvenanceKind::Warning,
note: Some(note),
});
emit_progress(PdfOcrProgress::Finished {
page: page_num,
ms: start.elapsed().as_millis() as u64,
chars: 0,
skipped: true,
image_byte_size: Some(page_image_bytes.len() as u64),
image_width: None,
image_height: None,
failure_reason: Some("ocr_error".to_string()),
});
continue;
}
};
let elapsed_ms = start.elapsed().as_millis() as u64;
let chars_ocr = ocr.joined.chars().count() as u32;
pages_ocrd = pages_ocrd.saturating_add(1);
ms_total = ms_total.saturating_add(elapsed_ms);
if opts.always_on && !needs_ocr {
// dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count.
let ocr_ordinal = (page_num - 1) + page_count;
let span_ocr = SourceSpan::Page {
page: page_num,
char_start: Some(0),
char_end: Some(chars_ocr),
};
let block_id =
id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
let common = CommonBlock {
block_id,
heading_path: Vec::new(),
source_span: span_ocr,
};
ocr_blocks.push(Block::Paragraph(TextBlock {
common,
text: ocr.joined.clone(),
inlines: if ocr.joined.is_empty() {
Vec::new()
} else {
vec![Inline::Text {
text: ocr.joined.clone(),
}]
},
}));
} else {
// in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체.
// block_id / ordinal 보존 — span 의 char_end 만 갱신.
if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
tb.text = ocr.joined.clone();
tb.inlines = if ocr.joined.is_empty() {
Vec::new()
} else {
vec![Inline::Text {
text: ocr.joined.clone(),
}]
};
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
*char_end = Some(chars_ocr);
}
}
}
new_events.push(ProvenanceEvent {
at: OffsetDateTime::now_utc(),
agent: "kb-parse-pdf".to_string(),
kind: ProvenanceKind::OcrApplied,
note: Some(format!(
"page={} engine={} version={} regions={} ms={} chars={}",
page_num,
engine.engine_name(),
engine.engine_version(),
ocr.regions.len(),
elapsed_ms,
chars_ocr
)),
});
emit_progress(PdfOcrProgress::Finished {
page: page_num,
ms: elapsed_ms,
chars: chars_ocr,
skipped: false,
image_byte_size: Some(page_image_bytes.len() as u64),
image_width: None,
image_height: None,
failure_reason: None,
});
}
canonical.blocks.extend(ocr_blocks);
canonical.provenance.events.extend(new_events);
Ok(PdfOcrSummary {
pages_ocrd,
ms_total,
})
}
fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
blocks
.iter()
.position(|b| match b {
Block::Paragraph(tb) => matches!(
tb.common.source_span,
SourceSpan::Page { page, .. } if page == page_num
),
_ => false,
})
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
}
/// Per-page OCR progress event passed to the caller's `emit_progress`
/// closure. Step 6's `ingest_one_pdf_asset` carries these as
/// `IngestEvent::PdfOcrStarted` / `PdfOcrFinished` (spec §4.6.1 wire schema).
///
/// Derives `Debug` / `Clone` / `PartialEq` / `Eq` so emitted events can be
/// logged, buffered and asserted on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfOcrProgress {
    /// Emitted once per page that was selected for OCR, before the page image
    /// is extracted and the engine is invoked.
    Started {
        /// 1-based PDF page number.
        page: u32,
    },
    /// Emitted when processing of a page ends (success, skip or failure).
    Finished {
        /// 1-based PDF page number.
        page: u32,
        /// Wall-clock duration of the `engine.recognize` call in
        /// milliseconds. On the skip paths the meaning is mixed: `0` when no
        /// DCTDecode image was available, the actual elapsed time when the
        /// OCR engine failed.
        ms: u64,
        /// Char count of the OCR result text; `0` when skipped.
        chars: u32,
        /// `true` = skipped because no DCTDecode image was available or the
        /// OCR engine failed. `false` = OCR completed normally.
        skipped: bool,
        /// v0.20.x ingest log: raster image byte size (additive, optional).
        image_byte_size: Option<u64>,
        /// v0.20.x ingest log: raster image width in pixels (additive, optional).
        image_width: Option<u32>,
        /// v0.20.x ingest log: raster image height in pixels (additive, optional).
        image_height: Option<u32>,
        /// v0.20.x ingest log: failure reason string when OCR failed
        /// (additive, optional).
        /// Documented values: "timeout" | "ocr_error" | "network_error" |
        /// None (success). The emitter in this file only ever sets
        /// "ocr_error".
        failure_reason: Option<String>,
    },
}

View File

@@ -85,8 +85,7 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
ResetScope::VectorOnly => {
let vector_dir =
expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
vec![vector_dir]
}
ResetScope::ConfigOnly => vec![cfg_dir],
@@ -137,8 +136,8 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
/// the double scan is acceptable for a rare destructive operation.
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
use kebab_core::DocumentStore as _;
use kebab_source_fs::FsSourceConnector;
use kebab_core::SourceScope;
use kebab_source_fs::FsSourceConnector;
let store = kebab_store_sqlite::SqliteStore::open(cfg)
.context("enumerate_orphans: open SqliteStore")?;
@@ -160,16 +159,13 @@ pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
..Default::default()
};
let connector = FsSourceConnector::new(cfg)
.context("enumerate_orphans: build FsSourceConnector")?;
let connector =
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
let (assets, _skips) = connector
.scan_with_skips(&scope)
.context("enumerate_orphans: scan workspace")?;
let scanned: HashSet<WorkspacePath> = assets
.into_iter()
.map(|a| a.workspace_path)
.collect();
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
let mut orphans: Vec<WorkspacePath> = stored
.into_iter()
@@ -206,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
if !p.exists() {
continue;
}
std::fs::remove_dir_all(p)
.with_context(|| format!("remove {}", p.display()))?;
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
removed.push(p.clone());
}
@@ -229,8 +224,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
/// current walker scope without touching any filesystem directory.
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
let orphans = enumerate_orphans(cfg)
.context("execute_orphans_only: enumerate orphans")?;
let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
if orphans.is_empty() {
return Ok(ResetReport {

View File

@@ -39,6 +39,14 @@ pub struct Capabilities {
pub struct Models {
pub parser_version: String,
pub chunker_version: String,
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
#[serde(default)]
pub active_parsers: Vec<String>,
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
/// 빈 corpus → empty Vec.
#[serde(default)]
pub active_chunkers: Vec<String>,
pub embedding_version: String,
pub prompt_template_version: String,
pub index_version: String,
@@ -142,10 +150,10 @@ fn capabilities_snapshot() -> Capabilities {
rag_multi_turn: true,
search_cache: true,
incremental_ingest: true,
streaming_ask: false,
streaming_ask: true,
http_daemon: false,
mcp_server: true,
single_file_ingest: false,
single_file_ingest: true,
bulk_search: true,
}
}
@@ -160,12 +168,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
}
fn collect_stats(
cfg: &Config,
store: &kebab_store_sqlite::SqliteStore,
) -> anyhow::Result<Stats> {
let counts = store
.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
@@ -190,12 +194,16 @@ fn collect_stats(
}
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
Models {
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
// maintain their own versions; surface those when SchemaV1.models
// becomes a multi-medium map (P+).
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
chunker_version: cfg.chunking.chunker_version.clone(),
active_parsers,
active_chunkers,
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
embedding_version: cfg.models.embedding.model.clone(),
prompt_template_version: cfg.rag.prompt_template_version.clone(),
@@ -268,3 +276,27 @@ mod tests_stats_ext {
assert_eq!(s.stats.stale_doc_count, 0);
}
}
#[cfg(test)]
mod tests_capabilities {
    use super::*;

    /// Bug #9: `kebab ask --stream` emits `answer_event.v1` ndjson correctly
    /// (191 events in the original bug report), so the advertised
    /// `capabilities.streaming_ask` flag has to be `true`.
    #[test]
    fn capabilities_streaming_ask_matches_cli_surface() {
        let streaming_ask = capabilities_snapshot().streaming_ask;
        assert!(streaming_ask, "streaming_ask must be true (Bug #9)");
    }

    /// Bug #9: both `kebab ingest-file <path>` and
    /// `kebab ingest-stdin --title <T>` emit `ingest_report.v1` correctly, so
    /// the advertised `capabilities.single_file_ingest` flag has to be `true`.
    #[test]
    fn capabilities_single_file_ingest_matches_cli_surface() {
        let single_file_ingest = capabilities_snapshot().single_file_ingest;
        assert!(
            single_file_ingest,
            "single_file_ingest must be true (Bug #9)"
        );
    }
}

View File

@@ -10,11 +10,7 @@ use kebab_core::SearchHit;
///
/// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
/// rule prevents `kebab-rag → kebab-app`). Update both together.
pub fn compute_stale(
indexed_at: OffsetDateTime,
now: OffsetDateTime,
threshold_days: u32,
) -> bool {
pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
if threshold_days == 0 {
return false;
}
@@ -23,11 +19,7 @@ pub fn compute_stale(
}
/// Sets `stale` on each hit in place using `compute_stale`.
pub fn mark_stale_in_place(
hits: &mut [SearchHit],
now: OffsetDateTime,
threshold_days: u32,
) {
pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
for h in hits {
h.stale = compute_stale(h.indexed_at, now, threshold_days);
}

View File

@@ -29,9 +29,8 @@ fn rust_file_ingests_and_searches_as_code_citation() {
)
.unwrap();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
let items = report.items.as_ref().expect("items present");
@@ -127,9 +126,8 @@ fn rust_code_search_hit_has_repo() {
)
.unwrap();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert_eq!(report.errors, 0, "no ingest errors: {report:?}");
let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("mul"))
@@ -147,8 +145,7 @@ fn rust_code_search_hit_has_repo() {
.and_then(|n| n.to_str())
.map(str::to_owned);
assert_eq!(
h.repo,
expected_repo,
h.repo, expected_repo,
"SearchHit.repo must match the workspace dir name (detect_repo result)"
);
// Also sanity-check code_lang is still filled.
@@ -177,9 +174,8 @@ fn python_file_ingests_and_searches_as_code_citation() {
)
.unwrap();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert!(report.new >= 1, "python file ingested: {report:?}");
@@ -254,9 +250,8 @@ fn typescript_file_ingests_and_searches_as_code_citation() {
)
.unwrap();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert!(report.new >= 1, "ts file ingested: {report:?}");
@@ -331,9 +326,8 @@ fn javascript_file_ingests_and_searches_as_code_citation() {
)
.unwrap();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
assert!(report.new >= 1, "js file ingested: {report:?}");
@@ -515,7 +509,11 @@ fn java_file_ingests_and_searches_as_code_citation() {
line_start,
..
} => {
assert_eq!(lang.as_deref(), Some("java"), "citation.lang must be 'java'");
assert_eq!(
lang.as_deref(),
Some("java"),
"citation.lang must be 'java'"
);
assert_eq!(
symbol.as_deref(),
Some("com.foo.Foo.bar"),
@@ -586,7 +584,11 @@ fn kotlin_file_ingests_and_searches_as_code_citation() {
line_start,
..
} => {
assert_eq!(lang.as_deref(), Some("kotlin"), "citation.lang must be 'kotlin'");
assert_eq!(
lang.as_deref(),
Some("kotlin"),
"citation.lang must be 'kotlin'"
);
assert_eq!(
symbol.as_deref(),
Some("com.foo.Foo.bar"),
@@ -651,8 +653,8 @@ fn tier2_k8s_yaml_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -666,7 +668,11 @@ fn tier2_k8s_yaml_ingest_searchable() {
line_start,
..
} => {
assert_eq!(lang.as_deref(), Some("yaml"), "citation.lang must be 'yaml'");
assert_eq!(
lang.as_deref(),
Some("yaml"),
"citation.lang must be 'yaml'"
);
assert_eq!(
symbol.as_deref(),
Some("Deployment/prod/api"),
@@ -730,8 +736,8 @@ fn tier2_dockerfile_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -813,8 +819,8 @@ fn tier2_cargo_toml_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -896,8 +902,8 @@ fn tier3_shell_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -987,8 +993,8 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -1031,14 +1037,9 @@ fn tier3_yaml_fallback_picks_up_non_k8s_yaml() {
fn rust_file_re_ingest_is_unchanged() {
let env = TestEnv::lexical_only();
std::fs::write(
env.workspace_root.join("stable.rs"),
"pub fn noop() {}\n",
)
.unwrap();
std::fs::write(env.workspace_root.join("stable.rs"), "pub fn noop() {}\n").unwrap();
let r1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let item1 = r1
.items
.as_ref()
@@ -1049,8 +1050,7 @@ fn rust_file_re_ingest_is_unchanged() {
.unwrap();
assert_eq!(item1.kind, IngestItemKind::New);
let r2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let item2 = r2
.items
.unwrap()
@@ -1081,9 +1081,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
)
.unwrap();
let report1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let item1 = report1
.items
.as_ref()
@@ -1093,7 +1092,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
.expect("docker-compose.yml in first report");
assert!(
matches!(item1.kind, IngestItemKind::New),
"first ingest must be New, got {:?}", item1.kind
"first ingest must be New, got {:?}",
item1.kind
);
assert_eq!(
item1.chunker_version.as_ref().map(|c| c.0.as_str()),
@@ -1101,9 +1101,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
"first ingest must use Tier 3 fallback chunker"
);
let report2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let item2 = report2
.items
.as_ref()
@@ -1113,7 +1112,8 @@ fn tier3_yaml_fallback_reingest_is_unchanged() {
.expect("docker-compose.yml in second report");
assert!(
matches!(item2.kind, IngestItemKind::Unchanged),
"second ingest must be Unchanged, got {:?}", item2.kind
"second ingest must be Unchanged, got {:?}",
item2.kind
);
}
@@ -1163,8 +1163,8 @@ fn tier1_c_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -1247,8 +1247,8 @@ fn tier1_cpp_ingest_searchable() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
let h = hits
.iter()
@@ -1266,7 +1266,9 @@ fn tier1_cpp_ingest_searchable() {
// Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar"
// (method) depending on which chunk ranks first.
assert!(
symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
symbol
.as_deref()
.is_some_and(|s| s.starts_with("kebab::chunk::Foo")),
"C++ symbol must start with namespace::Class prefix, got {symbol:?}"
);
assert!(*line_start >= 1, "line_start must be >=1");
@@ -1335,8 +1337,8 @@ fn tier2_k8s_multi_resource_yaml_ingests_without_collision() {
..Default::default()
},
};
let hits = kebab_app::search_with_config(env.config.clone(), query)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), query).expect("search must succeed");
assert!(
hits.len() >= 2,
"expected ≥2 hits (Deployment + Service), got {}",
@@ -1359,9 +1361,8 @@ fn tier3_shell_reingest_is_unchanged() {
)
.unwrap();
let report1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let report1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("first ingest");
let item1 = report1
.items
.as_ref()
@@ -1371,12 +1372,12 @@ fn tier3_shell_reingest_is_unchanged() {
.expect("deploy.sh in first report");
assert!(
matches!(item1.kind, IngestItemKind::New),
"first ingest must be New, got {:?}", item1.kind
"first ingest must be New, got {:?}",
item1.kind
);
let report2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let report2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest");
let item2 = report2
.items
.as_ref()
@@ -1386,6 +1387,7 @@ fn tier3_shell_reingest_is_unchanged() {
.expect("deploy.sh in second report");
assert!(
matches!(item2.kind, IngestItemKind::Unchanged),
"shell reingest must be Unchanged, got {:?}", item2.kind
"shell reingest must be Unchanged, got {:?}",
item2.kind
);
}

View File

@@ -0,0 +1,60 @@
use std::sync::Mutex;
use anyhow::Result;
use kebab_core::{Lang, OcrText};
use kebab_parse_image::OcrEngine;
/// Scripted OCR engine for tests: hands back pre-canned text instead of
/// running real recognition.
///
/// Derives `Debug` so the mock can appear in assertion messages and inside
/// other `Debug` test fixtures.
#[derive(Debug)]
pub struct MockOcrEngine {
    /// Texts returned by successive `recognize` calls, in call order.
    expected_texts: Vec<String>,
    /// Index of the next entry of `expected_texts` to hand out. Wrapped in a
    /// `Mutex` because `recognize` only receives `&self`.
    call_index: Mutex<usize>,
    /// When `true`, every `recognize` call returns an error.
    fail: bool,
}
impl MockOcrEngine {
    /// Builds a mock that knows a single text (kept as a backward-compatible
    /// constructor for the existing `pdf_ocr_apply.rs` call sites).
    ///
    /// Equivalent to [`MockOcrEngine::per_page`] with a one-element list.
    pub fn single(text: impl Into<String>, fail: bool) -> Self {
        Self::per_page(vec![text.into()], fail)
    }

    /// Builds a mock with one text per page; the internal cursor starts at
    /// the first entry and advances on every `recognize` call.
    ///
    /// `fail` is stored as-is and makes every `recognize` call error out.
    pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
        let call_index = Mutex::new(0);
        Self {
            expected_texts: texts,
            call_index,
            fail,
        }
    }
}
/// `OcrEngine` implementation that replays the scripted texts.
impl OcrEngine for MockOcrEngine {
    /// Fixed engine identifier reported by the mock.
    fn engine_name(&self) -> &'static str {
        "mock-ocr"
    }

    /// Fixed engine version reported by the mock.
    fn engine_version(&self) -> String {
        String::from("mock-v1")
    }

    /// Returns the next scripted text, ignoring the image bytes and the
    /// language hint.
    ///
    /// # Errors
    /// Always fails with "mock failure" when the mock was built with
    /// `fail = true`; the cursor is not advanced in that case.
    ///
    /// # Panics
    /// Panics if the cursor mutex is poisoned.
    fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
        if self.fail {
            return Err(anyhow::anyhow!("mock failure"));
        }

        let mut cursor = self.call_index.lock().unwrap();
        // Once the script is exhausted keep repeating its last entry; an
        // empty script yields an empty string.
        let joined = self
            .expected_texts
            .get(*cursor)
            .or_else(|| self.expected_texts.last())
            .cloned()
            .unwrap_or_default();
        *cursor += 1;

        Ok(OcrText {
            joined,
            regions: Vec::new(),
            engine: self.engine_name().to_string(),
            engine_version: self.engine_version(),
        })
    }
}

View File

@@ -93,8 +93,7 @@ impl TestEnv {
/// directly. Caller can invoke this multiple times to simulate
/// re-opening the binary after a corpus revision bump.
pub fn app(&self) -> kebab_app::App {
kebab_app::App::open_with_config(self.config.clone())
.expect("App::open_with_config")
kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
}
}
@@ -169,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
}
}
}
pub mod mock_ocr;

View File

@@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App {
#[test]
fn fetch_chunk_returns_target_only_when_no_context() {
let env = common::TestEnv::new();
common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
common::ingest_md(
&env,
"a.md",
"# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
);
let app = open(&env);
// Find a chunk via search to obtain its id.
@@ -42,7 +46,8 @@ fn fetch_chunk_with_context_returns_neighbors() {
// match. The earlier fixture used 2-char tokens like `A1`/`A3` for
// section bodies — those zero-hit under trigram. Use 5-char unique
// words per section so the query can pin one chunk deterministically.
let body = "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
let body =
"# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
common::ingest_md(&env, "multi.md", body);
let app = env.app();
@@ -110,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() {
.unwrap();
assert_eq!(result.kind, FetchKind::Doc);
let text = result.text.expect("doc text");
assert!(text.contains("Heading One"), "doc text contains heading: {text:?}");
assert!(
text.contains("Heading One"),
"doc text contains heading: {text:?}"
);
assert!(text.contains("First paragraph"), "doc text contains body");
assert!(!result.truncated);
}
@@ -155,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() {
.unwrap();
assert!(result.truncated);
let text = result.text.expect("doc text");
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
assert!(
text.chars().count() <= 100,
"trimmed text len {}",
text.chars().count()
);
}
#[test]
@@ -292,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() {
fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
let env = common::TestEnv::new();
// Multi-chunk markdown so context ±N has neighbors.
let body =
"# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
common::ingest_md(&env, "boundary.md", body);
let app = env.app();
let q = kebab_core::SearchQuery {

View File

@@ -16,8 +16,8 @@
mod common;
use common::TestEnv;
use kebab_app::ingest_with_config_opts;
use kebab_app::IngestOpts;
use kebab_app::ingest_with_config_opts;
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
/// Helper: open the store via `TestEnv` and run `list_documents`.
@@ -125,17 +125,10 @@ fn include_scope_narrowing_does_not_purge() {
include: vec!["**/*.rs".to_string()],
exclude: env.config.workspace.exclude.clone(),
};
let first = ingest_with_config_opts(
env.config.clone(),
wide_scope,
false,
IngestOpts::default(),
)
.expect("first ingest (wide) must succeed");
assert!(
first.new >= 2,
"expected at least 2 new docs: {first:?}"
);
let first =
ingest_with_config_opts(env.config.clone(), wide_scope, false, IngestOpts::default())
.expect("first ingest (wide) must succeed");
assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
assert_eq!(
first.purged_deleted_files, 0,
"no purges on first ingest: {first:?}"

View File

@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
/// inspectable in stored DB rows.
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
use image::{ImageBuffer, Rgb};
let img: ImageBuffer<Rgb<u8>, _> =
ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
let path = root.join(name);
img.save(&path).expect("write PNG fixture");
path
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
// Counters: scanned should include the PNG; new ≥ 1 (markdown
// fixtures from the workspace tree may also count).
assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
assert!(
report.scanned >= 1,
"scanned={}, items={:?}",
report.scanned,
report.items
);
assert_eq!(report.errors, 0, "no errors on lenient OCR path");
// Locate the image doc in the report items.
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
kebab_core::IngestItemKind::New,
"image asset must be classified New on first ingest"
);
assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
assert_eq!(
img_item.chunk_count,
Some(1),
"image emits exactly one chunk"
);
// Inspect the stored chunk text via kb-app's inspect_chunk facade.
let doc_id = img_item.doc_id.clone().expect("image doc id");
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
// Sanity: the doc was actually persisted into SQLite (kb-app's
// list_docs facade reads the same store the chunker writes to).
let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
.expect("list_docs");
let summaries =
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
assert!(
summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
summaries
.iter()
.any(|s| s.doc_path.0.ends_with("diagram.png")),
"image doc must appear in list_docs"
);
@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
.iter()
.find(|i| i.doc_path.0.ends_with("diagram.png"))
.unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
.unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
let block = match &doc.blocks[0] {
kebab_core::Block::ImageRef(b) => b,
_ => unreachable!(),
@@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
let cfg_clone = cfg.clone();
let scope = env.scope();
let report = spawn_blocking(move || {
kebab_app::ingest_with_config(cfg_clone, scope, false)
.expect("ingest with no OCR/caption")
kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
})
.await
.expect("task");
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
.find(|i| i.doc_path.0.ends_with("raw.png"))
.unwrap();
assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
.unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
let block = match &doc.blocks[0] {
kebab_core::Block::ImageRef(b) => b,
_ => unreachable!(),
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
let scope1 = scope.clone();
let scope2 = scope.clone();
let r1 = spawn_blocking(move || {
kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
})
.await
.unwrap();
let r2 = spawn_blocking(move || {
kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
})
.await
.unwrap();
let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
.await
.unwrap();
let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
.await
.unwrap();
let id1 = r1
.items

View File

@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
// First ingest — populates the DB. Use the legacy entry so the
// assertions cover the "previously ingested" set without needing
// IngestOpts::default() to behave identically.
let first =
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
assert!(
first.new >= 1,
"first ingest must create new docs: {first:?}"
);
assert_eq!(
first.unchanged, 0,
"first ingest cannot have unchanged: {first:?}"
);
let scanned = first.scanned;
@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
IngestOpts::default(),
)
.unwrap();
assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
assert_eq!(
second.scanned, scanned,
"second scanned matches first: {second:?}"
);
assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
assert_eq!(
second.updated, 0,
"nothing should be marked updated: {second:?}"
);
assert_eq!(
second.unchanged, scanned,
"every doc must be Unchanged: {second:?}"
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
fn force_reingest_bypasses_skip() {
let env = TestEnv::lexical_only();
let first =
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
assert!(
first.new >= 1,
"first ingest must create new docs: {first:?}"
);
let scanned = first.scanned;
let second = ingest_with_config_opts(

View File

@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
// ingest_with_config_progress (no cancel) runs to completion.
let env = TestEnv::lexical_only();
let (tx, rx) = mpsc::channel::<IngestEvent>();
let report = kebab_app::ingest_with_config_progress(
env.config.clone(),
env.scope(),
true,
Some(tx),
)
.unwrap();
let report =
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
.unwrap();
assert_eq!(report.scanned, 3);
assert_eq!(report.new, 3);

View File

@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {
let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
assert!(err.to_string().contains("unsupported extension"), "{err}");
assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
assert!(
err.to_string().contains(".docx") || err.to_string().contains("docx"),
"{err}"
);
}

View File

@@ -8,8 +8,7 @@ use common::TestEnv;
#[test]
fn ingest_then_list_inspects_round_trip() {
let env = TestEnv::lexical_only();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
// The fixture has 3 markdown files; first ingest should label them
// all as New.
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
}
// list_docs returns the 3 docs.
let docs = kebab_app::list_docs_with_config(
env.config.clone(),
kebab_core::DocFilter::default(),
)
.unwrap();
let docs =
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
.unwrap();
assert_eq!(docs.len(), 3, "docs: {docs:?}");
// inspect_doc round-trips one of them.
let any_doc_id = docs[0].doc_id.clone();
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
.unwrap();
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
assert_eq!(canonical.doc_id, any_doc_id);
assert!(!canonical.blocks.is_empty(), "blocks empty");
}
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
fn ingest_idempotent_on_second_run() {
let env = TestEnv::lexical_only();
let r1 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
assert_eq!(r1.new, 3);
let r2 =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
// Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
// path: when checksum + parser/chunker/embedding versions all match,
// the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");
// list_docs still has 3 docs (no duplicates).
let docs = kebab_app::list_docs_with_config(
env.config.clone(),
kebab_core::DocFilter::default(),
)
.unwrap();
let docs =
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
.unwrap();
assert_eq!(docs.len(), 3);
}
#[test]
fn ingest_summary_only_drops_items() {
let env = TestEnv::lexical_only();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert_eq!(report.scanned, 3);
assert!(report.items.is_none(), "summary-only should null items");
}
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
// of every run. `summary_only=true` writes `items_json=NULL`; the
// counts MUST still be present.
let env = TestEnv::lexical_only();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
.unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert_eq!(report.scanned, 3);
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
.join("kebab.sqlite");
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
let (scanned, new_c, updated, skipped, errors, items_json): (
i64,
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
// tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
// tables under it).
let env = TestEnv::lexical_only();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
assert_eq!(report.errors, 0, "lexical-only run must not error");
assert_eq!(report.new, 3);
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
.join("lancedb");
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
if lance_dir.exists() {
// If the dir was created (e.g., by an earlier consumer touching
// the path), it MUST contain no `.lance` tables.
let mut had_lance_table = false;
for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
let entry = entry.unwrap();
if entry
.path()
.extension()
.and_then(|s| s.to_str())
== Some("lance")
{
if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
had_lance_table = true;
break;
}
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
tags_any: vec!["rust".to_string()],
..Default::default()
};
let rust_docs =
kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
// intro.md and notes/cargo.md both tag "rust".
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
}
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
#[test]
fn inspect_doc_not_found_returns_actionable_error() {
let env = TestEnv::lexical_only();
let bogus =
kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
let bogus = kebab_core::DocumentId(
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
);
let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
let msg = format!("{err:#}");
assert!(
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
let bogus = kebab_core::ChunkId(
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
);
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
.unwrap_err();
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
let msg = format!("{err:#}");
assert!(msg.contains("not found"), "got: {msg}");
}
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
#[test]
fn ingest_stamps_chunker_version_on_document() {
let env = TestEnv::lexical_only();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
let docs = kebab_app::list_docs_with_config(
env.config.clone(),
kebab_core::DocFilter::default(),
)
.unwrap();
let docs =
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
.unwrap();
assert!(!docs.is_empty(), "no docs after ingest");
for doc_entry in &docs {
let canonical =
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
.unwrap();
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
assert!(
canonical.last_chunker_version.is_some(),
"last_chunker_version must be stamped for doc {}: got {:?}",

View File

@@ -0,0 +1,169 @@
// crates/kebab-app/tests/ingest_log_smoke.rs
//
// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
use std::path::PathBuf;
use kebab_app::{IngestOpts, ingest_with_config_opts};
use kebab_config::{Config, LoggingCfg};
use kebab_core::SourceScope;
use serde_json::Value;
use tempfile::TempDir;
/// Builds a lexical-only `Config` rooted at `workspace` with ingest logging
/// enabled and pointed at `log_dir`.
///
/// Creates `data/` and `models/` directories next to `workspace` (under its
/// parent directory) and wires them in as the storage locations.
///
/// # Panics
/// Panics if `workspace` has no parent directory or a directory cannot be
/// created.
fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
    /// Lossy path-to-`String` conversion used for the string-typed config fields.
    fn lossy(path: &std::path::Path) -> String {
        path.to_string_lossy().into_owned()
    }

    let base = workspace.parent().unwrap();
    let data_dir = base.join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let model_dir = base.join("models");
    std::fs::create_dir_all(&model_dir).unwrap();

    let mut cfg = Config::defaults();

    cfg.workspace.root = lossy(workspace);
    cfg.workspace.exclude.clear();

    cfg.storage.data_dir = lossy(&data_dir);
    cfg.storage.model_dir = lossy(&model_dir);

    // Embedding provider "none" with zero dimensions: no embedding model is
    // configured for these tests.
    cfg.models.embedding.provider = String::from("none");
    cfg.models.embedding.dimensions = 0;

    cfg.chunking.target_tokens = 80;
    cfg.chunking.overlap_tokens = 20;

    cfg.logging = LoggingCfg {
        ingest_log_dir: log_dir.to_path_buf(),
        ingest_log_enabled: true,
    };

    cfg
}
/// AC-9: ingest → log file exists + each line valid JSON + last line kind=summary + scanned>0.
#[test]
fn ingest_log_smoke() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    // 1. Minimal corpus: 1 markdown + 1 scanned PDF. OCR is left at the
    //    config default (presumably disabled, so no Ollama is needed —
    //    TODO confirm against `Config::defaults()`).
    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nThis is a smoke test.\n",
    )
    .unwrap();
    // Resolve the fixture from the crate's manifest directory rather than
    // the process working directory, so the lookup does not depend on where
    // the test binary is launched from.
    let pdf_src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    // The PDF is optional: the markdown file alone keeps `scanned > 0`.
    if pdf_src.exists() {
        std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
    }

    // 2. Config with logging enabled.
    let cfg = minimal_config(&workspace, &log_dir);
    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    // 3. Run ingest.
    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // 4. Assert log file exists in log_dir.
    let log_files: Vec<_> = std::fs::read_dir(&log_dir)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|e| {
            let file_name = e.file_name();
            let file_name = file_name.to_string_lossy();
            file_name.starts_with("ingest-") && file_name.ends_with(".ndjson")
        })
        .collect();
    assert_eq!(
        log_files.len(),
        1,
        "expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
    );

    // 5. Parse each line as JSON — assert kind field present and valid.
    let body = std::fs::read_to_string(log_files[0].path()).unwrap();
    let lines: Vec<&str> = body.lines().collect();
    assert!(!lines.is_empty(), "log file should not be empty");
    let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
    for line in &lines {
        let v: Value = serde_json::from_str(line)
            .unwrap_or_else(|e| panic!("line is not valid JSON: {e}\nline: {line}"));
        let kind = v
            .get("kind")
            .and_then(|k| k.as_str())
            .unwrap_or_else(|| panic!("line missing 'kind' field: {line}"));
        assert!(
            valid_kinds.contains(&kind),
            "unexpected kind '{kind}' in line: {line}"
        );
    }

    // 6. Last line must be kind=summary with scanned > 0.
    let last = lines.last().unwrap();
    let last_v: Value = serde_json::from_str(last).unwrap();
    assert_eq!(
        last_v.get("kind").and_then(|k| k.as_str()),
        Some("summary"),
        "last line must be kind=summary, got: {last}"
    );
    // A missing or non-integer `scanned` is treated as 0 and fails the check.
    let scanned = last_v.get("scanned").and_then(Value::as_u64).unwrap_or(0);
    assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
}
/// AC-6: ingest_log_enabled=false → no log file created.
#[test]
fn ingest_log_disabled_emits_no_file() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");
    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nDisabled log test.\n",
    )
    .unwrap();

    // Share the setup with the enabled-path test via `minimal_config`, then
    // flip only the switch under test. The log directory stays configured so
    // that an accidental write would land where the assertion below looks.
    let mut cfg = minimal_config(&workspace, &log_dir);
    cfg.logging.ingest_log_enabled = false;

    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };
    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // log_dir should either not exist or contain 0 ingest-*.ndjson files.
    let log_file_count = if log_dir.exists() {
        std::fs::read_dir(&log_dir)
            .unwrap()
            .filter_map(Result::ok)
            .filter(|e| {
                let file_name = e.file_name();
                let file_name = file_name.to_string_lossy();
                file_name.starts_with("ingest-") && file_name.ends_with(".ndjson")
            })
            .count()
    } else {
        0
    };
    assert_eq!(
        log_file_count, 0,
        "no ingest-*.ndjson file should be created when disabled"
    );
}

View File

@@ -0,0 +1,117 @@
//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
//!
//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
//! Manual invoke:
//! KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
//! cargo test -p kebab-app --test ingest_pdf_ocr_smoke --ignored -j 4
//!
//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
//! to verify the cancel wiring doesn't panic/deadlock.
mod common;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use common::TestEnv;
/// Returns the Ollama endpoint used by the real-OCR tests.
///
/// Reads `KEBAB_PDF_OCR_ENDPOINT`; when the variable cannot be read (unset
/// or not valid Unicode) it falls back to `http://localhost:11434`.
fn ollama_endpoint() -> String {
    match std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        Ok(endpoint) => endpoint,
        Err(_) => String::from("http://localhost:11434"),
    }
}
/// Builds a lexical-only `TestEnv` with PDF OCR switched on against the
/// endpoint from `ollama_endpoint()`, and copies the `scanned_page1.pdf`
/// fixture into the workspace root.
///
/// # Panics
/// Panics if the fixture cannot be copied.
fn make_ocr_env_real() -> TestEnv {
    let mut env = TestEnv::lexical_only();

    // Stage the fixture first: it lives in the sibling `kebab-parse-pdf`
    // crate, one level above this crate's manifest directory.
    let crates_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let fixture = crates_dir
        .parent()
        .unwrap()
        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let target = env.workspace_root.join("scanned_page1.pdf");
    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");

    // Then enable OCR and set the embedding provider to "none".
    env.config.pdf.ocr.enabled = true;
    env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
    env.config.models.embedding.provider = String::from("none");

    env
}
/// § Acceptance §9 #1 — ingesting the scanned fixture through a real Ollama
/// OCR endpoint must report `IngestItem.pdf_ocr_pages == Some(1)`.
///
/// NOTE(review): despite "mock" in the name, this test talks to the live
/// endpoint configured by `make_ocr_env_real()`, hence the `#[ignore]`.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
    let env = make_ocr_env_real();
    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
    assert!(report.new >= 1, "at least one PDF ingested: {report:?}");

    // The per-item list is needed to locate the PDF entry.
    let items = report.items.unwrap_or_default();
    let Some(pdf_item) = items.iter().find(|item| item.doc_path.0.ends_with(".pdf")) else {
        panic!("PDF item must appear in ingest report items: {items:?}");
    };

    let Some(ocr_pages) = pdf_item.pdf_ocr_pages else {
        panic!("pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}");
    };
    assert_eq!(ocr_pages, 1, "scanned_page1.pdf has exactly 1 page");
}
/// § Acceptance §9 #2 — text produced by OCR must be indexed and come back
/// from a lexical search.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ocr_text_indexed_and_searchable() {
    let env = make_ocr_env_real();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");

    // Query a Korean morpheme expected in the qwen2.5vl:3b OCR output for
    // the PoC ground-truth page: "다음" is a high-frequency token in the
    // page1.txt truth file.
    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("다음"))
        .expect("search");

    assert!(
        !hits.is_empty(),
        "OCR-indexed text must surface in lexical search results"
    );
}
/// Smoke test for the production cancel wiring: the cancel flag is already
/// set when ingest starts, so the run is expected to exit before any OCR
/// call is made.
///
/// The endpoint is a dummy (port 1, i.e. connection refused), so OCR HTTP
/// calls would fail if they were reached. The test only verifies that the
/// call returns without panicking or deadlocking; the result is ignored.
#[test]
fn ingest_with_cancel_aborts_mid_pdf() {
    let mut env = TestEnv::lexical_only();

    // Stage the scanned fixture from the sibling `kebab-parse-pdf` crate.
    let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let target = env.workspace_root.join("scanned_page1.pdf");
    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");

    env.config.pdf.ocr.enabled = true;
    env.config.pdf.ocr.endpoint = Some(String::from("http://127.0.0.1:1"));

    // Pre-set flag: cancellation is requested before ingest even begins.
    let cancel = Arc::new(AtomicBool::new(true));

    // Both Ok (pre-cancel exit) and Err (eager OCR engine failure) are
    // acceptable outcomes, so the result is deliberately discarded.
    let _ = kebab_app::ingest_with_config_cancellable(
        env.config.clone(),
        env.scope(),
        false,
        None,
        Some(cancel),
    );
}

View File

@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
fn run_with_progress() -> Vec<IngestEvent> {
let env = TestEnv::lexical_only();
let (tx, rx) = mpsc::channel::<IngestEvent>();
let report = kebab_app::ingest_with_config_progress(
env.config.clone(),
env.scope(),
false,
Some(tx),
)
.unwrap();
let report =
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
.unwrap();
assert_eq!(report.scanned, 3);
assert_eq!(report.new, 3);
@@ -116,13 +112,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
// `ingest_with_config_progress(..., None)` must produce identical
// reports modulo wall-clock duration.
let env = TestEnv::lexical_only();
let r_none = kebab_app::ingest_with_config_progress(
env.config.clone(),
env.scope(),
true,
None,
)
.unwrap();
let r_none =
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
.unwrap();
assert_eq!(r_none.scanned, 3);
assert_eq!(r_none.new, 3);
}
@@ -134,12 +126,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
let env = TestEnv::lexical_only();
let (tx, rx) = mpsc::channel::<IngestEvent>();
drop(rx);
let report = kebab_app::ingest_with_config_progress(
env.config.clone(),
env.scope(),
true,
Some(tx),
)
.unwrap();
let report =
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
.unwrap();
assert_eq!(report.scanned, 3);
}
/// v0.20.0 sub-item 1: verifies that `PdfOcrStarted` and `PdfOcrFinished`
/// events are emitted when a PDF asset is ingested with OCR enabled.
/// Depends on a real Ollama endpoint, so it is `#[ignore]` by default.
///
/// Manual invoke:
/// ```text
/// KEBAB_PDF_OCR_ENABLED=true \
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
/// cargo test -p kebab-app --test ingest_progress \
/// --ignored pdf_ocr_progress_emits_started_finished_events
/// ```
#[test]
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
fn pdf_ocr_progress_emits_started_finished_events() {
    // Copy the F1 fixture (DCTDecode JPEG passthrough) into a workspace
    // directory under the tmpdir.
    let tmpdir = tempfile::tempdir().expect("create tmpdir");
    let workspace = tmpdir.path().join("workspace");
    std::fs::create_dir_all(&workspace).expect("create workspace dir");
    let fixture_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let fixture_bytes = std::fs::read(&fixture_path).expect("F1 fixture present");
    std::fs::write(workspace.join("page1.pdf"), &fixture_bytes).expect("copy F1");

    let data_dir = tmpdir.path().join("data");
    std::fs::create_dir_all(&data_dir).expect("create data dir");

    // Embedding provider "none" (zero dimensions) with PDF OCR enabled.
    let mut config = kebab_config::Config::defaults();
    config.workspace.root = workspace.to_string_lossy().into_owned();
    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
    config.models.embedding.provider = String::from("none");
    config.models.embedding.dimensions = 0;
    config.pdf.ocr.enabled = true;
    // The endpoint is only overridden when the env var is readable;
    // otherwise the config default stays in place.
    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        config.pdf.ocr.endpoint = Some(endpoint);
    }

    let scope = kebab_core::SourceScope {
        root: workspace.clone(),
        ..Default::default()
    };

    let (tx, rx) = mpsc::channel::<IngestEvent>();
    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
        .expect("ingest_with_config_progress");

    // The sender was moved into the ingest call, so draining the receiver
    // ends once every buffered event has been consumed.
    let mut started_count = 0usize;
    let mut finished_count = 0usize;
    for event in rx.iter() {
        match event {
            IngestEvent::PdfOcrStarted { .. } => started_count += 1,
            IngestEvent::PdfOcrFinished { .. } => finished_count += 1,
            _ => {}
        }
    }

    assert!(
        started_count >= 1,
        "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
    );
    assert!(
        finished_count >= 1,
        "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
    );
    // Every started OCR pass must have a matching finish.
    assert_eq!(
        started_count, finished_count,
        "Started 와 Finished 의 count 일치"
    );
}

View File

@@ -29,12 +29,14 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
"## Body content\n\nMore.",
"Article X",
Some("https://example.com/x"),
).unwrap();
)
.unwrap();
assert_eq!(report.new, 1, "{report:?}");
// _external/ contains exactly one .md file with frontmatter.
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)
.collect();
assert_eq!(entries.len(), 1);
@@ -50,16 +52,13 @@ fn ingest_stdin_without_source_uri() {
let dir = tempfile::tempdir().unwrap();
let cfg = fresh_cfg(dir.path());
let report = kebab_app::ingest_stdin_with_config(
cfg.clone(),
"## Body",
"Title",
None,
).unwrap();
let report =
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
assert_eq!(report.new, 1);
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
let entries: Vec<_> = fs::read_dir(&ext_dir)
.unwrap()
.filter_map(std::result::Result::ok)
.collect();
let content = fs::read_to_string(entries[0].path()).unwrap();

View File

@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
}
kebab_app::init_workspace(true).expect("init_workspace");
let cfg_path = kebab_config::Config::xdg_config_path();
let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
panic!("read config at {}: {e}", cfg_path.display())
});
let body = std::fs::read_to_string(&cfg_path)
.unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
assert!(
body.contains("처리 가능한 형식"),
"header lists supported types section: body=\n{body}"

View File

@@ -0,0 +1,122 @@
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
//! v0.20.0 sub-item 1 bugfix.
//!
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
mod common;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use common::mock_ocr::MockOcrEngine;
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
use kebab_chunk::PdfPageV1Chunker;
use kebab_core::{
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
};
use kebab_parse_image::OcrEngine;
use kebab_parse_pdf::PdfTextExtractor;
use time::OffsetDateTime;
/// Builds a synthetic PDF `RawAsset` for `path`.
///
/// The checksum is `hash_char` repeated 64 times and the asset id is derived
/// from that fake hash via `id_for_asset`, so callers that pass different
/// `hash_char` values get assets with different checksums. `discovered_at` is
/// pinned to the UNIX epoch and the asset is recorded as `Copied` at `path`.
///
/// Panics if `WorkspacePath::new` rejects `path`.
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
    let fake_hash: String = std::iter::repeat(hash_char).take(64).collect();
    let location = PathBuf::from(path);
    RawAsset {
        asset_id: id_for_asset(&fake_hash),
        source_uri: SourceUri::File(location.clone()),
        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
        media_type: MediaType::Pdf,
        byte_len,
        checksum: Checksum(fake_hash),
        discovered_at: OffsetDateTime::UNIX_EPOCH,
        stored: AssetStorage::Copied { path: location },
    }
}
/// Runs the extract → OCR half of the pipeline for a single PDF.
///
/// `bytes` is wrapped in a synthetic asset (built from `path` and `hash_char`),
/// extracted with `PdfTextExtractor`, and then passed through
/// `apply_ocr_to_pdf_pages` using `engine` with OCR enabled and `always_on`
/// switched off. Returns the resulting canonical document.
///
/// Panics if extraction or the OCR step returns an error.
fn extract_and_ocr(
    bytes: &[u8],
    path: &str,
    hash_char: char,
    engine: &dyn OcrEngine,
) -> kebab_core::CanonicalDocument {
    let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
    let extract_config = ExtractConfig::default();
    let mut document = PdfTextExtractor::new()
        .extract(
            &ExtractContext {
                asset: &asset,
                workspace_root: Path::new("/"),
                config: &extract_config,
            },
            bytes,
        )
        .unwrap();

    let ocr_opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };
    // The callback argument is a no-op: this helper ignores whatever is reported through it.
    apply_ocr_to_pdf_pages(&mut document, engine, bytes, &ocr_opts, |_| {}).unwrap();
    document
}
/// Bug #3 regression: chunk ids produced for two different scanned PDFs must
/// be globally unique, even when one page's OCR text is long enough to be
/// split into several chunks.
#[test]
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
        .expect("F1 fixture missing");
    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
        .expect("F2 fixture missing");

    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
    // overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
    // collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
    //
    // The glyph must encode to exactly 3 UTF-8 bytes for the arithmetic above to
    // hold; with an empty literal the text degenerates to ". " and the test can
    // never reach the multi-chunk case.
    const GLYPH: &str = "가";
    let trigger_text = format!("{}. {}", GLYPH.repeat(10), GLYPH.repeat(500));
    assert_eq!(
        trigger_text.len(),
        1532,
        "trigger text must be 10*3 + 2 + 500*3 bytes long"
    );

    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
    let f2_engine = MockOcrEngine::single(&trigger_text, false);
    // Distinct hash characters give the two assets distinct fake checksums.
    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);

    let chunk_policy = ChunkPolicy {
        target_tokens: 500,
        overlap_tokens: 80,
        respect_markdown_headings: false,
        chunker_version: PdfPageV1Chunker.chunker_version(),
    };
    let f1_chunks = PdfPageV1Chunker
        .chunk(&f1_canonical, &chunk_policy)
        .unwrap();
    let f2_chunks = PdfPageV1Chunker
        .chunk(&f2_canonical, &chunk_policy)
        .unwrap();

    // Precondition: without at least two chunks the collision cannot occur at all.
    assert!(
        f2_chunks.len() >= 2,
        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
        f2_chunks.len()
    );

    let all_ids: Vec<&str> = f1_chunks
        .iter()
        .chain(f2_chunks.iter())
        .map(|c| c.chunk_id.0.as_str())
        .collect();
    let total = all_ids.len();
    // Deduplicate through a set; any shrinkage means two chunks share an id.
    let unique: HashSet<&str> = all_ids.iter().copied().collect();
    assert_eq!(
        unique.len(),
        total,
        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
        unique.len(),
        total,
    );
}

View File

@@ -0,0 +1,358 @@
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
mod common;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use common::mock_ocr::MockOcrEngine;
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
use kebab_core::{
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
};
use kebab_parse_pdf::PdfTextExtractor;
use time::OffsetDateTime;
// ── Fixture helpers ───────────────────────────────────────────────────────
/// Reads the F1 scanned-page fixture from the sibling `kebab-parse-pdf` crate.
///
/// The path is relative, so it resolves against the current working directory
/// of the test process. Panics if the fixture cannot be read.
fn f1_pdf_bytes() -> Vec<u8> {
    let fixture = Path::new("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    std::fs::read(fixture).expect("F1 fixture missing")
}
/// Builds a synthetic `RawAsset` for `path` with the given media type and length.
///
/// The checksum is a fake all-zero 64-character hash and the asset id is
/// derived from it via `id_for_asset`. `discovered_at` is pinned to the UNIX
/// epoch and the asset is recorded as `Copied` at `path`.
///
/// Panics if `WorkspacePath::new` rejects `path`.
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
    let fake_hash = "0".repeat(64);
    let asset_id = id_for_asset(&fake_hash);
    RawAsset {
        asset_id,
        source_uri: SourceUri::File(PathBuf::from(path)),
        workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
        media_type,
        byte_len,
        // `fake_hash` is not used again, so it is moved rather than cloned.
        checksum: Checksum(fake_hash),
        discovered_at: OffsetDateTime::UNIX_EPOCH,
        stored: AssetStorage::Copied {
            path: PathBuf::from(path),
        },
    }
}
/// Extracts a `CanonicalDocument` from raw PDF bytes with `PdfTextExtractor`.
///
/// The bytes are attached to a synthetic `test.pdf` asset and extracted with
/// the default `ExtractConfig` and `/` as the workspace root. For the scanned
/// F1 fixture this yields an empty-text `Block::Paragraph` per page.
///
/// Panics if extraction fails.
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
    let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
    let extract_config = ExtractConfig::default();
    PdfTextExtractor::new()
        .extract(
            &ExtractContext {
                asset: &asset,
                workspace_root: Path::new("/"),
                config: &extract_config,
            },
            bytes,
        )
        .unwrap()
}
/// Canonical document extracted from the F1 fixture: one empty
/// `Block::Paragraph` for page 1.
fn canonical_with_empty_block() -> CanonicalDocument {
    let f1 = f1_pdf_bytes();
    extract_canonical_from_bytes(&f1)
}
/// F1-based canonical document whose first block carries `text`.
///
/// If the first block is a paragraph, its text and inlines are replaced with
/// `text`, and a `SourceSpan::Page` span gets `char_end` set to the character
/// count of `text`. Callers pass text that is long and clean enough (high
/// valid_ratio, ≥20 chars) to look like a text-layer page.
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
    let replacement = text.to_owned();
    let char_len = text.chars().count() as u32;
    if let Some(Block::Paragraph(para)) = canonical.blocks.first_mut() {
        para.inlines = vec![Inline::Text {
            text: replacement.clone(),
        }];
        para.text = replacement;
        if let SourceSpan::Page { char_end, .. } = &mut para.common.source_span {
            *char_end = Some(char_len);
        }
    }
    canonical
}
/// F1-based canonical document whose first block holds private-use-area
/// codepoints, i.e. text with a low valid_ratio.
///
/// If the first block is a paragraph, its text and inlines become 25 copies of
/// U+E000, and a `SourceSpan::Page` span gets `char_end` set to that count.
fn canonical_with_mojibake_block() -> CanonicalDocument {
    let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
    if let Some(Block::Paragraph(para)) = canonical.blocks.first_mut() {
        // 25 PUA codepoints → valid_ratio ≈ 0
        let garbage: String = std::iter::repeat('\u{E000}').take(25).collect();
        let char_len = garbage.chars().count() as u32;
        para.text = garbage.clone();
        para.inlines = vec![Inline::Text { text: garbage }];
        if let SourceSpan::Page { char_end, .. } = &mut para.common.source_span {
            *char_end = Some(char_len);
        }
    }
    canonical
}
/// Baseline `PdfOcrOpts` shared by the tests: `always_on` off, a 0.5
/// valid-ratio threshold, a 20-character minimum, no language hint and no
/// cancel handle. Only `enabled` varies.
fn default_opts(enabled: bool) -> PdfOcrOpts {
    PdfOcrOpts {
        lang_hint: None,
        cancel: None,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        always_on: false,
        enabled,
    }
}
// ── Tests ─────────────────────────────────────────────────────────────────
// Test 1: F1 + enabled=true → the empty page block is replaced in place by the OCR text.
#[test]
fn f1_input_with_ocr_enabled_replaces_empty_block() {
    let bytes = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
    // Same as the baseline options, but with an explicit language hint.
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: Some(Lang("kor".into())),
        cancel: None,
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    // A single `expect` replaces the `is_some()` + `unwrap()` pair and gives a
    // useful message when the paragraph is missing.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must contain a Block::Paragraph after OCR");
    assert_eq!(first_para.text, "MOCK_OCR_TEXT");
}
// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false)
#[test]
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
    let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
    let mut canonical = canonical_with_filled_block(text);
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
    // Require the paragraph to exist: with `if let Some(..)` the preservation
    // check below was silently skipped whenever no paragraph was found.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("text-detect Block::Paragraph must still be present");
    assert!(first_para.text.starts_with("충분한"), "원본 text 보존");
}
// Test 3: F1 + enabled=false → the helper is a no-op (no pages OCR'd, zero elapsed ms).
#[test]
fn f1_input_with_ocr_disabled_keeps_empty_block() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    let ocr = MockOcrEngine::single("IGNORED", false);
    let disabled = default_opts(false);

    let summary = apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &disabled, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0);
    assert_eq!(summary.ms_total, 0);
}
// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate
#[test]
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
    let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
    let mut canonical = canonical_with_mojibake_block();
    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
    // Require the paragraph to exist: with `if let Some(..)` the replacement
    // check below was silently skipped whenever no paragraph was found.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must contain a Block::Paragraph after OCR");
    assert_eq!(first_para.text, "OCR_MOJIBAKE_REPLACEMENT");
}
// Test 5: filled canonical + always_on=true → the OCR result is added as one
// extra paragraph next to the original text-detect block.
#[test]
fn f3_input_with_always_on_pushes_dual_blocks() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_filled_block(
        "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.",
    );
    let blocks_before = doc.blocks.len();
    let ocr = MockOcrEngine::single("OCR_DUAL", false);
    let always_on_opts = PdfOcrOpts {
        enabled: true,
        always_on: true,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    let summary =
        apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &always_on_opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    assert_eq!(
        doc.blocks.len(),
        blocks_before + 1,
        "always_on 시 새 Block::Paragraph push"
    );

    // Walk the paragraphs once and record which of the two expected texts appear.
    let mut saw_ocr_block = false;
    let mut saw_original_block = false;
    for block in &doc.blocks {
        if let Block::Paragraph(tb) = block {
            saw_ocr_block |= tb.text.as_str() == "OCR_DUAL";
            saw_original_block |= tb.text.starts_with("vector");
        }
    }
    assert!(saw_ocr_block, "OCR block 포함");
    assert!(saw_original_block, "원본 text-detect block 보존");
}
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → page is skipped
// and a Warning provenance event is recorded.
#[test]
fn f6_flatedecode_skipped_with_warning() {
    let flate_pdf = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
        .expect("F6 fixture missing");
    // The canonical document (page-1 block) still comes from F1; only the raw
    // bytes handed to the OCR helper are the FlateDecode fixture.
    let mut doc = canonical_with_empty_block();
    let ocr = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);

    let summary =
        apply_ocr_to_pdf_pages(&mut doc, &ocr, &flate_pdf, &default_opts(true), |_| {}).unwrap();

    assert_eq!(
        summary.pages_ocrd, 0,
        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
    );
    let emitted_warning = doc
        .provenance
        .events
        .iter()
        .any(|e| e.kind == kebab_core::ProvenanceKind::Warning);
    assert!(emitted_warning, "FlateDecode skip 시 Warning event 발행");
}
// Test 7: F7 CCITTFax → page is skipped and a Warning provenance event is
// recorded (verifier M-4 split).
#[test]
fn f7_ccittfax_skipped_with_warning() {
    let ccitt_pdf =
        std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
    // The canonical document (page-1 block) still comes from F1; only the raw
    // bytes handed to the OCR helper are the CCITTFax fixture.
    let mut doc = canonical_with_empty_block();
    let ocr = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);

    let summary =
        apply_ocr_to_pdf_pages(&mut doc, &ocr, &ccitt_pdf, &default_opts(true), |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
    let emitted_warning = doc
        .provenance
        .events
        .iter()
        .any(|e| e.kind == kebab_core::ProvenanceKind::Warning);
    assert!(emitted_warning, "CCITTFax skip 시 Warning event 발행");
}
// Test 8: OCR engine failure → the page is skipped and the error text shows up
// in the note of a Warning provenance event.
#[test]
fn ocr_engine_failure_surfaces_as_warning() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    // NOTE(review): the `true` flag presumably makes the mock return an error
    // containing "mock failure" — confirm against `MockOcrEngine`.
    let failing_ocr = MockOcrEngine::single("", true);

    let summary =
        apply_ocr_to_pdf_pages(&mut doc, &failing_ocr, &pdf, &default_opts(true), |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
    let warning_with_failure = doc
        .provenance
        .events
        .iter()
        .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
        .any(|e| e.note.as_deref().unwrap_or("").contains("mock failure"));
    assert!(
        warning_with_failure,
        "OCR failure 의 error message 가 warning event 의 note 안"
    );
}
// Test 9: dual-block mode on a 1-page PDF leaves exactly two paragraphs, both on page 1.
// NOTE(review): despite the name, no ordinal value is asserted here — only the
// paragraph count and the page number of each span.
#[test]
fn dual_block_ordinals_are_deterministic_and_unique() {
    let pdf = f1_pdf_bytes(); // 1-page PDF → page_count=1
    let mut doc = canonical_with_filled_block(
        "vector 충분한 텍스트. This text has more than twenty characters total.",
    );
    let ocr = MockOcrEngine::single("DUAL", false);
    let always_on_opts = PdfOcrOpts {
        enabled: true,
        always_on: true,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &always_on_opts, |_| {}).unwrap();

    // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
    let paragraph_spans: Vec<&SourceSpan> = doc
        .blocks
        .iter()
        .filter_map(|b| match b {
            Block::Paragraph(tb) => Some(&tb.common.source_span),
            _ => None,
        })
        .collect();
    assert_eq!(paragraph_spans.len(), 2, "dual-block: text-detect + OCR");

    let all_page_1 = paragraph_spans
        .iter()
        .all(|s| matches!(s, SourceSpan::Page { page: 1, .. }));
    assert!(all_page_1, "두 block 모두 page=1");
}
// Test 10: a cancel flag that is already set makes the helper return an error
// whose message mentions "cancelled mid-PDF".
#[test]
fn cancel_handle_aborts_mid_pdf() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    let ocr = MockOcrEngine::single("IGNORED", false);
    // The flag is raised before the call, so cancellation is observed immediately.
    let cancel_flag = Arc::new(AtomicBool::new(true));
    let cancellable_opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: Some(Arc::clone(&cancel_flag)),
    };

    let err = apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &cancellable_opts, |_| {})
        .expect_err("cancel=true 시 error 반환");

    let rendered = err.to_string();
    assert!(
        rendered.contains("cancelled mid-PDF"),
        "error message 가 'cancelled mid-PDF' 포함: {err}"
    );
}

View File

@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
operations: vec![
Operation::new("BT", vec![]),
Operation::new("Tf", vec!["F1".into(), 24.into()]),
Operation::new(
"Td",
vec![Object::Integer(100), Object::Integer(700)],
),
Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
Operation::new("Tj", vec![Object::string_literal(*text)]),
Operation::new("ET", vec![]),
],
};
let stream_data = content.encode().expect("content encode");
let content_id =
doc.add_object(Stream::new(dictionary! {}, stream_data));
let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
page_dict.set("Contents", content_id);
}
let page_id = doc.add_object(page_dict);
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
Object::Integer(842),
],
};
doc.objects
.insert(pages_id, Object::Dictionary(pages_dict));
doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
let catalog_id = doc.add_object(dictionary! {
"Type" => "Catalog",
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
write_pdf(&env.workspace_root, "three.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
.expect("PDF ingest must succeed");
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
.expect("PDF ingest must succeed");
assert_eq!(report.errors, 0);
let items = report.items.as_ref().expect("items present");
@@ -157,23 +151,28 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
.find(|i| i.doc_path.0.ends_with("three.pdf"))
.expect("PDF item present");
assert_eq!(pdf_item.kind, IngestItemKind::New);
assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
assert_eq!(
pdf_item.block_count,
Some(3),
"one Block::Paragraph per page"
);
assert_eq!(
pdf_item.chunk_count,
Some(3),
"one chunk per non-empty page"
);
assert_eq!(
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
Some("pdf-text-v1")
);
assert_eq!(
pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
Some("pdf-page-v1")
Some("pdf-page-v1.1")
);
// Inspect the stored doc to confirm SourceSpan::Page round-trip.
let doc = kebab_app::inspect_doc_with_config(
cfg,
pdf_item.doc_id.as_ref().unwrap(),
)
.expect("inspect_doc returns the PDF document");
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
.expect("inspect_doc returns the PDF document");
assert_eq!(doc.blocks.len(), 3);
for (i, block) in doc.blocks.iter().enumerate() {
let want_page = (i as u32) + 1;
@@ -202,8 +201,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
write_pdf(&env.workspace_root, "stable.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report1 =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let item1 = report1
.items
.as_ref()
@@ -214,8 +212,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
.unwrap();
assert_eq!(item1.kind, IngestItemKind::New);
let report2 =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let item2 = report2
.items
.unwrap()
@@ -239,8 +236,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
std::fs::write(&path, &bytes_v1).unwrap();
let cfg = cfg_with_pdf(&env);
let report_v1 =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let id_v1 = report_v1
.items
.as_ref()
@@ -252,12 +248,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
.clone()
.unwrap();
let bytes_v2 =
build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
std::fs::write(&path, &bytes_v2).unwrap();
let report_v2 =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let item_v2 = report_v2
.items
.as_ref()
@@ -282,9 +276,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
write_pdf(&env.workspace_root, "secret.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
assert_eq!(
report.errors, 1,
"encrypted PDF must increment errors exactly once"
);
let items = report.items.as_ref().unwrap();
let pdf_item = items
.iter()
@@ -310,9 +306,11 @@ fn corrupt_pdf_fails_without_storing() {
write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
assert_eq!(
report.errors, 1,
"corrupt PDF must increment errors exactly once"
);
let items = report.items.as_ref().unwrap();
let pdf_item = items
.iter()
@@ -322,11 +320,8 @@ fn corrupt_pdf_fails_without_storing() {
// Confirm the doc was NOT stored — list_docs returns nothing for
// this path.
let summaries = kebab_app::list_docs_with_config(
cfg,
kebab_core::DocFilter::default(),
)
.unwrap();
let summaries =
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
assert!(
!summaries
.iter()
@@ -341,14 +336,15 @@ fn corrupt_pdf_fails_without_storing() {
#[test]
fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
let env = TestEnv::lexical_only();
let bytes =
build_text_pdf(&[Some("first page"), None, Some("third page")]);
let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
assert_eq!(
report.errors, 0,
"scanned candidate is a Warning, not Error"
);
let pdf_item = report
.items
.as_ref()
@@ -365,14 +361,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
assert_eq!(
pdf_item.chunk_count,
Some(2),
"pdf-page-v1 emits 0 chunks for the empty page; total = 2"
"pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
);
let doc = kebab_app::inspect_doc_with_config(
cfg,
pdf_item.doc_id.as_ref().unwrap(),
)
.unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
let warnings: Vec<_> = doc
.provenance
.events
@@ -419,8 +411,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
let total = report.new + report.updated + report.skipped + report.errors;
assert_eq!(
report.scanned, total,
@@ -441,14 +432,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
let pages: Vec<String> = (1..=50)
.map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
.collect();
let page_refs: Vec<Option<&str>> =
pages.iter().map(|s| Some(s.as_str())).collect();
let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
let bytes = build_text_pdf(&page_refs);
write_pdf(&env.workspace_root, "long.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
assert_eq!(report.errors, 0);
let pdf_item = report
.items
@@ -466,8 +455,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
// Round-trip: list_docs sees the long PDF.
let summaries =
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
.unwrap();
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
}
@@ -476,13 +464,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
#[test]
fn inspect_doc_surfaces_page_spans() {
let env = TestEnv::lexical_only();
let bytes =
build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
let cfg = cfg_with_pdf(&env);
let report =
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
let pdf_item = report
.items
.as_ref()
@@ -490,19 +476,12 @@ fn inspect_doc_surfaces_page_spans() {
.iter()
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
.unwrap();
let doc = kebab_app::inspect_doc_with_config(
cfg,
pdf_item.doc_id.as_ref().unwrap(),
)
.unwrap();
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
assert_eq!(doc.parser_version.0, "pdf-text-v1");
assert_eq!(doc.blocks.len(), 3);
for block in &doc.blocks {
match block {
Block::Paragraph(p) => assert!(matches!(
p.common.source_span,
SourceSpan::Page { .. }
)),
Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
other => panic!("expected Paragraph, got {other:?}"),
}
}

View File

@@ -78,19 +78,15 @@ fn reset_orphans_only_purges_out_of_scope_docs() {
narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];
// Run orphans-only reset.
let report = execute(ResetScope::OrphansOnly, &narrow_cfg)
.expect("orphans-only reset must succeed");
let report =
execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");
assert_eq!(
report.orphans_purged, 2,
"expected 2 orphans purged (b.rs + c.rs): {report:?}"
);
let mut purged: Vec<String> = report
.purged_paths
.iter()
.map(|p| p.0.clone())
.collect();
let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
purged.sort();
assert_eq!(
purged,

View File

@@ -0,0 +1,79 @@
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
use kebab_app::schema_with_config;
use kebab_config::Config;
use kebab_core::SourceScope;
/// Builds a small test `Config` on top of `Config::defaults()`.
///
/// The workspace root and storage directories point at the given paths (the
/// model directory is `<data_dir>/models`), workspace excludes are cleared,
/// the embedding provider is `"none"` with 0 dimensions, and chunking uses
/// 80 target tokens with a 20-token overlap.
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
    // Paths are stored as strings in the config; non-UTF-8 parts are replaced lossily.
    let as_string = |p: &std::path::Path| p.to_string_lossy().into_owned();

    let mut cfg = Config::defaults();

    cfg.workspace.root = as_string(workspace_root);
    cfg.workspace.exclude.clear();

    cfg.storage.data_dir = as_string(data_dir);
    cfg.storage.model_dir = as_string(&data_dir.join("models"));

    cfg.models.embedding.provider = String::from("none");
    cfg.models.embedding.dimensions = 0;

    cfg.chunking.target_tokens = 80;
    cfg.chunking.overlap_tokens = 20;

    cfg
}
/// Scope rooted at `workspace_root` with empty include and exclude lists.
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
    let root = workspace_root.to_path_buf();
    SourceScope {
        root,
        include: Vec::new(),
        exclude: Vec::new(),
    }
}
/// With a migrated but empty store, both active arrays are empty while the
/// single-value `parser_version` field still reports the markdown parser.
#[test]
fn schema_models_active_arrays_empty_on_empty_corpus() {
    let tmp = tempfile::tempdir().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let cfg = minimal_config(tmp.path(), &workspace);

    // Create the schema, then release the store before querying through the facade.
    {
        let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
        store.run_migrations().unwrap();
    }

    let report = schema_with_config(&cfg).unwrap();
    let models = &report.models;
    assert!(
        models.active_parsers.is_empty(),
        "empty corpus → no parsers"
    );
    assert!(
        models.active_chunkers.is_empty(),
        "empty corpus → no chunkers"
    );
    // Backward compat: the pre-existing single-value field keeps the markdown default.
    assert_eq!(models.parser_version, kebab_parse_md::PARSER_VERSION);
}
/// After ingesting one markdown file, both active arrays are populated and
/// reported in sorted order.
#[test]
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
    let dir = tempfile::tempdir().unwrap();
    let workspace = dir.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();

    let cfg = minimal_config(dir.path(), &workspace);
    let scope = minimal_scope(&workspace);
    kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();

    let s = schema_with_config(&cfg).unwrap();
    assert!(
        !s.models.active_parsers.is_empty(),
        "active_parsers populated after ingest"
    );
    assert!(
        !s.models.active_chunkers.is_empty(),
        "active_chunkers populated after ingest"
    );

    // Active arrays must be sorted (ORDER BY in SQL) — check both arrays, not
    // only the parsers.
    let mut sorted_parsers = s.models.active_parsers.clone();
    sorted_parsers.sort();
    assert_eq!(
        s.models.active_parsers, sorted_parsers,
        "active_parsers must be sorted"
    );
    let mut sorted_chunkers = s.models.active_chunkers.clone();
    sorted_chunkers.sort();
    assert_eq!(
        s.models.active_chunkers, sorted_chunkers,
        "active_chunkers must be sorted"
    );
}

View File

@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
schema.wire.schemas
);
assert!(schema.capabilities.json_mode);
assert!(!schema.capabilities.streaming_ask);
assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
assert!(
schema.capabilities.mcp_server,
"mcp_server should be true after fb-30",

View File

@@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() {
assert_eq!(resp.hits.len(), baseline.len());
assert!(!resp.truncated);
assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page");
assert!(
resp.next_cursor.is_none(),
"k=5 against 1 doc → no next page"
);
}
#[test]
@@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() {
fn cursor_paginates_to_next_page() {
let env = common::TestEnv::new();
for i in 0..6 {
common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n"));
common::ingest_md(
&env,
&format!("d{i}.md"),
&format!("# T{i}\n\nrust topic {i}\n"),
);
}
let app = env.app();
@@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() {
page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
let p2_ids: std::collections::HashSet<_> =
page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits");
assert!(
p1_ids.is_disjoint(&p2_ids),
"page 2 must not repeat page 1 hits"
);
}
#[test]

View File

@@ -75,11 +75,9 @@ fn lexical_multi_token_korean_query_hits() {
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
.expect("ingest must succeed");
let hits = kebab_app::search_with_config(
env.config.clone(),
common::lexical_query("해시 충돌"),
)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
.expect("search must succeed");
assert!(
!hits.is_empty(),
@@ -113,11 +111,9 @@ fn lexical_mixed_korean_english_multi_token_query_hits() {
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
.expect("ingest must succeed");
let hits = kebab_app::search_with_config(
env.config.clone(),
common::lexical_query("Rust 충돌은"),
)
.expect("search must succeed");
let hits =
kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
.expect("search must succeed");
assert!(
!hits.is_empty(),

View File

@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
fn lexical_search_empty_query_returns_empty() {
let env = TestEnv::lexical_only();
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" "))
.unwrap();
let hits =
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap();
assert!(hits.is_empty(), "blank query must short-circuit empty");
}
@@ -107,17 +107,17 @@ fn search_uncached_returns_same_hits_as_cached() {
#[test]
fn first_ingest_bumps_corpus_revision() {
let env = TestEnv::lexical_only();
let store_before =
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
store_before.run_migrations().unwrap();
assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert!(
report.new + report.updated > 0,
"first ingest must commit ≥1 doc"
);
let store_after =
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
assert!(
store_after.corpus_revision() >= 1,
"ingest commit must bump corpus_revision (got {})",

View File

@@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() {
assert!(
hits.iter().all(|h| !h.stale),
"freshly-ingested doc must not be stale at default 30d threshold: {:?}",
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
hits.iter()
.map(|h| (h.doc_path.0.clone(), h.stale))
.collect::<Vec<_>>()
);
}
@@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() {
assert!(
hits.iter().all(|h| !h.stale),
"threshold=0 disables staleness even for year-old docs: {:?}",
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
hits.iter()
.map(|h| (h.doc_path.0.clone(), h.stale))
.collect::<Vec<_>>()
);
}

View File

@@ -14,7 +14,8 @@ use common::TestEnv;
fn require_avx_or_panic() {
#[cfg(target_arch = "x86_64")]
{
assert!(std::is_x86_feature_detected!("avx"),
assert!(
std::is_x86_feature_detected!("avx"),
"kb-app vector integration test requires AVX-capable hardware; \
host CPU lacks AVX. Run on an AVX-capable machine."
);
@@ -28,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
require_avx_or_panic();
let env = TestEnv::with_embeddings();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
assert_eq!(report.new, 3);
@@ -55,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
require_avx_or_panic();
let env = TestEnv::with_embeddings();
let report =
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
assert_eq!(report.new, 3);

View File

@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
let report = kebab_app::ingest_with_config(
env.config.clone(),
env.scope(),
false,
).unwrap();
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
let items = report.items.as_ref().expect("items array populated");
let docx_item = items
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
vec!["unsupported media type: <no-ext>".to_string()],
);
assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
assert_eq!(
report.skipped_by_extension.get("<no-ext>").copied(),
Some(1)
);
}

View File

@@ -44,8 +44,8 @@ fn twin_files_fetch_span_uses_correct_asset() {
std::fs::write(dir_b.join("note.md"), content).unwrap();
// Ingest all files (fixture workspace + our two new twins).
let report = ingest_with_config(env.config.clone(), env.scope(), false)
.expect("ingest must succeed");
let report =
ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
// Both twin paths must appear as New in the report.
@@ -53,8 +53,7 @@ fn twin_files_fetch_span_uses_correct_asset() {
let twin_items: Vec<_> = items
.iter()
.filter(|i| {
i.doc_path.0.ends_with("src_a/note.md")
|| i.doc_path.0.ends_with("src_b/note.md")
i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md")
})
.collect();
assert_eq!(
@@ -149,7 +148,10 @@ fn twin_files_fetch_span_uses_correct_asset() {
// at either twin, making one twin's span fetch behave incorrectly.
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest must succeed");
assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
assert_eq!(
report2.errors, 0,
"no ingest errors on second run; report={report2:?}"
);
// Re-open app after second ingest and verify span still works on both.
let app2 = env.app();

View File

@@ -43,9 +43,7 @@ fn twin_files_second_ingest_is_unchanged() {
let items = first.items.as_ref().expect("items must be present");
let twin_items: Vec<_> = items
.iter()
.filter(|i| {
i.doc_path.0.ends_with("__init__.py")
})
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
.collect();
assert_eq!(
twin_items.len(),
@@ -63,8 +61,14 @@ fn twin_files_second_ingest_is_unchanged() {
// Second ingest — same files, same content → both must be Unchanged.
let second = ingest_with_config(env.config.clone(), env.scope(), false)
.expect("second ingest must succeed");
assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}");
assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}");
assert_eq!(
second.errors, 0,
"second ingest: no errors; report={second:?}"
);
assert_eq!(
second.new, 0,
"second ingest: no new docs; report={second:?}"
);
assert_eq!(
second.updated, 0,
"second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"

View File

@@ -39,17 +39,11 @@ impl Chunker for CodeCAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
_ => anyhow::bail!(
"CodeCAstV1Chunker only handles code docs (got non-Code block)"
),
_ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
};
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
anyhow::bail!(
@@ -68,9 +62,12 @@ impl Chunker for CodeCAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +81,13 @@ impl Chunker for CodeCAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +95,7 @@ impl Chunker for CodeCAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +103,13 @@ impl Chunker for CodeCAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +188,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,39 +211,60 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("c".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("c".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("c".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_c_ast_v1() {
assert_eq!(CodeCAstV1Chunker.chunker_version(),
ChunkerVersion("code-c-ast-v1".into()));
assert_eq!(
CodeCAstV1Chunker.chunker_version(),
ChunkerVersion("code-c-ast-v1".into())
);
}
#[test]
@@ -256,7 +282,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-c-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +297,32 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
let body = (0..500)
.map(|i| format!("\tx{i} = {i};\n"))
.collect::<String>();
let code = format!("int big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +336,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeCAstV1Chunker"));
@@ -304,11 +346,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
let base: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeCAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeCAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeCAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +366,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeCAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeCAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,17 +39,13 @@ impl Chunker for CodeCppAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
_ => anyhow::bail!(
"CodeCppAstV1Chunker only handles code docs (got non-Code block)"
),
_ => {
anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
}
};
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeCppAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeCppAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeCppAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeCppAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,39 +213,60 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("cpp".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("cpp".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_cpp_ast_v1() {
assert_eq!(CodeCppAstV1Chunker.chunker_version(),
ChunkerVersion("code-cpp-ast-v1".into()));
assert_eq!(
CodeCppAstV1Chunker.chunker_version(),
ChunkerVersion("code-cpp-ast-v1".into())
);
}
#[test]
@@ -256,7 +284,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +299,32 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::<String>();
let body = (0..500)
.map(|i| format!("\tx{i} = {i};\n"))
.collect::<String>();
let code = format!("int big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +338,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeCppAstV1Chunker"));
@@ -304,11 +348,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
let base: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeCppAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeCppAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeCppAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +368,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeCppAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeCppAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,17 +39,13 @@ impl Chunker for CodeGoAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
_ => anyhow::bail!(
"CodeGoAstV1Chunker only handles code docs (got non-Code block)"
),
_ => {
anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
}
};
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeGoAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeGoAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeGoAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeGoAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,46 +213,72 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("go".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("go".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("go".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_go_ast_v1() {
assert_eq!(CodeGoAstV1Chunker.chunker_version(),
ChunkerVersion("code-go-ast-v1".into()));
assert_eq!(
CodeGoAstV1Chunker.chunker_version(),
ChunkerVersion("code-go-ast-v1".into())
);
}
#[test]
fn one_chunk_per_unit_preserves_code_span() {
let doc = code_doc(&[
("parse", 1, 3, "func parse() {\n\t// x\n}"),
("Foo.double", 5, 7, "func double() int {\n\t//\n\treturn 0\n}"),
(
"Foo.double",
5,
7,
"func double() int {\n\t//\n\treturn 0\n}",
),
]);
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-go-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +304,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!("\tx{i} := {i}")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!("\tx{i} := {i}"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("func big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +344,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeGoAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
let base: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeGoAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeGoAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeGoAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +374,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeGoAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeGoAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,11 +39,7 @@ impl Chunker for CodeJavaAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeJavaAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeJavaAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeJavaAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeJavaAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,39 +213,60 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("java".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("java".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("java".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_java_ast_v1() {
assert_eq!(CodeJavaAstV1Chunker.chunker_version(),
ChunkerVersion("code-java-ast-v1".into()));
assert_eq!(
CodeJavaAstV1Chunker.chunker_version(),
ChunkerVersion("code-java-ast-v1".into())
);
}
#[test]
@@ -256,7 +284,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-java-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +299,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!("\tint x{i} = {i};")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!("\tint x{i} = {i};"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("void big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +339,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeJavaAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
let base: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeJavaAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeJavaAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeJavaAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +369,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeJavaAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeJavaAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,17 +39,13 @@ impl Chunker for CodeJsAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
_ => anyhow::bail!(
"CodeJsAstV1Chunker only handles code docs (got non-Code block)"
),
_ => {
anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
}
};
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeJsAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeJsAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeJsAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeJsAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,46 +213,72 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("javascript".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("javascript".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_js_ast_v1() {
assert_eq!(CodeJsAstV1Chunker.chunker_version(),
ChunkerVersion("code-js-ast-v1".into()));
assert_eq!(
CodeJsAstV1Chunker.chunker_version(),
ChunkerVersion("code-js-ast-v1".into())
);
}
#[test]
fn one_chunk_per_unit_preserves_code_span() {
let doc = code_doc(&[
("parse", 1, 3, "function parse() {\n // x\n}"),
("Foo.double", 5, 7, "function double() {\n //\n return 0;\n}"),
(
"Foo.double",
5,
7,
"function double() {\n //\n return 0;\n}",
),
]);
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-js-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +304,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!(" const x{i} = {i};"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("function big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +344,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeJsAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
let base: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeJsAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeJsAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +374,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeJsAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeJsAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,11 +39,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeKotlinAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeKotlinAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeKotlinAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,46 +213,72 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("kotlin".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("kotlin".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("kotlin".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_kotlin_ast_v1() {
assert_eq!(CodeKotlinAstV1Chunker.chunker_version(),
ChunkerVersion("code-kotlin-ast-v1".into()));
assert_eq!(
CodeKotlinAstV1Chunker.chunker_version(),
ChunkerVersion("code-kotlin-ast-v1".into())
);
}
#[test]
fn one_chunk_per_unit_preserves_code_span() {
let doc = code_doc(&[
("parse", 1, 3, "fun parse() {\n\t// x\n}"),
("Foo.double", 5, 7, "fun double(): Int {\n\t//\n\treturn 0\n}"),
(
"Foo.double",
5,
7,
"fun double(): Int {\n\t//\n\treturn 0\n}",
),
]);
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-kotlin-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +304,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!("\tval x{i} = {i}")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!("\tval x{i} = {i}"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("fun big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +344,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeKotlinAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
let base: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeKotlinAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeKotlinAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeKotlinAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +374,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeKotlinAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeKotlinAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,11 +39,7 @@ impl Chunker for CodePythonAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodePythonAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodePythonAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodePythonAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodePythonAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,39 +213,60 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("python".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("python".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("python".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_python_ast_v1() {
assert_eq!(CodePythonAstV1Chunker.chunker_version(),
ChunkerVersion("code-python-ast-v1".into()));
assert_eq!(
CodePythonAstV1Chunker.chunker_version(),
ChunkerVersion("code-python-ast-v1".into())
);
}
#[test]
@@ -256,7 +284,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-python-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +299,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!(" x{i} = {i}")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!(" x{i} = {i}"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("def big():\n{body}\n");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +339,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodePythonAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
let base: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodePythonAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodePythonAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +369,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodePythonAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodePythonAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -39,11 +39,7 @@ impl Chunker for CodeRustAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
@@ -68,9 +64,12 @@ impl Chunker for CodeRustAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeRustAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeRustAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeRustAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,39 +213,60 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("rust".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("rust".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_rust_ast_v1() {
assert_eq!(CodeRustAstV1Chunker.chunker_version(),
ChunkerVersion("code-rust-ast-v1".into()));
assert_eq!(
CodeRustAstV1Chunker.chunker_version(),
ChunkerVersion("code-rust-ast-v1".into())
);
}
#[test]
@@ -256,7 +284,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-rust-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +299,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!(" let x{i} = {i};")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!(" let x{i} = {i};"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("pub fn big() {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +339,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeRustAstV1Chunker"));
@@ -304,11 +349,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
let base: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeRustAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeRustAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +369,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeRustAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeRustAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -9,7 +9,7 @@
use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
use anyhow::Result;
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
pub const VERSION_LABEL: &str = "code-text-paragraph-v1";

View File

@@ -39,17 +39,13 @@ impl Chunker for CodeTsAstV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
for b in &doc.blocks {
let c = match b {
Block::Code(c) => c,
_ => anyhow::bail!(
"CodeTsAstV1Chunker only handles code docs (got non-Code block)"
),
_ => {
anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
}
};
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
anyhow::bail!(
@@ -68,9 +64,12 @@ impl Chunker for CodeTsAstV1Chunker {
_ => unreachable!("validated above"),
};
let (ls, le, symbol, lang) = match &cb.common.source_span {
SourceSpan::Code { line_start, line_end, symbol, lang } => {
(*line_start, *line_end, symbol.clone(), lang.clone())
}
SourceSpan::Code {
line_start,
line_end,
symbol,
lang,
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
_ => unreachable!("validated above"),
};
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
@@ -84,8 +83,13 @@ impl Chunker for CodeTsAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
None, span, cb.code.clone(),
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
None,
span,
cb.code.clone(),
));
} else {
let parts = split_oversize(&cb.code);
@@ -93,9 +97,7 @@ impl Chunker for CodeTsAstV1Chunker {
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
let part_ls = ls + off_start;
let part_le = ls + off_end;
let part_sym = symbol
.as_ref()
.map(|s| format!("{s} [part {}/{n}]", i + 1));
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
let span = SourceSpan::Code {
line_start: part_ls,
line_end: part_le,
@@ -103,8 +105,13 @@ impl Chunker for CodeTsAstV1Chunker {
lang: lang.clone(),
};
out.push(make_chunk(
doc, &chunker_version, &block_ids, &base_policy_hash,
Some(part_ls), span, text,
doc,
&chunker_version,
&block_ids,
&base_policy_hash,
Some(part_ls),
span,
text,
));
}
}
@@ -183,9 +190,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
mod tests {
use super::*;
use kebab_core::{
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
SourceType, TrustLevel, WorkspacePath,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use time::OffsetDateTime;
@@ -206,46 +213,72 @@ mod tests {
};
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
Block::Code(CodeBlock {
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
common: CommonBlock {
block_id: bid,
heading_path: vec![],
source_span: span,
},
lang: Some("typescript".into()),
code: (*code).to_string(),
})
})
.collect();
CanonicalDocument {
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
lang: Lang("und".into()), blocks,
doc_id,
source_asset_id: aid,
workspace_path: wp,
title: "a".into(),
lang: Lang("und".into()),
blocks,
metadata: Metadata {
aliases: vec![], tags: vec![],
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
user_id_alias: None, user: Default::default(),
repo: Some("kebab".into()), git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()),
source_type: SourceType::Note,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user: Default::default(),
repo: Some("kebab".into()),
git_branch: Some("main".into()),
git_commit: Some("0".repeat(40)),
code_lang: Some("typescript".into()),
},
provenance: Provenance { events: vec![] },
parser_version: pv, schema_version: 1, doc_version: 1,
last_chunker_version: None, last_embedding_version: None,
parser_version: pv,
schema_version: 1,
doc_version: 1,
last_chunker_version: None,
last_embedding_version: None,
}
}
fn policy() -> ChunkPolicy {
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
ChunkPolicy {
target_tokens: 500,
overlap_tokens: 80,
respect_markdown_headings: false,
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
}
}
#[test]
fn chunker_version_is_code_ts_ast_v1() {
assert_eq!(CodeTsAstV1Chunker.chunker_version(),
ChunkerVersion("code-ts-ast-v1".into()));
assert_eq!(
CodeTsAstV1Chunker.chunker_version(),
ChunkerVersion("code-ts-ast-v1".into())
);
}
#[test]
fn one_chunk_per_unit_preserves_code_span() {
let doc = code_doc(&[
("parse", 1, 3, "function parse(): void {\n // x\n}"),
("Foo.double", 5, 7, "function double(): number {\n //\n return 0;\n}"),
(
"Foo.double",
5,
7,
"function double(): number {\n //\n return 0;\n}",
),
]);
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert_eq!(chunks.len(), 2);
@@ -256,7 +289,12 @@ mod tests {
assert_eq!(c.chunker_version.0, "code-ts-ast-v1");
}
match &chunks[0].source_spans[0] {
SourceSpan::Code { symbol, line_start, line_end, .. } => {
SourceSpan::Code {
symbol,
line_start,
line_end,
..
} => {
assert_eq!(symbol.as_deref(), Some("parse"));
assert_eq!((*line_start, *line_end), (1, 3));
}
@@ -266,22 +304,33 @@ mod tests {
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
let body = (0..500)
.map(|i| format!(" const x{i} = {i};"))
.collect::<Vec<_>>()
.join("\n");
let code = format!("function big(): void {{\n{body}\n}}");
let doc = code_doc(&[("big", 1, 502, &code)]);
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
assert!(
chunks.len() >= 2,
"oversize unit must split, got {}",
chunks.len()
);
for c in &chunks {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}");
assert!(
symbol.as_deref().unwrap().starts_with("big [part "),
"part-numbered symbol, got {symbol:?}"
);
}
_ => unreachable!(),
}
}
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
let n = ids.len(); ids.sort_unstable(); ids.dedup();
let n = ids.len();
ids.sort_unstable();
ids.dedup();
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
}
@@ -295,7 +344,8 @@ mod tests {
heading_path: vec![],
source_span: SourceSpan::Line { start: 1, end: 1 },
},
text: "x".into(), inlines: vec![],
text: "x".into(),
inlines: vec![],
})];
let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
assert!(err.to_string().contains("CodeTsAstV1Chunker"));
@@ -304,11 +354,19 @@ mod tests {
#[test]
fn deterministic_chunk_ids_1000() {
let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
let base: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let base: Vec<String> = CodeTsAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
for _ in 0..1000 {
let again: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
let again: Vec<String> = CodeTsAstV1Chunker
.chunk(&doc, &policy())
.unwrap()
.into_iter()
.map(|c| c.chunk_id.0)
.collect();
assert_eq!(again, base);
}
}
@@ -316,7 +374,9 @@ mod tests {
#[test]
fn policy_hash_matches_md_heading_v1() {
let p = policy();
assert_eq!(CodeTsAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p));
assert_eq!(
CodeTsAstV1Chunker.policy_hash(&p),
crate::MdHeadingV1Chunker.policy_hash(&p)
);
}
}

View File

@@ -7,7 +7,7 @@
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
use anyhow::Result;
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
pub const VERSION_LABEL: &str = "dockerfile-file-v1";

View File

@@ -8,7 +8,7 @@
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
use anyhow::Result;
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";
@@ -49,19 +49,14 @@ impl Chunker for K8sManifestResourceV1Chunker {
.get("apiVersion")
.and_then(|v| v.as_str())
.unwrap_or("");
let kind = mapping
.get("kind")
.and_then(|v| v.as_str())
.unwrap_or("");
let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");
// Skip non-k8s documents.
if api.is_empty() || kind.is_empty() {
continue;
}
let metadata = mapping
.get("metadata")
.and_then(|v| v.as_mapping());
let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
let name = metadata
.and_then(|m| m.get("name"))
.and_then(|v| v.as_str())
@@ -118,10 +113,7 @@ fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
.enumerate()
.filter_map(|(i, l)| {
let trimmed = l.trim_end();
if trimmed == "---"
|| trimmed.starts_with("--- ")
|| trimmed.starts_with("---\t")
{
if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") {
Some(i)
} else {
None

View File

@@ -23,14 +23,14 @@ mod code_js_ast_v1;
mod code_kotlin_ast_v1;
mod code_python_ast_v1;
mod code_rust_ast_v1;
pub mod code_text_paragraph_v1;
mod code_ts_ast_v1;
pub mod dockerfile_file_v1;
pub mod k8s_manifest_resource_v1;
pub mod manifest_file_v1;
mod md_heading_v1;
mod pdf_page_v1;
mod tier2_shared;
pub mod k8s_manifest_resource_v1;
pub mod dockerfile_file_v1;
pub mod manifest_file_v1;
pub mod code_text_paragraph_v1;
pub use code_c_ast_v1::CodeCAstV1Chunker;
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
@@ -40,10 +40,10 @@ pub use code_js_ast_v1::CodeJsAstV1Chunker;
pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
pub use code_python_ast_v1::CodePythonAstV1Chunker;
pub use code_rust_ast_v1::CodeRustAstV1Chunker;
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
pub use code_ts_ast_v1::CodeTsAstV1Chunker;
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
pub use manifest_file_v1::ManifestFileV1Chunker;
pub use md_heading_v1::MdHeadingV1Chunker;
pub use pdf_page_v1::PdfPageV1Chunker;
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
pub use manifest_file_v1::ManifestFileV1Chunker;
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;

View File

@@ -8,7 +8,7 @@
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
use anyhow::Result;
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker};
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
pub const VERSION_LABEL: &str = "manifest-file-v1";

View File

@@ -1,8 +1,8 @@
//! `md-heading-v1` — heading-aware Markdown chunker.
use kebab_core::{
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
SourceSpan, id_for_chunk,
};
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
let policy_hash = self.policy_hash(policy);
let chunker_version = self.chunker_version();
let mut out: Vec<Chunk> = Vec::new();
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
// `collect_overlap_seed` keeps seed ≤ target/2, so
// a flush here never produces a chunk smaller than
// the seed budget.
let would_exceed = acc.text_tokens + next_tokens
> policy.target_tokens
let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
&& acc.has_non_heading_content();
if would_exceed {
let overlap_seed = collect_overlap_seed(
&acc,
policy.overlap_tokens,
policy.target_tokens,
);
flush(
&mut acc,
doc,
&chunker_version,
&policy_hash,
&mut out,
);
let overlap_seed =
collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
// Seed next accumulator with the prior chunk's
// tail blocks (paragraph-level overlap). The
// heading is *not* re-included here — it lives
@@ -292,10 +278,11 @@ fn build_chunk(
) -> Chunk {
debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
let block_ids: Vec<BlockId> =
blocks.iter().map(|b| common(b).block_id.clone()).collect();
let source_spans: Vec<SourceSpan> =
blocks.iter().map(|b| common(b).source_span.clone()).collect();
let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
let source_spans: Vec<SourceSpan> = blocks
.iter()
.map(|b| common(b).source_span.clone())
.collect();
// heading_path: pick the first non-Heading block's heading_path
// (which already includes every parent heading per kb-normalize).
@@ -339,12 +326,7 @@ fn build_chunk(
text.len().div_ceil(BYTES_PER_TOKEN)
};
let chunk_id = id_for_chunk(
&doc.doc_id,
chunker_version,
&block_ids,
policy_hash,
);
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);
Chunk {
chunk_id,
@@ -400,14 +382,8 @@ fn render_block_text(b: &Block) -> String {
} else {
i.alt.clone()
};
let ocr = i
.ocr
.as_ref()
.map_or("", |o| o.joined.as_str());
let cap = i
.caption
.as_ref()
.map_or("", |c| c.text.as_str());
let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
[alt.as_str(), ocr, cap]
.iter()
.filter(|s| !s.is_empty())
@@ -447,9 +423,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
mod tests {
use super::*;
use kebab_core::{
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
WorkspacePath, id_for_block,
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
};
use time::OffsetDateTime;
@@ -492,12 +467,7 @@ mod tests {
SourceSpan::Line { start, end }
}
fn common_for(
kind: &str,
heading_path: &[String],
ordinal: u32,
s: SourceSpan,
) -> CommonBlock {
fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
CommonBlock {
block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
heading_path: heading_path.to_vec(),
@@ -532,12 +502,7 @@ mod tests {
})
}
fn paragraph(
text: &str,
heading_path: &[&str],
ordinal: u32,
line: u32,
) -> Block {
fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
Block::Paragraph(TextBlock {
common: common_for("paragraph", &hp, ordinal, span(line, line)),
@@ -546,12 +511,7 @@ mod tests {
})
}
fn code_block(
code: &str,
heading_path: &[&str],
ordinal: u32,
s: SourceSpan,
) -> Block {
fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
Block::Code(CodeBlock {
common: common_for("code", &hp, ordinal, s),
@@ -578,12 +538,7 @@ mod tests {
})
}
fn image_ref(
alt: &str,
heading_path: &[&str],
ordinal: u32,
line: u32,
) -> Block {
fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
Block::ImageRef(ImageRefBlock {
common: common_for("imageref", &hp, ordinal, span(line, line)),

View File

@@ -53,18 +53,21 @@
//! one chunk per atomic block. PdfPageV1 cannot.
//!
//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
//! variant `format!("{base_policy_hash}#c{char_start}")` into the
//! recipe's `policy_hash` slot (so distinct chunks distinguish via
//! different policy_hash inputs), while storing the unmodified
//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
//! segment boundary, strictly increasing across the returned chunks
//! even when the overlap walk collapses `actual_start` to a previous
//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in
//! `Chunk.policy_hash` so the field still answers "what policy was
//! active". v1.1 second-iteration patch — logged in
//! `tasks/HOTFIXES.md` (2026-05-27).
use kebab_core::{
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
SourceSpan, id_for_chunk,
};
const VERSION_LABEL: &str = "pdf-page-v1";
const VERSION_LABEL: &str = "pdf-page-v1.1";
const BYTES_PER_TOKEN: usize = 3;
const POLICY_HASH_HEX_LEN: usize = 16;
@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
hex[..POLICY_HASH_HEX_LEN].to_string()
}
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>> {
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
// Validate up front — every block must be a Paragraph carrying
// SourceSpan::Page. A mixed document signals a routing bug in
// the caller (e.g. running this chunker on Markdown) and is
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
),
};
if !matches!(common.source_span, SourceSpan::Page { .. }) {
anyhow::bail!(
"PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
);
anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
}
}
let base_policy_hash = self.policy_hash(policy);
let chunker_version = self.chunker_version();
let target_bytes = policy
.target_tokens
.saturating_mul(BYTES_PER_TOKEN)
.max(1);
let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
// Clamp the overlap to half the target. Without this, a policy
// with `overlap_tokens >= target_tokens` would make every chunk
// fully re-emit the previous chunk's text — mirrors
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
continue;
}
for (char_start, char_end, slice) in
for (segment_start, char_start, char_end, slice) in
chunk_page(&p.text, target_bytes, overlap_bytes)
{
// PDF chars-per-page comfortably fits in u32 (a single
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
// typography); silent `as u32` truncation would only
// surface on corrupted input, where an explicit panic
// is preferable to an off-by-2^32 span.
let char_start_u32 = u32::try_from(char_start)
.expect("page chars fit in u32");
let char_end_u32 =
u32::try_from(char_end).expect("page chars fit in u32");
let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
let span = SourceSpan::Page {
page: page_num,
char_start: Some(char_start_u32),
char_end: Some(char_end_u32),
};
let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
// Per-chunk policy_hash variant prevents chunk_id
// collision when a page produces multiple chunks. See
// module docs for rationale.
let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
// v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
// variant uses `segment_start` (pre-overlap boundary,
// strictly increasing) instead of `char_start` (post-
// overlap, may collapse to prev_min). See module docs +
// spec §4.1 root cause + HOTFIXES.md 2026-05-27.
let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
let chunk_id =
id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
@@ -198,18 +192,28 @@ impl Chunker for PdfPageV1Chunker {
}
/// Split a single page's text into ordered chunks, each represented as
/// `(char_start, char_end, text_slice)`. Char positions are within the
/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
/// `(segment_start, actual_start, chunk_end, text_slice)`.
///
/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
/// across the returned vec. Use this for chunk_id uniqueness suffixes.
/// - `actual_start` = post-overlap start char index. May collapse to a
/// previous chunk's `actual_start` under aggressive overlap policy.
/// Use this for `SourceSpan::Page::char_start`.
/// - `chunk_end` = chunk's end char index (exclusive).
///
/// Returns an empty vector when `text` is empty or whitespace-only.
fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
fn chunk_page(
text: &str,
target_bytes: usize,
overlap_bytes: usize,
) -> Vec<(usize, usize, usize, String)> {
let chars: Vec<char> = text.chars().collect();
let n = chars.len();
if n == 0 {
return Vec::new();
}
if text.len() <= target_bytes {
return vec![(0, n, text.to_string())];
return vec![(0, 0, n, text.to_string())];
}
// Build candidate boundary positions (char indices where a chunk
@@ -222,8 +226,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
let c = chars[k];
let nx = chars[k + 1];
let is_paragraph_break = c == '\n' && nx == '\n';
let is_sentence_end =
matches!(c, '.' | '?' | '!') && nx.is_whitespace();
let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
bounds.push(k + 2);
}
@@ -235,11 +238,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
bounds.dedup();
// UTF-8 byte length of the slice between two char indices.
let byte_len = |a: usize, b: usize| -> usize {
chars[a..b].iter().map(|c| c.len_utf8()).sum()
};
let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };
let mut chunks: Vec<(usize, usize, String)> = Vec::new();
let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
let mut seg_idx: usize = 0;
while seg_idx + 1 < bounds.len() {
let start = bounds[seg_idx];
@@ -264,7 +265,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
// have absorbed up to `overlap_bytes` of bytes, but never past
// the previous chunk's start (no full re-emission).
let actual_start = if let Some(prev) = chunks.last() {
let prev_min = prev.0;
// prev tuple shape = (segment_start, actual_start, chunk_end, slice).
// overlap walk floor = previous chunk's actual_start (prev.1).
let prev_min = prev.1;
let mut a = start;
let mut acc_o: usize = 0;
while a > prev_min {
@@ -281,7 +284,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
};
let slice: String = chars[actual_start..chunk_end].iter().collect();
chunks.push((actual_start, chunk_end, slice));
chunks.push((start, actual_start, chunk_end, slice));
seg_idx = end_idx;
}
@@ -390,7 +393,11 @@ mod tests {
assert_eq!(c.heading_path, Vec::<String>::new());
assert_eq!(c.source_spans.len(), 1);
match c.source_spans[0] {
SourceSpan::Page { page, char_start, char_end } => {
SourceSpan::Page {
page,
char_start,
char_end,
} => {
assert_eq!(page, (i as u32) + 1);
assert_eq!(char_start, Some(0));
assert!(char_end.unwrap() > 0);
@@ -435,11 +442,16 @@ mod tests {
// N-1's char_end).
for w in chunks.windows(2) {
let prev_end = match w[0].source_spans[0] {
SourceSpan::Page { char_end: Some(e), .. } => e,
SourceSpan::Page {
char_end: Some(e), ..
} => e,
_ => panic!("missing char_end"),
};
let next_start = match w[1].source_spans[0] {
SourceSpan::Page { char_start: Some(s), .. } => s,
SourceSpan::Page {
char_start: Some(s),
..
} => s,
_ => panic!("missing char_start"),
};
assert!(
@@ -653,11 +665,17 @@ mod tests {
// overlap) is the failure mode.
for w in chunks.windows(2) {
let prev_start = match w[0].source_spans[0] {
SourceSpan::Page { char_start: Some(s), .. } => s,
SourceSpan::Page {
char_start: Some(s),
..
} => s,
_ => panic!("missing char_start"),
};
let next_start = match w[1].source_spans[0] {
SourceSpan::Page { char_start: Some(s), .. } => s,
SourceSpan::Page {
char_start: Some(s),
..
} => s,
_ => panic!("missing char_start"),
};
assert!(
@@ -674,6 +692,43 @@ mod tests {
assert_eq!(ids.len(), total, "chunk_ids must remain unique");
}
#[test]
fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
    // Regression test: chunk_ids must stay unique when the overlap walk
    // pulls a later chunk's start all the way back to the previous
    // chunk's start.
    //
    // Trigger shape (taken from Korean OCR text): 10 x "가", then ". ",
    // then 500 x "나". The sentence end after the first run splits the
    // page into two segments: [0, 12) and [12, n).
    //
    // default_policy(500, 80):
    //   target_tokens = 500 -> target_bytes  = 500 * 3 = 1500
    //                          (Hangul syllables are 3 bytes each in UTF-8)
    //   overlap_tokens = 80 -> overlap_bytes = min(240, 750) = 240
    //                          (about 80 such characters)
    //
    // Page byte length = 10*3 + 2 + 500*3 = 1532 > 1500, so the page has
    // to be emitted as more than one chunk. The first segment is only
    // 32 bytes, well inside the 240-byte overlap budget, so the second
    // chunk's actual_start walks back to prev_min = 0. A chunk_id suffix
    // derived from actual_start would therefore be `#c0` for both chunks.
    //
    // The fill characters MUST be 3-byte characters: with empty or
    // single-byte fill the page stays below target_bytes and the
    // multi-chunk path is never exercised.
    // (Added per verifier round 1, item L-3.)
    let early_seg = "가".repeat(10);
    let tail = "나".repeat(500);
    let page_text = format!("{early_seg}. {tail}");
    let doc = make_pdf_doc(&[&page_text]);
    let policy = default_policy(500, 80); // target = 1500 bytes, overlap = 240 bytes

    let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();
    assert!(
        chunks.len() >= 2,
        "expected ≥2 chunks for {} byte page; got {}",
        page_text.len(),
        chunks.len()
    );

    // Sort + dedup: any duplicate id shrinks the vector.
    let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
    ids.sort_unstable();
    let total = ids.len();
    ids.dedup();
    assert_eq!(
        ids.len(),
        total,
        "all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
    );
}
#[test]
fn policy_hash_matches_md_heading_v1_for_identical_policy() {
// Cross-chunker policy fingerprint identity — important so a

View File

@@ -113,7 +113,14 @@ pub(crate) fn build_chunk(
symbol: Some(symbol.to_string()),
lang: Some(lang.to_string()),
};
build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key)
build_chunk_from_span(
doc,
chunker_version,
base_policy_hash,
text,
span,
split_key,
)
}
/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeCAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -15,9 +15,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeCppAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use kebab_parse_code::CppAstExtractor;
use serde_json::Value;
@@ -171,7 +171,9 @@ fn extract_cpp_fixture() -> CanonicalDocument {
workspace_root: &root,
config: &cfg,
};
CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap()
CppAstExtractor::new()
.extract(&ctx, src.as_bytes())
.unwrap()
}
// ---------------------------------------------------------------------------
@@ -261,43 +263,61 @@ fn code_cpp_ast_extractor_snapshot() {
let doc = extract_cpp_fixture();
// Verify the extractor emits all expected named units.
let block_syms: Vec<Option<String>> = doc.blocks.iter().filter_map(|b| match b {
Block::Code(c) => match &c.common.source_span {
SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
let block_syms: Vec<Option<String>> = doc
.blocks
.iter()
.filter_map(|b| match b {
Block::Code(c) => match &c.common.source_span {
SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
_ => None,
},
_ => None,
},
_ => None,
}).collect();
})
.collect();
// Must include namespace-qualified class and its methods
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
"class unit missing: {block_syms:?}"
);
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
"ctor unit missing: {block_syms:?}"
);
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
"dtor unit missing: {block_syms:?}"
);
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
"chunk_doc unit missing: {block_syms:?}"
);
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
"operator() unit missing: {block_syms:?}"
);
// Template function (inside kebab::chunk namespace in the fixture)
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::chunk::identity")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::chunk::identity")),
"identity template fn unit missing: {block_syms:?}"
);
// Free function in outer namespace
assert!(
block_syms.iter().any(|s| s.as_deref() == Some("kebab::global_helper")),
block_syms
.iter()
.any(|s| s.as_deref() == Some("kebab::global_helper")),
"global_helper unit missing: {block_syms:?}"
);
// Global main
@@ -312,14 +332,23 @@ fn code_cpp_ast_extractor_snapshot() {
fn code_cpp_ast_extractor_chunks_deterministic() {
let doc1 = extract_cpp_fixture();
let doc2 = extract_cpp_fixture();
assert_eq!(doc1.blocks, doc2.blocks, "extractor output non-deterministic");
assert_eq!(
doc1.blocks, doc2.blocks,
"extractor output non-deterministic"
);
let policy = fixed_policy();
let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap();
let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap();
assert_eq!(
chunks1.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
chunks2.iter().map(|c| c.chunk_id.0.clone()).collect::<Vec<_>>(),
chunks1
.iter()
.map(|c| c.chunk_id.0.clone())
.collect::<Vec<_>>(),
chunks2
.iter()
.map(|c| c.chunk_id.0.clone())
.collect::<Vec<_>>(),
"chunker output non-deterministic"
);
}

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeGoAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeJavaAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeJsAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeKotlinAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodePythonAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeRustAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -13,9 +13,9 @@ use std::path::PathBuf;
use kebab_chunk::CodeTsAstV1Chunker;
use kebab_core::{
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
id_for_block, id_for_doc,
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
WorkspacePath, id_for_block, id_for_doc,
};
use serde_json::Value;
use time::OffsetDateTime;

View File

@@ -124,7 +124,11 @@ fn dockerfile_emits_single_chunk() {
Some("<dockerfile>"),
"symbol must be '<dockerfile>'"
);
assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'");
assert_eq!(
lang.as_deref(),
Some("dockerfile"),
"lang must be 'dockerfile'"
);
}
other => panic!("expected SourceSpan::Code, got {other:?}"),
}

View File

@@ -110,13 +110,11 @@ fn k8s_multi_doc_emits_one_chunk_per_resource() {
let symbols: Vec<&str> = chunks
.iter()
.map(|c| {
match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => {
symbol.as_deref().expect("symbol must be Some for k8s chunks")
}
other => panic!("expected Code span, got {other:?}"),
}
.map(|c| match &c.source_spans[0] {
SourceSpan::Code { symbol, .. } => symbol
.as_deref()
.expect("symbol must be Some for k8s chunks"),
other => panic!("expected Code span, got {other:?}"),
})
.collect();
@@ -270,7 +268,11 @@ fn k8s_oversize_splits_into_line_windows_sharing_symbol() {
let ranges: Vec<(u32, u32)> = chunks
.iter()
.map(|c| match &c.source_spans[0] {
SourceSpan::Code { line_start, line_end, .. } => (*line_start, *line_end),
SourceSpan::Code {
line_start,
line_end,
..
} => (*line_start, *line_end),
other => panic!("expected Code span, got {other:?}"),
})
.collect();

View File

@@ -15,7 +15,7 @@ use std::path::PathBuf;
use kebab_chunk::MdHeadingV1Chunker;
use kebab_core::{
AssetId, AssetStorage, Checksum, ChunkPolicy, ChunkerVersion, Chunker, MediaType,
AssetId, AssetStorage, Checksum, ChunkPolicy, Chunker, ChunkerVersion, MediaType,
ParserVersion, RawAsset, SourceUri, WorkspacePath,
};
use kebab_parse_md::{BodyHints, build_canonical_document, parse_blocks, parse_frontmatter};
@@ -65,8 +65,7 @@ fn long_section_chunks_snapshot() {
Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1,
None => 1,
};
let (blocks, parse_warns) =
parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
// Pin parser_version so doc_id / block_ids are reproducible.
let parser_version = ParserVersion("kb-chunk-snapshot-test-0".into());
@@ -74,9 +73,8 @@ fn long_section_chunks_snapshot() {
metadata.aliases.sort();
metadata.tags.sort();
let doc =
build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
.expect("build_canonical_document");
let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
.expect("build_canonical_document");
// Pin policy so policy_hash and chunk_ids are reproducible.
let policy = ChunkPolicy {
@@ -102,8 +100,7 @@ fn long_section_chunks_snapshot() {
baseline_path.display()
),
};
let expected: Value =
serde_json::from_str(&baseline_text).expect("baseline parses as json");
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
if actual != expected {
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
@@ -154,14 +151,8 @@ fn long_section_chunks_are_deterministic() {
let mut metadata = metadata;
metadata.aliases.sort();
metadata.tags.sort();
let doc = build_canonical_document(
&asset,
metadata,
blocks,
&parser_version,
parse_warns,
)
.expect("build_canonical_document");
let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
.expect("build_canonical_document");
let ids: Vec<String> = MdHeadingV1Chunker
.chunk(&doc, &policy)
.unwrap()

View File

@@ -107,9 +107,7 @@ fn cargo_toml_single_chunk_with_toml_lang() {
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("toml", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
assert_eq!(
chunks.len(),
@@ -149,9 +147,7 @@ fn package_json_single_chunk_with_json_lang() {
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("json", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
assert_eq!(
chunks.len(),
@@ -191,9 +187,7 @@ fn pom_xml_single_chunk_with_xml_lang() {
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("xml", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
assert_eq!(
chunks.len(),
@@ -233,9 +227,7 @@ fn go_mod_single_chunk_with_go_mod_lang() {
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
let doc = manifest_doc("go-mod", &text);
let chunks = ManifestFileV1Chunker
.chunk(&doc, &policy())
.expect("chunk");
let chunks = ManifestFileV1Chunker.chunk(&doc, &policy()).expect("chunk");
assert_eq!(
chunks.len(),

View File

@@ -156,7 +156,7 @@ enum Cmd {
/// p9-fb-36: filter by `assets.media_type` kind. Comma-separated.
/// Aliases: `md` → `markdown`. Other accepted: `markdown`, `pdf`,
/// `image`, `audio`, `other`. Unknown values match nothing.
/// `image`, `audio`, `code`, `other`. Unknown values match nothing.
#[arg(long, value_delimiter = ',')]
media: Vec<String>,
@@ -179,7 +179,12 @@ enum Cmd {
/// canonical). Repeatable or comma-separated.
/// Examples: `rust`, `python`, `typescript`.
/// Unknown values produce empty hits.
#[arg(long = "code-lang", value_name = "LANG", num_args = 1, value_delimiter = ',')]
#[arg(
long = "code-lang",
value_name = "LANG",
num_args = 1,
value_delimiter = ','
)]
code_lang: Vec<String>,
/// p9-fb-37: emit pre-fusion lexical / vector / RRF candidate
@@ -464,7 +469,9 @@ fn parse_bool_env(s: &str) -> Result<bool, String> {
match s.to_ascii_lowercase().as_str() {
"1" | "true" | "yes" | "on" => Ok(true),
"0" | "false" | "no" | "off" => Ok(false),
other => Err(format!("expected 1/0/true/false/yes/no/on/off, got {other:?}")),
other => Err(format!(
"expected 1/0/true/false/yes/no/on/off, got {other:?}"
)),
}
}
@@ -551,8 +558,14 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
"created {}",
kebab_config::Config::xdg_config_path().display()
);
println!("created {}", kebab_config::Config::xdg_data_dir().display());
println!("created {}", kebab_config::Config::xdg_state_dir().display());
println!(
"created {}",
kebab_config::Config::xdg_data_dir().display()
);
println!(
"created {}",
kebab_config::Config::xdg_state_dir().display()
);
println!("hint edit the config above, then `kebab ingest`");
}
Ok(())
@@ -565,7 +578,9 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
} => {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
let scope = kebab_core::SourceScope {
root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
root: root
.clone()
.unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
exclude: cfg.workspace.exclude.clone(),
..Default::default()
};
@@ -580,9 +595,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
.unwrap_or(false);
let mode = progress::ProgressMode::from_flags(cli.json, cli.quiet, plain_env);
let (tx, rx) = std::sync::mpsc::channel::<kebab_app::IngestEvent>();
let display_handle = std::thread::spawn(move || {
progress::ProgressDisplay::new(mode).run(rx)
});
let display_handle =
std::thread::spawn(move || progress::ProgressDisplay::new(mode).run(rx));
// p9-fb-04: register a Ctrl-C handler that flips the same
// AtomicBool the facade polls at each step boundary. The
@@ -614,7 +628,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
if cli.json {
println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
} else {
let skipped_breakdown = kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
let skipped_breakdown =
kebab_app::render_skipped_breakdown(&report.skipped_by_extension);
let purged_suffix = if report.purged_deleted_files > 0 {
format!(" purged {}", report.purged_deleted_files)
} else {
@@ -640,7 +655,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
let docs = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())?;
if cli.json {
println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?);
println!(
"{}",
serde_json::to_string(&wire::wire_doc_summaries(&docs))?
);
} else {
for d in &docs {
println!("{}\t{}", d.doc_id, d.doc_path.0);
@@ -667,7 +685,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
let chunk_id: kebab_core::ChunkId = id.parse()?;
let chunk = kebab_app::inspect_chunk_with_config(cfg, &chunk_id)?;
println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?);
println!(
"{}",
serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?
);
Ok(())
}
},
@@ -708,7 +729,10 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
};
let result = kebab_app::fetch_with_config(cfg, query, opts)?;
if cli.json {
println!("{}", serde_json::to_string(&wire::wire_fetch_result(&result))?);
println!(
"{}",
serde_json::to_string(&wire::wire_fetch_result(&result))?
);
} else {
render_fetch_plain(&result);
}
@@ -752,30 +776,21 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
if line.trim().is_empty() {
continue;
}
let v: serde_json::Value =
serde_json::from_str(&line).map_err(|e| {
anyhow::Error::new(kebab_app::StructuredError(
kebab_app::ErrorV1 {
schema_version: kebab_app::ERROR_V1_ID
.to_string(),
code: "config_invalid".to_string(),
message: format!(
"stdin ndjson line {} parse error: {e}",
lineno + 1
),
details: serde_json::Value::Null,
hint: Some(
"each line must be a JSON object with at least `query`"
.to_string(),
),
},
))
})?;
let v: serde_json::Value = serde_json::from_str(&line).map_err(|e| {
anyhow::Error::new(kebab_app::StructuredError(kebab_app::ErrorV1 {
schema_version: kebab_app::ERROR_V1_ID.to_string(),
code: "config_invalid".to_string(),
message: format!("stdin ndjson line {} parse error: {e}", lineno + 1),
details: serde_json::Value::Null,
hint: Some(
"each line must be a JSON object with at least `query`".to_string(),
),
}))
})?;
raw_items.push(v);
}
let (items, summary) =
kebab_app::bulk_search_with_config(cfg, raw_items)?;
let (items, summary) = kebab_app::bulk_search_with_config(cfg, raw_items)?;
if cli.json {
let mut stdout = std::io::stdout().lock();
@@ -799,11 +814,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
if let Some(err) = &item.error {
writeln!(stdout, "error: {err}")?;
} else if let Some(resp) = &item.response {
writeln!(
stdout,
"{}",
serde_json::to_string_pretty(resp)?
)?;
writeln!(stdout, "{}", serde_json::to_string_pretty(resp)?)?;
}
writeln!(stdout)?;
}
@@ -819,6 +830,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
// p9-fb-42: bulk mode requires no query; single-query mode requires query.
let query_text = match query.as_ref() {
Some(q) if q.trim().is_empty() => {
return Err(anyhow::Error::new(kebab_app::StructuredError(
kebab_app::ErrorV1 {
schema_version: kebab_app::ERROR_V1_ID.to_string(),
code: "invalid_input".to_string(),
message: "query is empty; provide a non-empty search term or use --bulk".into(),
details: serde_json::Value::Null,
hint: Some("e.g. `kebab search 'rust async'` or `kebab search --bulk < queries.ndjson`".into()),
},
)));
}
Some(q) => q.clone(),
None => {
return Err(anyhow::anyhow!("query is required unless --bulk is set"));
@@ -832,8 +854,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
other => other.to_string(),
}
}
let media_norm: Vec<String> =
media.iter().map(|s| normalize_media_alias(s)).collect();
let media_norm: Vec<String> = media.iter().map(|s| normalize_media_alias(s)).collect();
// p9-fb-36: parse --ingested-after as RFC3339; structured error on failure.
let ingested_after_parsed: Option<time::OffsetDateTime> =
@@ -845,8 +866,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
) {
Ok(ts) => Some(ts),
Err(e) => {
return Err(anyhow::Error::new(
kebab_app::StructuredError(kebab_app::ErrorV1 {
return Err(anyhow::Error::new(kebab_app::StructuredError(
kebab_app::ErrorV1 {
schema_version: kebab_app::ERROR_V1_ID.to_string(),
code: "config_invalid".to_string(),
message: format!(
@@ -856,8 +877,8 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
hint: Some(
"expected format like 2026-04-01T00:00:00Z".to_string(),
),
}),
));
},
)));
}
}
}
@@ -932,11 +953,7 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
};
println!(
"{:>2}. {:.4} {}{}{}",
h.rank,
h.retrieval.fusion_score,
stale_tag,
h.doc_path.0,
heading,
h.rank, h.retrieval.fusion_score, stale_tag, h.doc_path.0, heading,
);
}
// p9-fb-34: truncation hint goes to stderr so it
@@ -958,15 +975,33 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
if let Some(t) = &resp.trace {
eprintln!();
eprintln!("Trace:");
eprintln!(" lexical ({} hits, {}ms):", t.lexical.len(), t.timing.lexical_ms);
eprintln!(
" lexical ({} hits, {}ms):",
t.lexical.len(),
t.timing.lexical_ms
);
for c in t.lexical.iter().take(3) {
eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
eprintln!(
" rank={} score={:.4} chunk={}",
c.rank, c.score, c.chunk_id.0
);
}
eprintln!(" vector ({} hits, {}ms):", t.vector.len(), t.timing.vector_ms);
eprintln!(
" vector ({} hits, {}ms):",
t.vector.len(),
t.timing.vector_ms
);
for c in t.vector.iter().take(3) {
eprintln!(" rank={} score={:.4} chunk={}", c.rank, c.score, c.chunk_id.0);
eprintln!(
" rank={} score={:.4} chunk={}",
c.rank, c.score, c.chunk_id.0
);
}
eprintln!(" fusion ({} inputs, {}ms)", t.rrf_inputs.len(), t.timing.fusion_ms);
eprintln!(
" fusion ({} inputs, {}ms)",
t.rrf_inputs.len(),
t.timing.fusion_ms
);
eprintln!(" total: {}ms", t.timing.total_ms);
}
}
@@ -988,6 +1023,17 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
multi_hop,
} => {
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
if query.trim().is_empty() {
return Err(anyhow::Error::new(kebab_app::StructuredError(
kebab_app::ErrorV1 {
schema_version: kebab_app::ERROR_V1_ID.to_string(),
code: "invalid_input".to_string(),
message: "query is empty; provide a non-empty prompt".into(),
details: serde_json::Value::Null,
hint: Some("e.g. `kebab ask \"explain this code\"`".into()),
},
)));
}
if *stream {
// p9-fb-33: streaming branch. Background thread runs
// ask_with_config (which calls into the rag pipeline);
@@ -1017,16 +1063,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
let cfg2 = cfg.clone();
let q = query.clone();
let session2 = session.clone();
let handle = std::thread::spawn(
move || -> anyhow::Result<kebab_core::Answer> {
match session2.as_deref() {
Some(sid) => kebab_app::ask_with_session_with_config(
cfg2, sid, &q, opts,
),
None => kebab_app::ask_with_config(cfg2, &q, opts),
}
},
);
let handle = std::thread::spawn(move || -> anyhow::Result<kebab_core::Answer> {
match session2.as_deref() {
Some(sid) => kebab_app::ask_with_session_with_config(cfg2, sid, &q, opts),
None => kebab_app::ask_with_config(cfg2, &q, opts),
}
});
// Drain receiver, write ndjson to stderr until
// completion or BrokenPipe.
@@ -1302,9 +1344,18 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
println!("{}", serde_json::to_string_pretty(&agg)?);
} else {
println!("run_id: {run_id}");
println!("queries: {} ({} failed)", agg.total_queries, agg.failed_queries);
println!("hit@1: {:.4}", agg.hit_at_k.get(&1).copied().unwrap_or(0.0));
println!("hit@5: {:.4}", agg.hit_at_k.get(&5).copied().unwrap_or(0.0));
println!(
"queries: {} ({} failed)",
agg.total_queries, agg.failed_queries
);
println!(
"hit@1: {:.4}",
agg.hit_at_k.get(&1).copied().unwrap_or(0.0)
);
println!(
"hit@5: {:.4}",
agg.hit_at_k.get(&5).copied().unwrap_or(0.0)
);
println!("MRR: {:.4}", agg.mrr);
}
Ok(())
@@ -1354,8 +1405,12 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
} else {
println!(
"ingest-file: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
report.scanned, report.new, report.updated,
report.unchanged, report.skipped, report.errors
report.scanned,
report.new,
report.updated,
report.unchanged,
report.skipped,
report.errors
);
}
Ok(())
@@ -1368,20 +1423,20 @@ fn run(cli: &Cli) -> anyhow::Result<()> {
.read_to_string(&mut body)
.context("kebab ingest-stdin: read stdin")?;
let cfg = kebab_config::Config::load(cli.config.as_deref())?;
let report = kebab_app::ingest_stdin_with_config(
cfg,
&body,
title,
source_uri.as_deref(),
)?;
let report =
kebab_app::ingest_stdin_with_config(cfg, &body, title, source_uri.as_deref())?;
if cli.json {
let v = wire::wire_ingest(&report);
println!("{}", serde_json::to_string(&v)?);
} else {
println!(
"ingest-stdin: scanned={} new={} updated={} unchanged={} skipped={} errors={}",
report.scanned, report.new, report.updated,
report.unchanged, report.skipped, report.errors
report.scanned,
report.new,
report.updated,
report.unchanged,
report.skipped,
report.errors
);
}
Ok(())
@@ -1410,10 +1465,7 @@ fn render_ask_plain_citations(
writeln!(w)?;
writeln!(w, "근거:")?;
for (idx, c) in ans.citations.iter().enumerate() {
let marker = c
.marker
.clone()
.unwrap_or_else(|| format!("{}", idx + 1));
let marker = c.marker.clone().unwrap_or_else(|| format!("{}", idx + 1));
// p9-fb-32: `[stale]` prefix on the URI for citations whose
// `stale: true`. Yellow on TTY, plain otherwise — mirrors the
// search-plain renderer in `Cmd::Search`.
@@ -1474,7 +1526,10 @@ fn print_schema_text(s: &kebab_app::SchemaV1) {
println!(" parser_version {}", s.models.parser_version);
println!(" chunker_version {}", s.models.chunker_version);
println!(" embedding_version {}", s.models.embedding_version);
println!(" prompt_template_version {}", s.models.prompt_template_version);
println!(
" prompt_template_version {}",
s.models.prompt_template_version
);
println!(" index_version {}", s.models.index_version);
println!(" corpus_revision {}", s.models.corpus_revision);
println!();
@@ -1523,9 +1578,7 @@ fn confirm_destructive(
/// Confirm prompt for `--orphans-only`: shows the orphan count + a
/// sample of up to 5 paths so the user knows what will be purged before
/// committing. No filesystem paths are removed — only store records.
fn confirm_orphans_only(
orphan_paths: &[kebab_core::WorkspacePath],
) -> anyhow::Result<bool> {
fn confirm_orphans_only(orphan_paths: &[kebab_core::WorkspacePath]) -> anyhow::Result<bool> {
use std::io::Write;
let n = orphan_paths.len();
let mut out = std::io::stderr().lock();
@@ -1538,11 +1591,7 @@ fn confirm_orphans_only(
return Ok(true);
}
let sample: Vec<&str> = orphan_paths
.iter()
.take(5)
.map(|p| p.0.as_str())
.collect();
let sample: Vec<&str> = orphan_paths.iter().take(5).map(|p| p.0.as_str()).collect();
let sample_str = sample.join(", ");
let ellipsis = if n > 5 { ", …" } else { "" };
@@ -1571,19 +1620,28 @@ fn render_fetch_plain(r: &kebab_core::FetchResult) {
if !r.context_before.is_empty() {
println!("\n=== before ===");
for c in &r.context_before {
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
let heading = c
.heading_path
.last()
.map_or("", std::string::String::as_str);
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
}
}
if let Some(c) = &r.chunk {
println!("\n=== target ===");
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
let heading = c
.heading_path
.last()
.map_or("", std::string::String::as_str);
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
}
if !r.context_after.is_empty() {
println!("\n=== after ===");
for c in &r.context_after {
let heading = c.heading_path.last().map_or("", std::string::String::as_str);
let heading = c
.heading_path
.last()
.map_or("", std::string::String::as_str);
println!("[{} § {}]\n{}\n", c.chunk_id.0, heading, c.text);
}
}
@@ -1615,8 +1673,8 @@ mod tests {
//! against a synthetic `Answer` instead.
use super::*;
use kebab_core::{
Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef,
PromptTemplateVersion, SearchMode, TokenUsage, TraceId, WorkspacePath,
Answer, AnswerCitation, AnswerRetrievalSummary, Citation, ModelRef, PromptTemplateVersion,
SearchMode, TokenUsage, TraceId, WorkspacePath,
};
use time::OffsetDateTime;
@@ -1712,4 +1770,3 @@ mod tests {
);
}
}

View File

@@ -124,11 +124,9 @@ impl ProgressDisplay {
bar.set_length(u64::from(*total));
bar.set_position(0);
bar.set_style(
ProgressStyle::with_template(
"ingest [{bar:30}] {pos}/{len} {wide_msg}",
)
.unwrap()
.progress_chars("=> "),
ProgressStyle::with_template("ingest [{bar:30}] {pos}/{len} {wide_msg}")
.unwrap()
.progress_chars("=> "),
);
bar.set_message("");
}
@@ -170,11 +168,7 @@ impl ProgressDisplay {
let _ = writeln!(
err,
"ingest: complete (scanned={} new={} updated={} skipped={} errors={})",
counts.scanned,
counts.new,
counts.updated,
counts.skipped,
counts.errors,
counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
);
}
}
@@ -193,14 +187,42 @@ impl ProgressDisplay {
let _ = writeln!(
err,
"ingest: aborted (scanned={} new={} updated={} skipped={} errors={})",
counts.scanned,
counts.new,
counts.updated,
counts.skipped,
counts.errors,
counts.scanned, counts.new, counts.updated, counts.skipped, counts.errors,
);
}
}
// v0.20.0 sub-item 1: per-page PDF OCR events — sub-progress lines
// under AssetStarted for scanned PDFs. spec §4.6.1 line 1085-1086.
// When skipped=true (DCTDecode absent or engine failure), a skip line is printed instead.
IngestEvent::PdfOcrStarted { page } => {
if !quiet {
let mut err = std::io::stderr().lock();
let _ = writeln!(err, " 📷 OCR page {page}...");
}
}
IngestEvent::PdfOcrFinished {
page,
ms,
chars,
ocr_engine,
skipped,
..
} => {
if !quiet {
let mut err = std::io::stderr().lock();
if *skipped {
let _ = writeln!(
err,
" ⊘ OCR page {page} skipped (no DCTDecode or engine fail, {ms}ms)"
);
} else {
let _ = writeln!(
err,
" ✓ OCR page {page} ({chars} chars, {ms}ms via {ocr_engine})"
);
}
}
}
}
Ok(())
}
@@ -231,7 +253,10 @@ mod tests {
#[test]
fn from_flags_json_takes_priority_over_tty() {
assert_eq!(ProgressMode::from_flags(true, false, false), ProgressMode::Json);
assert_eq!(
ProgressMode::from_flags(true, false, false),
ProgressMode::Json
);
}
#[test]

View File

@@ -114,10 +114,7 @@ pub fn wire_answer(a: &Answer) -> Value {
/// The timestamp is added at emit time (caller fills `ts`), since the
/// pipeline doesn't carry one in the in-process enum — mirrors the
/// `wire_ingest_progress` pattern (§2 ingest_progress.v1).
pub fn wire_answer_event(
ev: &kebab_app::StreamEvent,
ts: time::OffsetDateTime,
) -> Value {
pub fn wire_answer_event(ev: &kebab_app::StreamEvent, ts: time::OffsetDateTime) -> Value {
let mut v = serde_json::to_value(ev).expect("StreamEvent serializes");
let ts_str = ts
.format(&time::format_description::well_known::Rfc3339)
@@ -161,9 +158,7 @@ pub fn wire_reset(r: &kebab_app::ResetReport) -> Value {
/// wall-clock — the emit site is the only place that knows the moment
/// of emission, so the timestamp is stamped here rather than carried
/// on the event itself.
pub fn wire_ingest_progress(
event: &kebab_app::IngestEvent,
) -> anyhow::Result<Value> {
pub fn wire_ingest_progress(event: &kebab_app::IngestEvent) -> anyhow::Result<Value> {
let mut v = serde_json::to_value(event)?;
if let Value::Object(ref mut map) = v {
map.insert(
@@ -305,15 +300,15 @@ mod tests {
let v = wire_search_response(&r);
assert_eq!(schema_of(&v), Some("search_response.v1"));
assert!(v.get("hits").and_then(|h| h.as_array()).is_some());
assert_eq!(
v.get("hits").and_then(|h| h.as_array()).unwrap().len(),
0
);
assert_eq!(v.get("hits").and_then(|h| h.as_array()).unwrap().len(), 0);
assert_eq!(
v.get("next_cursor").and_then(|c| c.as_str()),
Some("opaque-cursor-abc")
);
assert_eq!(v.get("truncated").and_then(serde_json::Value::as_bool), Some(true));
assert_eq!(
v.get("truncated").and_then(serde_json::Value::as_bool),
Some(true)
);
}
#[test]
@@ -322,23 +317,36 @@ mod tests {
let schema = SchemaV1 {
schema_version: "schema.v1".to_string(),
kebab_version: "0.2.1".to_string(),
wire: WireBlock { schemas: vec!["answer.v1".to_string()] },
wire: WireBlock {
schemas: vec!["answer.v1".to_string()],
},
capabilities: Capabilities {
json_mode: true, ingest_progress: true, ingest_cancellation: true,
rag_multi_turn: true, search_cache: true, incremental_ingest: true,
streaming_ask: false, http_daemon: false, mcp_server: false,
single_file_ingest: false, bulk_search: true,
json_mode: true,
ingest_progress: true,
ingest_cancellation: true,
rag_multi_turn: true,
search_cache: true,
incremental_ingest: true,
streaming_ask: false,
http_daemon: false,
mcp_server: false,
single_file_ingest: false,
bulk_search: true,
},
models: Models {
parser_version: "x".to_string(),
chunker_version: "y".to_string(),
active_parsers: vec![],
active_chunkers: vec![],
embedding_version: "z".to_string(),
prompt_template_version: "w".to_string(),
index_version: "v".to_string(),
corpus_revision: 7,
},
stats: Stats {
doc_count: 1, chunk_count: 2, asset_count: 1,
doc_count: 1,
chunk_count: 2,
asset_count: 1,
last_ingest_at: None,
media_breakdown: Default::default(),
lang_breakdown: Default::default(),
@@ -350,7 +358,10 @@ mod tests {
};
let v = wire_schema(&schema);
assert_eq!(schema_of(&v), Some("schema.v1"));
assert_eq!(v.get("kebab_version").and_then(Value::as_str), Some("0.2.1"));
assert_eq!(
v.get("kebab_version").and_then(Value::as_str),
Some("0.2.1")
);
}
#[test]
@@ -365,7 +376,10 @@ mod tests {
};
let v = wire_error_v1(&err);
assert_eq!(schema_of(&v), Some("error.v1"));
assert_eq!(v.get("code").and_then(Value::as_str), Some("config_invalid"));
assert_eq!(
v.get("code").and_then(Value::as_str),
Some("config_invalid")
);
}
#[test]
@@ -391,8 +405,10 @@ mod tests {
#[test]
fn search_response_with_trace_serializes_trace_field() {
use kebab_core::{SearchTrace, TraceCandidate, TraceFusionInput,
TraceTiming, ChunkId, DocumentId, WorkspacePath};
use kebab_core::{
ChunkId, DocumentId, SearchTrace, TraceCandidate, TraceFusionInput, TraceTiming,
WorkspacePath,
};
let r = kebab_app::SearchResponse {
hits: vec![],
next_cursor: None,
@@ -412,7 +428,12 @@ mod tests {
vector_rank: None,
fusion_score: 0.0,
}],
timing: TraceTiming { lexical_ms: 5, vector_ms: 0, fusion_ms: 1, total_ms: 7 },
timing: TraceTiming {
lexical_ms: 5,
vector_ms: 0,
fusion_ms: 1,
total_ms: 7,
},
}),
hint: None,
};

View File

@@ -0,0 +1,64 @@
//! Integration tests for Bug #10: explicit --config <path> that does not exist
//! must fail with exit≠0 and error.v1 code=config_not_found (not silently fall
//! back to XDG defaults).
use serde_json::Value;
use std::process::Command;
/// Returns the path of the `kebab` executable under test as an owned string.
///
/// The path is read at compile time from the `CARGO_BIN_EXE_kebab`
/// environment variable via `env!`.
fn kebab_bin() -> String {
    String::from(env!("CARGO_BIN_EXE_kebab"))
}
/// Parses the last line of `stderr` as JSON and returns the parsed value.
///
/// # Panics
/// Panics when `stderr` contains no lines, or when its last line is not
/// valid JSON (the message then carries the parse error and the whole
/// `stderr` text).
fn parse_error_v1(stderr: &str) -> Value {
    let tail = match stderr.lines().last() {
        Some(line) => line,
        None => panic!("expected error.v1 ndjson on stderr"),
    };
    match serde_json::from_str(tail) {
        Ok(parsed) => parsed,
        Err(e) => panic!("expected ndjson on stderr: {e}\nstderr={stderr}"),
    }
}
/// Bug #10, absolute path: `--config` naming a file that does not exist must
/// make `kebab search --json` exit nonzero and print an `error.v1` line with
/// code `config_not_found` and a string `hint` on stderr.
#[test]
fn invalid_config_path_emits_error_v1_with_nonzero_exit() {
    let missing_cfg = "/tmp/__kebab_bugfix3_absolute_nonexistent.toml";
    // Precondition: the path must really be absent for the test to mean anything.
    assert!(!std::path::Path::new(missing_cfg).exists());

    let mut cmd = Command::new(kebab_bin());
    cmd.arg("search")
        .arg("rust")
        .arg("--config")
        .arg(missing_cfg)
        .arg("--json");
    let output = cmd.output().expect("spawn kebab");

    assert_ne!(
        output.status.code(),
        Some(0),
        "exit must be nonzero on missing --config"
    );

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    let err = parse_error_v1(&stderr_text);
    assert_eq!(err["schema_version"], "error.v1");
    assert_eq!(err["code"], "config_not_found");
    assert!(err["hint"].is_string(), "hint must be present");
}
/// Bug #10 spec §6 R-1: a relative `--config` path that does not exist must
/// also exit nonzero and report `config_not_found` as an `error.v1` line on
/// stderr.
#[test]
fn invalid_relative_config_path_emits_config_not_found() {
    // Run from a freshly created temp directory so `nonexistent-rel.toml`
    // cannot be present relative to the child's working directory.
    let tmp = tempfile::tempdir().unwrap();
    let out = Command::new(kebab_bin())
        .args([
            "search",
            "rust",
            "--config",
            "nonexistent-rel.toml",
            "--json",
        ])
        .current_dir(tmp.path())
        .output()
        .expect("spawn kebab");

    // Decode stderr first so every assertion below can include it on failure.
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert_ne!(
        out.status.code(),
        Some(0),
        "exit must be nonzero on missing relative --config; stderr={stderr}"
    );
    let v = parse_error_v1(&stderr);
    assert_eq!(v["schema_version"], "error.v1", "stderr={stderr}");
    assert_eq!(v["code"], "config_not_found", "stderr={stderr}");
}

View File

@@ -0,0 +1,50 @@
//! Integration tests for Bug #14: empty or whitespace-only query must emit
//! error.v1 code=invalid_input and exit nonzero (not silent 0-hit return).
use serde_json::Value;
use std::process::Command;
/// Path of the `kebab` binary these tests spawn, read at compile time from
/// the `CARGO_BIN_EXE_kebab` environment variable.
fn kebab_bin() -> String {
    let exe: &str = env!("CARGO_BIN_EXE_kebab");
    exe.to_owned()
}
/// Decodes the final line of `stderr` as a JSON value.
///
/// # Panics
/// Panics if `stderr` has no lines at all, or if the final line fails to
/// parse as JSON; the latter message includes the parse error and the full
/// `stderr` contents.
fn parse_error_v1(stderr: &str) -> Value {
    let final_line = match stderr.lines().last() {
        Some(line) => line,
        None => panic!("expected error.v1 ndjson on stderr"),
    };
    let parsed: Result<Value, _> = serde_json::from_str(final_line);
    match parsed {
        Ok(value) => value,
        Err(e) => panic!("expected ndjson on stderr: {e}\nstderr={stderr}"),
    }
}
/// Bug #14: `kebab search` with an empty or whitespace-only query must exit
/// nonzero and print an `error.v1` line with code `invalid_input` on stderr.
#[test]
fn search_empty_query_emits_invalid_input() {
    for q in ["", " "] {
        let mut cmd = Command::new(kebab_bin());
        cmd.arg("search").arg(q).arg("--json");
        let output = cmd.output().expect("spawn kebab");

        assert_ne!(
            output.status.code(),
            Some(0),
            "empty/whitespace query must fail (q={q:?})"
        );

        let stderr = String::from_utf8_lossy(&output.stderr);
        let err = parse_error_v1(&stderr);
        assert_eq!(err["schema_version"], "error.v1", "stderr={stderr}");
        assert_eq!(err["code"], "invalid_input", "stderr={stderr}");
    }
}
/// Bug #14: `kebab ask` with an empty query must exit nonzero and print an
/// `error.v1` line with code `invalid_input` on stderr.
#[test]
fn ask_empty_query_emits_invalid_input() {
    let out = Command::new(kebab_bin())
        .args(["ask", "", "--json"])
        .output()
        .expect("spawn kebab");

    // Decode stderr up front so each assertion can show it on failure,
    // matching the diagnostics of the `search` test in this file.
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert_ne!(
        out.status.code(),
        Some(0),
        "empty query must fail for `ask`; stderr={stderr}"
    );
    let v = parse_error_v1(&stderr);
    assert_eq!(v["schema_version"], "error.v1", "stderr={stderr}");
    assert_eq!(v["code"], "invalid_input", "stderr={stderr}");
}

View File

@@ -2,11 +2,10 @@
//! on stderr while non-json mode emits the legacy `error:` text prefix.
//!
//! The `config_invalid` code is triggered by supplying an *existing* but
//! malformed TOML file via `--config`. Note: supplying a *non-existent*
//! path does NOT trigger this error — Config::load silently falls back to
//! defaults when the specified config file is absent (by design, so that
//! `kebab doctor` runs before `kebab init` is ever called). A file that
//! exists but fails TOML parsing is the reliable path to `config_invalid`.
//! malformed TOML file via `--config`. A file that exists but fails TOML
//! parsing is the reliable path to `config_invalid`. Supplying a path that
//! does not exist emits `config_not_found` instead (Bug #10 fix, v0.20.0
//! bugfix3); see `cli_config_not_found.rs` for those tests.
use std::process::Command;
@@ -37,12 +36,7 @@ fn json_mode_emits_error_v1_on_config_invalid() {
std::fs::write(&bad_config, b"this is not { valid toml !!!").unwrap();
let mut cmd = Command::new(kebab_bin());
cmd.args([
"--json",
"--config",
bad_config.to_str().unwrap(),
"ingest",
]);
cmd.args(["--json", "--config", bad_config.to_str().unwrap(), "ingest"]);
for (k, v) in xdg_envs(tmp.path()) {
cmd.env(k, v);
}
@@ -56,7 +50,10 @@ fn json_mode_emits_error_v1_on_config_invalid() {
assert_eq!(exit_code, 2, "expected exit code 2, got {exit_code}");
let stderr = String::from_utf8(out.stderr).unwrap();
let first_line = stderr.lines().next().expect("stderr must have at least one line");
let first_line = stderr
.lines()
.next()
.expect("stderr must have at least one line");
let v: serde_json::Value =
serde_json::from_str(first_line).expect("stderr first line must be valid JSON");

View File

@@ -0,0 +1,17 @@
// crates/kebab-cli/tests/cli_help_smoke.rs
//
// Regression pin — the `--media` value list shown by `kebab search --help`
// must expose `code`. Bug #7 (v0.20.0 bugfix round 2 spec §4.4).
/// Regression pin for Bug #7: the output of `kebab search --help` must
/// contain the literal `` `code` `` (backtick-quoted), i.e. `code` is listed
/// among the accepted `--media` values.
#[test]
fn search_help_lists_code_in_media_values() {
    let out = std::process::Command::new(env!("CARGO_BIN_EXE_kebab"))
        .args(["search", "--help"])
        .output()
        .expect("kebab search --help");
    let stdout = String::from_utf8_lossy(&out.stdout);
    // Include exit status and stderr in the failure message: with stdout
    // alone, a failed invocation (empty stdout) looks the same as help text
    // that merely lacks the `code` entry.
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert!(
        stdout.contains("`code`"),
        "search --help must list 'code' as accepted --media value; stdout = {stdout}\nstatus = {:?}\nstderr = {stderr}",
        out.status.code()
    );
}

View File

@@ -72,21 +72,34 @@ max_context_tokens = 8000
workspace = workspace.display(),
data = data.display(),
),
).unwrap();
)
.unwrap();
let src = dir.path().join("doc.md");
fs::write(&src, "# A\n\nbody.").unwrap();
let bin = env!("CARGO_BIN_EXE_kebab");
let out = Command::new(bin)
.args(["--json", "--config", cfg_path.to_str().unwrap(), "ingest-file"])
.args([
"--json",
"--config",
cfg_path.to_str().unwrap(),
"ingest-file",
])
.arg(&src)
.output()
.unwrap();
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
assert!(
out.status.success(),
"stderr: {}",
String::from_utf8_lossy(&out.stderr)
);
let stdout = String::from_utf8_lossy(&out.stdout);
let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
assert_eq!(
v.get("schema_version").and_then(|s| s.as_str()),
Some("ingest_report.v1")
);
assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
}

View File

@@ -73,13 +73,18 @@ max_context_tokens = 8000
workspace = workspace.display(),
data = data.display(),
),
).unwrap();
)
.unwrap();
let bin = env!("CARGO_BIN_EXE_kebab");
let mut child = Command::new(bin)
.args([
"--json", "--config", cfg_path.to_str().unwrap(),
"ingest-stdin", "--title", "X",
"--json",
"--config",
cfg_path.to_str().unwrap(),
"ingest-stdin",
"--title",
"X",
])
.stdin(Stdio::piped())
.stdout(Stdio::piped())
@@ -91,10 +96,17 @@ max_context_tokens = 8000
stdin.write_all(b"## Body\n\nbody text.\n").unwrap();
}
let out = child.wait_with_output().unwrap();
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
assert!(
out.status.success(),
"stderr: {}",
String::from_utf8_lossy(&out.stderr)
);
let stdout = String::from_utf8_lossy(&out.stdout);
let v: serde_json::Value = serde_json::from_str(stdout.trim()).unwrap();
assert_eq!(v.get("schema_version").and_then(|s| s.as_str()), Some("ingest_report.v1"));
assert_eq!(
v.get("schema_version").and_then(|s| s.as_str()),
Some("ingest_report.v1")
);
assert_eq!(v.get("new").and_then(serde_json::Value::as_u64), Some(1));
}

View File

@@ -112,7 +112,13 @@ fn kebab_readonly_env_blocks_ingest() {
fn readonly_json_mode_emits_error_v1() {
let (tmp, ws) = fixture_workspace();
let out = Command::new(kebab_bin())
.args(["--readonly", "--json", "ingest", "--root", ws.to_str().unwrap()])
.args([
"--readonly",
"--json",
"ingest",
"--root",
ws.to_str().unwrap(),
])
.envs(xdg_envs(tmp.path()))
.output()
.unwrap();
@@ -164,12 +170,22 @@ fn quiet_flag_suppresses_progress_stderr() {
fn quiet_with_json_stdout_has_report_stderr_is_empty() {
let (tmp, ws) = fixture_workspace();
let out = Command::new(kebab_bin())
.args(["--quiet", "--json", "ingest", "--root", ws.to_str().unwrap()])
.args([
"--quiet",
"--json",
"ingest",
"--root",
ws.to_str().unwrap(),
])
.envs(xdg_envs(tmp.path()))
.output()
.unwrap();
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
assert!(
out.status.success(),
"stderr: {}",
String::from_utf8_lossy(&out.stderr)
);
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(stderr.is_empty(), "expected empty stderr, got: {stderr}");
let stdout = String::from_utf8_lossy(&out.stdout);

View File

@@ -90,12 +90,7 @@ fn ingest_human_non_tty_emits_progress_lines_to_stderr() {
// target is `hidden` and progress lines go to stderr instead.
let (tmp, ws) = fixture_workspace();
let mut cmd = Command::new(kebab_bin());
cmd.args([
"ingest",
"--root",
ws.to_str().unwrap(),
"--summary-only",
]);
cmd.args(["ingest", "--root", ws.to_str().unwrap(), "--summary-only"]);
for (k, v) in xdg_envs(tmp.path()) {
cmd.env(k, v);
}
@@ -155,8 +150,14 @@ fn ingest_json_progress_lines_carry_kind_and_ts() {
saw_completed = true;
// Counts mirror the report.
let counts = v.get("counts").unwrap();
assert_eq!(counts.get("scanned").and_then(serde_json::Value::as_u64), Some(2));
assert_eq!(counts.get("new").and_then(serde_json::Value::as_u64), Some(2));
assert_eq!(
counts.get("scanned").and_then(serde_json::Value::as_u64),
Some(2)
);
assert_eq!(
counts.get("new").and_then(serde_json::Value::as_u64),
Some(2)
);
}
}
assert!(saw_scan_started, "missing scan_started event");

View File

@@ -50,9 +50,18 @@ fn reset_data_only_yes_removes_data_dir_and_keeps_config() {
);
assert!(!xdg_data.join("kebab").exists(), "data dir should be gone");
assert!(!xdg_cache.join("kebab").exists(), "cache dir should be gone");
assert!(!xdg_state.join("kebab").exists(), "state dir should be gone");
assert!(xdg_cfg.join("kebab/marker").exists(), "config dir preserved");
assert!(
!xdg_cache.join("kebab").exists(),
"cache dir should be gone"
);
assert!(
!xdg_state.join("kebab").exists(),
"state dir should be gone"
);
assert!(
xdg_cfg.join("kebab/marker").exists(),
"config dir preserved"
);
}
#[test]
@@ -101,7 +110,11 @@ fn reset_data_only_yes_json_emits_reset_report_v1() {
.env("XDG_STATE_HOME", tmp.path().join("state"))
.output()
.unwrap();
assert!(out.status.success(), "stderr: {}", String::from_utf8_lossy(&out.stderr));
assert!(
out.status.success(),
"stderr: {}",
String::from_utf8_lossy(&out.stderr)
);
let v: serde_json::Value = serde_json::from_slice(&out.stdout).unwrap();
assert_eq!(

View File

@@ -32,10 +32,9 @@ fn schema_path(name: &str) -> PathBuf {
}
fn parse_schema(name: &str) -> serde_json::Value {
let text = std::fs::read_to_string(schema_path(name))
.unwrap_or_else(|e| panic!("read {name}: {e}"));
serde_json::from_str(&text)
.unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
let text =
std::fs::read_to_string(schema_path(name)).unwrap_or_else(|e| panic!("read {name}: {e}"));
serde_json::from_str(&text).unwrap_or_else(|e| panic!("{name} must parse as valid JSON: {e}"))
}
#[test]

View File

@@ -41,8 +41,7 @@ fn relax_score_gate(cfg: &Path) {
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
fn stream_emits_ndjson_events_on_stderr() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) =
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
relax_score_gate(&cfg);
fs::write(
workspace.join("a.md"),
@@ -93,12 +92,8 @@ fn stream_emits_ndjson_events_on_stderr() {
// stdout: last line is answer.v1 (backwards compat with the
// non-streaming path — same wire shape, just emitted after the
// ndjson event stream rather than instead of it).
let final_line = stdout
.lines()
.last()
.expect("stdout has at least one line");
let answer: Value =
serde_json::from_str(final_line).expect("stdout final line = answer.v1");
let final_line = stdout.lines().last().expect("stdout has at least one line");
let answer: Value = serde_json::from_str(final_line).expect("stdout final line = answer.v1");
assert_eq!(answer["schema_version"], "answer.v1");
}
@@ -109,8 +104,7 @@ fn non_stream_path_unchanged() {
// emits a single `answer.v1` line on stdout — fb-33 must not
// perturb the existing wire surface.
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) =
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
relax_score_gate(&cfg);
fs::write(
workspace.join("a.md"),
@@ -140,8 +134,7 @@ fn stream_cancels_when_stderr_closes() {
use std::process::{Command, Stdio};
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) =
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
relax_score_gate(&cfg);
fs::write(
workspace.join("a.md"),
@@ -198,15 +191,10 @@ fn stream_cancels_when_stderr_closes() {
#[ignore = "requires real Ollama on 127.0.0.1:11434"]
fn stream_score_gate_refusal_emits_only_retrieval_done() {
let dir = tempfile::tempdir().unwrap();
let (cfg, workspace, _data) =
common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
let (cfg, workspace, _data) = common::write_config_with_llm_model(dir.path(), 30, "gemma4:e4b");
// Intentionally NO relax_score_gate — keep the default 0.30
// so the thin-doc + unrelated-query combo trips refusal.
fs::write(
workspace.join("a.md"),
"# Title\n\nrust is a language.\n",
)
.unwrap();
fs::write(workspace.join("a.md"), "# Title\n\nrust is a language.\n").unwrap();
common::ingest(&cfg, &workspace);
let (stdout, stderr) =
@@ -230,12 +218,8 @@ fn stream_score_gate_refusal_emits_only_retrieval_done() {
);
// Stdout still has answer.v1 with grounded=false.
let final_line = stdout
.lines()
.last()
.expect("stdout has at least one line");
let answer: Value =
serde_json::from_str(final_line).expect("answer.v1");
let final_line = stdout.lines().last().expect("stdout has at least one line");
let answer: Value = serde_json::from_str(final_line).expect("answer.v1");
assert_eq!(answer["schema_version"], "answer.v1");
assert_eq!(answer["grounded"], false);
}

View File

@@ -21,7 +21,11 @@ fn cargo_bin() -> &'static str {
env!("CARGO_BIN_EXE_kebab")
}
fn run_bulk_with_stdin(cfg: &std::path::Path, stdin_body: &str, json: bool) -> std::process::Output {
fn run_bulk_with_stdin(
cfg: &std::path::Path,
stdin_body: &str,
json: bool,
) -> std::process::Output {
let mut cmd = Command::new(cargo_bin());
cmd.arg("--config").arg(cfg).arg("search").arg("--bulk");
if json {
@@ -94,7 +98,10 @@ fn empty_stdin_returns_empty_results_with_zero_summary() {
let out = run_bulk_with_stdin(&cfg, "", true);
assert!(out.status.success());
let stdout = String::from_utf8_lossy(&out.stdout);
assert!(stdout.trim().is_empty(), "expected empty stdout, got: {stdout}");
assert!(
stdout.trim().is_empty(),
"expected empty stdout, got: {stdout}"
);
let stderr = String::from_utf8_lossy(&out.stderr);
assert!(stderr.contains("bulk_summary: total=0 succeeded=0 failed=0"));
}

View File

@@ -19,7 +19,10 @@ fn line_variant_serialization_unchanged() {
assert_eq!(v["end"], 2);
assert_eq!(v["section"], "§14");
// Must not bleed Code-variant keys.
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
assert!(
v.get("line_start").is_none(),
"line_start must be absent: {v}"
);
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
assert!(v.get("code").is_none(), "code must be absent: {v}");
}
@@ -48,7 +51,10 @@ fn page_variant_serialization_unchanged() {
let v = serde_json::to_value(&c).unwrap();
assert_eq!(v["kind"], "page");
assert_eq!(v["page"], 13);
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
assert!(
v.get("line_start").is_none(),
"line_start must be absent: {v}"
);
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
}
@@ -67,7 +73,10 @@ fn region_variant_serialization_unchanged() {
assert_eq!(v["y"], 20);
assert_eq!(v["w"], 100);
assert_eq!(v["h"], 200);
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
assert!(
v.get("line_start").is_none(),
"line_start must be absent: {v}"
);
}
#[test]
@@ -79,7 +88,10 @@ fn caption_variant_serialization_unchanged() {
let v = serde_json::to_value(&c).unwrap();
assert_eq!(v["kind"], "caption");
assert_eq!(v["model"], "qwen2.5-vl:7b");
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
assert!(
v.get("line_start").is_none(),
"line_start must be absent: {v}"
);
}
#[test]
@@ -95,6 +107,9 @@ fn time_variant_serialization_unchanged() {
assert_eq!(v["start_ms"], 1000);
assert_eq!(v["end_ms"], 5000);
assert_eq!(v["speaker"], "Alice");
assert!(v.get("line_start").is_none(), "line_start must be absent: {v}");
assert!(
v.get("line_start").is_none(),
"line_start must be absent: {v}"
);
assert!(v.get("symbol").is_none(), "symbol must be absent: {v}");
}

View File

@@ -24,10 +24,8 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
common::ingest(&cfg, &workspace);
// Find chunk_id via search.
let (search_stdout, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "--k", "1", "apples"],
);
let (search_stdout, _) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "apples"]);
let search: Value = serde_json::from_str(search_stdout.trim())
.unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
let chunk_id = search["hits"][0]["chunk_id"]
@@ -35,10 +33,7 @@ fn fetch_chunk_json_emits_fetch_result_v1() {
.expect("chunk_id on first hit")
.to_string();
let (stdout, _) = common::run_fetch_with_args(
&cfg,
&["--json", "chunk", &chunk_id],
);
let (stdout, _) = common::run_fetch_with_args(&cfg, &["--json", "chunk", &chunk_id]);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
assert_eq!(v["schema_version"], "fetch_result.v1");
@@ -59,10 +54,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
common::ingest(&cfg, &workspace);
// Find doc_id via search.
let (search_stdout, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "--k", "1", "Lorem"],
);
let (search_stdout, _) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "1", "Lorem"]);
let search: Value = serde_json::from_str(search_stdout.trim())
.unwrap_or_else(|e| panic!("search not JSON: {search_stdout:?}: {e}"));
let doc_id = search["hits"][0]["doc_id"]
@@ -70,10 +63,8 @@ fn fetch_doc_json_with_max_tokens_truncates() {
.expect("doc_id on first hit")
.to_string();
let (stdout, _) = common::run_fetch_with_args(
&cfg,
&["--json", "doc", &doc_id, "--max-tokens", "20"],
);
let (stdout, _) =
common::run_fetch_with_args(&cfg, &["--json", "doc", &doc_id, "--max-tokens", "20"]);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("fetch not JSON: {stdout:?}: {e}"));
assert_eq!(v["kind"], "doc");

View File

@@ -32,12 +32,9 @@ fn search_with_doc_id_filter_returns_only_target_doc() {
common::ingest(&cfg, &workspace);
// First, search without a doc-id filter to find what doc_ids exist.
let (stdout, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "rust"],
);
let resp: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let (stdout, _) = common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
let resp: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let hits = resp["hits"].as_array().expect("hits array");
assert!(
hits.len() >= 2,
@@ -147,15 +144,19 @@ fn search_with_media_filter_md_alias_normalizes_to_markdown() {
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
// Only a markdown file — the `md` alias should match it.
fs::write(workspace.join("notes.md"), "# Notes\n\nrust async programming\n").unwrap();
fs::write(
workspace.join("notes.md"),
"# Notes\n\nrust async programming\n",
)
.unwrap();
common::ingest(&cfg, &workspace);
let (stdout, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "--media", "md", "rust"],
);
let resp: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let resp: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let hits = resp["hits"].as_array().expect("hits array");
assert!(
@@ -189,10 +190,8 @@ fn search_with_tag_filter_matches_frontmatter_tags() {
common::ingest(&cfg, &workspace);
// Without filter — both docs must produce hits.
let (unfiltered, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "rust"],
);
let (unfiltered, _) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
let uresp: Value = serde_json::from_str(unfiltered.trim())
.unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
@@ -254,10 +253,8 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
common::ingest(&cfg, &workspace);
// Without filter: all three docs produce hits.
let (unfiltered, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "rust"],
);
let (unfiltered, _) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "rust"]);
let uresp: Value = serde_json::from_str(unfiltered.trim())
.unwrap_or_else(|e| panic!("not JSON (unfiltered): {unfiltered:?}: {e}"));
let uhits = uresp["hits"].as_array().expect("unfiltered hits array");
@@ -270,10 +267,7 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
let (filtered, _) = common::run_search_with_args(
&cfg,
&[
"--json", "--mode", "lexical",
"--tag", "rust",
"--tag", "async",
"rust",
"--json", "--mode", "lexical", "--tag", "rust", "--tag", "async", "rust",
],
);
let fresp: Value = serde_json::from_str(filtered.trim())
@@ -301,6 +295,12 @@ fn search_with_two_tag_filters_returns_or_within_tags() {
.collect();
let has_a = paths.iter().any(|p| p.ends_with("a.md"));
let has_b = paths.iter().any(|p| p.ends_with("b.md"));
assert!(has_a, "--tag rust must include a.md (rust-tagged): paths={paths:?}");
assert!(has_b, "--tag async must include b.md (async-tagged): paths={paths:?}");
assert!(
has_a,
"--tag rust must include a.md (rust-tagged): paths={paths:?}"
);
assert!(
has_b,
"--tag async must include b.md (async-tagged): paths={paths:?}"
);
}

View File

@@ -5,7 +5,7 @@
//! inject spurious keys into the existing markdown corpus wire shape.
use kebab_core::{
Citation, ChunkId, ChunkerVersion, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
ChunkId, ChunkerVersion, Citation, DocumentId, IndexVersion, RetrievalDetail, ScoreKind,
SearchHit, WorkspacePath,
};

View File

@@ -23,12 +23,10 @@ fn search_json_emits_search_response_v1_wrapper() {
fs::write(workspace.join("a.md"), "# T\n\napples are red.\n").unwrap();
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "apples"],
);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let (stdout, _stderr) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "apples"]);
let v: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
assert_eq!(v["schema_version"], "search_response.v1");
assert!(v["hits"].is_array(), "hits must be array, got {v}");
assert!(
@@ -67,8 +65,8 @@ fn search_json_truncates_with_max_tokens() {
&cfg,
&["--json", "--mode", "lexical", "--max-tokens", "30", "rust"],
);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let v: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
assert_eq!(
v["truncated"], true,
"30-token cap must trip truncation: {v}"
@@ -88,10 +86,8 @@ fn search_json_cursor_paginates() {
}
common::ingest(&cfg, &workspace);
let (page1, _) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "--k", "2", "rust"],
);
let (page1, _) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "--k", "2", "rust"]);
let v1: Value = serde_json::from_str(page1.trim())
.unwrap_or_else(|e| panic!("page1 not JSON: {page1:?}: {e}"));
let cursor = v1["next_cursor"]
@@ -101,14 +97,7 @@ fn search_json_cursor_paginates() {
let (page2, _) = common::run_search_with_args(
&cfg,
&[
"--json",
"--mode",
"lexical",
"--k",
"2",
"--cursor",
cursor,
"rust",
"--json", "--mode", "lexical", "--k", "2", "--cursor", cursor, "rust",
],
);
let v2: Value = serde_json::from_str(page2.trim())
@@ -118,23 +107,13 @@ fn search_json_cursor_paginates() {
.as_array()
.expect("page1 hits array")
.iter()
.map(|h| {
h["chunk_id"]
.as_str()
.expect("chunk_id string")
.to_string()
})
.map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
.collect();
let p2_ids: Vec<String> = v2["hits"]
.as_array()
.expect("page2 hits array")
.iter()
.map(|h| {
h["chunk_id"]
.as_str()
.expect("chunk_id string")
.to_string()
})
.map(|h| h["chunk_id"].as_str().expect("chunk_id string").to_string())
.collect();
assert!(
!p2_ids.is_empty(),
@@ -161,10 +140,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
common::ingest(&cfg, &workspace);
// Get a valid cursor first.
let (page1_stdout, _) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--json", "--k", "1", "apples"],
);
let (page1_stdout, _) =
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "--k", "1", "apples"]);
let v1: Value = serde_json::from_str(page1_stdout.trim()).expect("json");
let cursor = v1["next_cursor"]
.as_str()
@@ -181,16 +158,8 @@ fn search_stale_cursor_returns_error_v1_with_stale_cursor_code() {
let cfg_str = cfg.to_str().expect("utf8");
let out = std::process::Command::new(exe)
.args([
"--config",
cfg_str,
"--json",
"search",
"--mode",
"lexical",
"--json",
"--cursor",
&cursor,
"apples",
"--config", cfg_str, "--json", "search", "--mode", "lexical", "--json", "--cursor",
&cursor, "apples",
])
.output()
.expect("kebab search --cursor");
@@ -234,10 +203,8 @@ fn search_plain_emits_truncated_hint_to_stderr() {
}
common::ingest(&cfg, &workspace);
let (_stdout, stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--max-tokens", "30", "rust"],
);
let (_stdout, stderr) =
common::run_search_with_args(&cfg, &["--mode", "lexical", "--max-tokens", "30", "rust"]);
assert!(
stderr.contains("[truncated;"),
"stderr must carry truncated hint: {stderr:?}"
@@ -254,10 +221,7 @@ fn search_plain_emits_short_query_hint_to_stderr() {
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
common::ingest(&cfg, &workspace);
let (_stdout, stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "ab"],
);
let (_stdout, stderr) = common::run_search_with_args(&cfg, &["--mode", "lexical", "ab"]);
assert!(
stderr.contains("[hint]"),
"stderr must carry short-query hint: {stderr:?}"
@@ -278,18 +242,18 @@ fn search_json_emits_hint_field_for_short_query() {
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "ab"],
);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let (stdout, _stderr) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "ab"]);
let v: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
assert!(
v["hits"].as_array().unwrap().is_empty(),
"empty hits expected for short query in empty KB: {v}"
);
assert_eq!(
v["hint"].as_str().expect("hint field set on short empty result"),
v["hint"]
.as_str()
.expect("hint field set on short empty result"),
"3자 이상 키워드 권장 (trigram tokenizer 제약)",
"hint must carry the standard advisory: {v}"
);
@@ -305,12 +269,10 @@ fn search_json_omits_hint_field_when_query_is_long_enough() {
let (cfg, workspace, _data) = common::write_config(dir.path(), 30);
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--json", "--mode", "lexical", "abc"],
);
let v: Value = serde_json::from_str(stdout.trim())
.unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
let (stdout, _stderr) =
common::run_search_with_args(&cfg, &["--json", "--mode", "lexical", "abc"]);
let v: Value =
serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("not JSON: {stdout:?}: {e}"));
assert!(
v.get("hint").is_none(),
"hint must be absent for ≥3-char queries: {v}"

View File

@@ -16,10 +16,8 @@ fn lexical_mode_hits_carry_bm25_score_kind() {
doc_with_term(&workspace);
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--json", "rust"],
);
let (stdout, _stderr) =
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
let v: Value = serde_json::from_str(stdout.trim()).expect("valid JSON");
let hits = v["hits"].as_array().expect("hits array");
assert!(!hits.is_empty(), "expected at least 1 hit");
@@ -40,10 +38,8 @@ fn old_wire_reader_compat_score_kind_optional_field() {
doc_with_term(&workspace);
common::ingest(&cfg, &workspace);
let (stdout, _stderr) = common::run_search_with_args(
&cfg,
&["--mode", "lexical", "--json", "rust"],
);
let (stdout, _stderr) =
common::run_search_with_args(&cfg, &["--mode", "lexical", "--json", "rust"]);
let v: Value = serde_json::from_str(stdout.trim()).unwrap();
let hit = &v["hits"][0];
assert!(hit.get("score_kind").is_some(), "score_kind always emitted");

View File

@@ -59,15 +59,14 @@ fn search_json_includes_indexed_at_and_stale() {
.get("hits")
.and_then(|h| h.as_array())
.unwrap_or_else(|| panic!("expected hits array, got {stdout}"));
let first = arr.first().unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
let first = arr
.first()
.unwrap_or_else(|| panic!("expected ≥1 hit, got empty hits: {stdout}"));
assert!(
first.get("indexed_at").is_some(),
"missing indexed_at in {first}"
);
assert!(
first.get("stale").is_some(),
"missing stale in {first}"
);
assert!(first.get("stale").is_some(), "missing stale in {first}");
assert_eq!(
first["stale"], false,
"freshly ingested doc must not be stale at default 30d threshold"

Some files were not shown because too many files have changed in this diff Show More