Compare commits
436 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 99f8cfa691 | |||
| d85d7348a5 | |||
| edac3ae737 | |||
| 6ec4e6809f | |||
| 1011c75fff | |||
| 8f7b6ee538 | |||
| 76841af7d3 | |||
| 980e20fd8d | |||
| cd79ed326c | |||
| 9dbf9d781d | |||
| 9501edd82b | |||
| 4b4a4c0b32 | |||
| f2cc325cf3 | |||
| b7e022a5e3 | |||
| bd7c4fd7ef | |||
| 4dcb4a45d6 | |||
| 6d86214060 | |||
| 6bbb8f854b | |||
| 2a4df4d48d | |||
| 16f3d6eef2 | |||
| fa89c7b561 | |||
| a4c81fed86 | |||
| 5b7c02fe13 | |||
| 88c5b83dea | |||
| 2619b7bff7 | |||
| e9b520216e | |||
| a8fd76499c | |||
| 0282a81c67 | |||
| f3587b7143 | |||
| 483b1ec06b | |||
| d279f343e7 | |||
| b56469f010 | |||
| 6ba8cb2c88 | |||
| afa8af0f88 | |||
| b9d20d23d1 | |||
| 86b4e1ebd0 | |||
| 825543549d | |||
| bcb8b93751 | |||
| 116b3e6377 | |||
| 69b53d1c97 | |||
| a271352e33 | |||
| cde4d75f6b | |||
| bddcd53688 | |||
| 2a207f9868 | |||
| cc31868d24 | |||
| 0df47febf0 | |||
| b12a616ab2 | |||
| 848b75c069 | |||
| 467a974901 | |||
| 098413922b | |||
| 695010ea7a | |||
| 8bb7c276d0 | |||
| 01a03463a6 | |||
| b6ad947378 | |||
| 1529e6d991 | |||
| 5ad1f98227 | |||
| a58cae2ff3 | |||
| 7a1dff1684 | |||
| 0988f66331 | |||
| 82e02aa4fe | |||
| db4af0cc72 | |||
| ab20202241 | |||
| a51e6395c0 | |||
| fe4c854673 | |||
| 1de3f4ffca | |||
| 7fbfec647d | |||
| ca8c83b1ba | |||
| 6c611990d8 | |||
| 166b1404e4 | |||
| 2d0168b7ab | |||
| 4afcaf96d2 | |||
| 16c4579399 | |||
| 40d7faee71 | |||
| a3bb2580bf | |||
| 2429189447 | |||
| d93b757cf1 | |||
| 571996938c | |||
| be79bdb83d | |||
| 4e76f103c1 | |||
| 4fd672193f | |||
| 1454321b12 | |||
| 649ec35108 | |||
| dece5e89fc | |||
| 3cb49f1f9b | |||
| f5ff823984 | |||
| b82eaec21a | |||
| 6daa43375b | |||
| 85efeeca3e | |||
| 2b4ba8e104 | |||
| b08941d6ab | |||
| 6bf4e82e62 | |||
| a0c7fa3d1a | |||
| ebc6bf45c4 | |||
| d8fdc815be | |||
| 9f2a56d091 | |||
| fe20be8195 | |||
| 028d9ad4ea | |||
| a3513c9110 | |||
| f2a76cfe94 | |||
| 8c56ef3010 | |||
| 5d9ea588ed | |||
| 53ec9b4dc5 | |||
| 21b52bc285 | |||
| 97fd895a10 | |||
| d13eb87401 | |||
| 26f3a7756c | |||
| 881f949fcb | |||
| c5de5f812b | |||
| f94e0c4a9b | |||
| 923b959610 | |||
| b63af20b72 | |||
| e8f44a57e3 | |||
| 4b4a8cbb3a | |||
| 4dc1c10be1 | |||
| bd86f61c9c | |||
| b134ae9dd5 | |||
| 597d8b70ad | |||
| b106120e93 | |||
| 43366b1b15 | |||
| 70507e94ca | |||
| 7bbdc89ae3 | |||
| 7c24734cc7 | |||
| 9a36a06f97 | |||
| 35c987df1c | |||
| d9ec7b8dc3 | |||
| 4e451c9f7c | |||
| 6482bf1321 | |||
| 5977c8cdf1 | |||
| 89d334a92b | |||
| 09333d0b05 | |||
| 685007789a | |||
| 445b096215 | |||
| 415227bf76 | |||
| f9dc0f749f | |||
| bef0c98867 | |||
| f8a4c79727 | |||
| f60304beb4 | |||
| 6a9551e0fa | |||
| 46e99470eb | |||
| 9b44e27dfe | |||
| 854a180365 | |||
| 5bba95fd71 | |||
| 2c7fa7142a | |||
| d9c7aabce1 | |||
| 10b0e2f4f2 | |||
| 28f513795e | |||
| 760eee89c8 | |||
| f763049923 | |||
| 8cf73d1f43 | |||
| a58ee10dfb | |||
| e674ff474b | |||
| 241ded59df | |||
| 436fd015a2 | |||
| d9acda517a | |||
| b4d9e60816 | |||
| 90726ab283 | |||
| 1d4e301e5e | |||
| 48197687b7 | |||
| c9e05941c5 | |||
| 4c5ccd5447 | |||
| b9ee09f176 | |||
| 4672cba6c6 | |||
| fd918a60ce | |||
| 9f003ef1cd | |||
| 8d81bc1071 | |||
| c2cd3a7ab7 | |||
| fb3952d54f | |||
| aeeff3635b | |||
| 9d7faab650 | |||
| bcd1e37dab | |||
| e7a4330798 | |||
| 574e1b1ca1 | |||
| c1e82cca92 | |||
| 2c05dbd0dd | |||
| 96766406aa | |||
| 710945c4b0 | |||
| d4395a306b | |||
| bd48baa19a | |||
| b02ac8200e | |||
| 336962715a | |||
| 1a224bf983 | |||
| a210bf5d52 | |||
| 429287f6cb | |||
| 08495eb425 | |||
| 98cf4e8a04 | |||
| 4030f04f37 | |||
| 7c27633df2 | |||
| 3712d005cc | |||
| 7c85de065a | |||
| a0ccc7b021 | |||
| a8fd6994d2 | |||
| 505b3889fb | |||
| 772575d8f0 | |||
| 00ffe9c792 | |||
| 681c48b2a3 | |||
| 546c1564b0 | |||
| 79ad6e376f | |||
| 6ffbe0a5a3 | |||
| ab3408cb49 | |||
| b807fd5aa5 | |||
| 93436f9eca | |||
| 11ce7847a1 | |||
| 1d88dccf8a | |||
| 1eb0bbecb3 | |||
| 44fbffff26 | |||
| 63aece3ea1 | |||
| 28a8bbeace | |||
| 52a97303dc | |||
| 71fb2cbcb3 | |||
| 85855ef596 | |||
| da25ce330b | |||
| 5bfea3c28b | |||
| b6756f8ce3 | |||
| 016f380428 | |||
| bf28a1e4d9 | |||
| 24221826ed | |||
| 8a2f7affa6 | |||
| f28a422f79 | |||
| c56242d04f | |||
| 17c48a0ee6 | |||
| 64a009314c | |||
| ddfe7ba099 | |||
| 104363a0db | |||
| 6188a50c1c | |||
| 94e6146013 | |||
| 12c7dc9efb | |||
| cd1d4fb807 | |||
| 7150c376bb | |||
| 6280abf2df | |||
| 192da45dbf | |||
| cf35f36f88 | |||
| ed34f2e03f | |||
| 624b44c46b | |||
| caf690dc72 | |||
| 1640ecf288 | |||
| 90e77631a8 | |||
| fa251db48f | |||
| 3114c31841 | |||
| 271329efbd | |||
| f2867540d2 | |||
| e118844256 | |||
| 41c5edc517 | |||
| d02149c010 | |||
| 0c69b9621b | |||
| 0d69d85757 | |||
| a67300317b | |||
| abb05ebc23 | |||
| 26fdc4f344 | |||
| 3f5e0e6e90 | |||
| 578a60e3bb | |||
| 64f518e08e | |||
| fa9f91ead4 | |||
| 9ee89c2a94 | |||
| 13a3361ba2 | |||
| 0def913abd | |||
| ff9d5f5f86 | |||
| 70a5068c0d | |||
| 93ddece111 | |||
| 67559fb3ce | |||
| d79e432916 | |||
| 0ee18149e7 | |||
| 8a68289499 | |||
| 6ac7fea7b9 | |||
| fe123c0c6d | |||
| 753b1ff5e5 | |||
| 8dcedc4b11 | |||
| 8781c6112b | |||
| 14197b5e02 | |||
| 584247f1ea | |||
| a0c0dca321 | |||
| 667495ae6a | |||
| 08d72a12e0 | |||
| 1969c8e3b5 | |||
| c6207d196e | |||
| 840c6c40a6 | |||
| b81574afa9 | |||
| 6beff35a2f | |||
| 75a4207aa1 | |||
| 86aa180ad7 | |||
| 802c573c07 | |||
| 438870ee25 | |||
| 192835e5bf | |||
| 1034de25a2 | |||
| d1560be80d | |||
| b2a2902e38 | |||
| 03cd41c48f | |||
| 926042049c | |||
| e0a29225da | |||
| b541567946 | |||
| a58d400abd | |||
| 8add684ffc | |||
| 7a90df1485 | |||
| 46f408dc0f | |||
| 49e60fb314 | |||
| 6bc7a83d3c | |||
| df3c5b8caf | |||
| 5051ea7534 | |||
| 88d7fbc182 | |||
| 0b7d8af759 | |||
| 9342b9543f | |||
| a8aa03042f | |||
| 9d4a60aac5 | |||
| 8ce7a911ee | |||
| 75c1c7b911 | |||
| b5c12ecb6f | |||
| a1192ce3b2 | |||
| 17ee400fd5 | |||
| 217dddb4ba | |||
| 308666dbd5 | |||
| 522ae7b8bc | |||
| 166e1ddfaf | |||
| 226ce8b744 | |||
| 22d4161728 | |||
| 51004ac593 | |||
| 8996e73282 | |||
| 22dba09857 | |||
| aaa90b1754 | |||
| 077f92f41e | |||
| 5ce7f60932 | |||
| 47857b2622 | |||
| 1e4cff879b | |||
| 2d7a566624 | |||
| 813bdd1a16 | |||
| ff1bedbef5 | |||
| 30e03c7a12 | |||
| 2ce6ae47c5 | |||
| ebc4ef2eea | |||
| 7bda1509b7 | |||
| 61d48d67a3 | |||
| f4c840b994 | |||
| 15244b7494 | |||
| a7f7ab9f93 | |||
| 1b19e33a4f | |||
| 9c9e391b15 | |||
| f95cd55484 | |||
| ab288135e9 | |||
| c19aa006d0 | |||
| f1a4f67e12 | |||
| 6463c52827 | |||
| 2559d0d95a | |||
| 4524830306 | |||
| 8cdd3903c7 | |||
| 8b89961ada | |||
| eec90996aa | |||
| ce1c778b4a | |||
| 453ec15df4 | |||
| 1e6de9fe9f | |||
| 9fa2a1ebac | |||
| 749c6ae240 | |||
| 5f2bd9e97e | |||
| 1ce06c1e2d | |||
| d26efe167f | |||
| d6d165df01 | |||
| 2baa846c6b | |||
| 27baec82ea | |||
| acf8cf3be2 | |||
| ea5f7b22c8 | |||
| 5497c6e7b5 | |||
| 5a90940f1c | |||
| 4389b887f0 | |||
| 360f825f3a | |||
| 641b92af7d | |||
| 08fb743598 | |||
| 0a2a7ae214 | |||
| 803d02b68b | |||
| 4e8b84c4e0 | |||
| 16dc02cfa2 | |||
| 74f1b0571b | |||
| 918ee6c0be | |||
| 68ada396f3 | |||
| 23c4ad97b9 | |||
| 1f566b8bfa | |||
| 26562588e3 | |||
| 4503b5b12f | |||
| 44813df052 | |||
| d6bb6cfd3b | |||
| d53995a6d4 | |||
| c215034653 | |||
| 31245a4328 | |||
| acb61b6830 | |||
| 20feb3133e | |||
| de63f161ac | |||
| 1815091247 | |||
| 6a0b340941 | |||
| 9664e97497 | |||
| 8bdb3e8090 | |||
| dcad9ccda2 | |||
| ed0f4769b3 | |||
| 0c61758931 | |||
| 39b766ea59 | |||
| 7f287abacb | |||
| d715631928 | |||
| 73e5b359d8 | |||
| c780aca904 | |||
| b1d5047399 | |||
| 80c2d31fb3 | |||
| 97e9f558f4 | |||
| da51e59081 | |||
| 11a0fc758f | |||
| b5d1fe8c1e | |||
| 580576c2c6 | |||
| 808b92a6c5 | |||
| c74f8d269e | |||
| df85bafa7f | |||
| a93b33ffbe | |||
| 402a4506a2 | |||
| a531dc37dc | |||
| 7a6a24ad10 | |||
| 42712b50c2 | |||
| 9f3edb7e24 | |||
| 5c265bb59f | |||
| a08ed32199 | |||
| 9362cd0aae | |||
|
|
7961f8813d | ||
|
|
7bbd2c0cbf | ||
|
|
d13f58d28a | ||
|
|
298f4adc81 | ||
|
|
4e8b70a04b | ||
|
|
682f7dd3a2 | ||
|
|
40b3ea8408 | ||
|
|
9fce24b106 | ||
|
|
8bbe25dc10 | ||
|
|
abfdcbd31d | ||
|
|
69d1593bc5 | ||
|
|
2a8451c033 | ||
|
|
ff11f81f7f | ||
|
|
bf4ebf8d2a | ||
|
|
351c7a0826 | ||
|
|
7329ba96ee | ||
|
|
fa4eeb5a87 | ||
|
|
3b1e878aed | ||
|
|
005a9011ea | ||
|
|
c6d61b0b37 | ||
|
|
49487dc46b | ||
| 2c2bf9bac5 | |||
|
|
7c6c2e8102 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
.superpowers/
|
||||
.worktrees/
|
||||
.claude/
|
||||
/target/
|
||||
.omc/
|
||||
/target
|
||||
**/*.rs.bk
|
||||
Cargo.lock.bak
|
||||
|
||||
61
CLAUDE.md
61
CLAUDE.md
@@ -27,7 +27,7 @@ cargo build --release # produces target/release/kebab
|
||||
|
||||
`-j 1` for the full workspace test isn't optional: 18 integration-test binaries each link `lance` + `datafusion` + `arrow` + `tantivy` and the parallel link step exhausts memory (linker gets SIGKILL'd, build silently fails partway). Per-crate runs are fine in parallel.
|
||||
|
||||
`target/` is 6–10 GB after a fresh build (DataFusion + Lance + fastembed + 18 × test-binary debug info). The dev/test profile is already trimmed (`debug = "line-tables-only"`, `split-debuginfo = "unpacked"` — see workspace `Cargo.toml`). Run `cargo clean` after phase merges if disk pressure shows up; backtraces still resolve to function + line.
|
||||
`target/` is 6–10 GB after a fresh build but **balloons to 90+ GB after a few task cycles** (each fb-* batch adds incremental compile artifacts on top of the existing 18 × test-binary debug info). The dev/test profile is already trimmed (`debug = "line-tables-only"`, `split-debuginfo = "unpacked"` — see workspace `Cargo.toml`). Run `cargo clean` **routinely after each merged PR**, not just "if pressure shows up" — disk space is tight and recovery via `cargo clean` is cheap (one re-link per crate on next build). Verified pattern: 92 GB → 0 GB in seconds, backtraces still resolve to function + line.
|
||||
|
||||
## The facade rule
|
||||
|
||||
@@ -81,11 +81,68 @@ Bump 자체는 단순 minor / patch 한 줄 수정 (`Cargo.toml` workspace `vers
|
||||
Release 절차:
|
||||
|
||||
1. `gitea-release v<X.Y.Z>` (gitea-ops skill) 으로 tag + push + release notes.
|
||||
2. release notes 는 사용자 도그푸딩에 영향 가는 surface 변경 위주 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등.
|
||||
2. release notes 는 사용자 도그푸딩에 영향이 가는 surface 변경을 위주로 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등 — 다룬다. 이때 추가된 기능과 변경사항은 유저가 이해할 수 있도록 친절하고 자세하게 풀어서 설명해야 하며, 단순히 commit subject 를 나열하는 형태로 끝내면 안 된다. 필요하다면 도그푸딩이나 테스트 결과도 함께 적어 둔다.
|
||||
3. 프리-1.0 (`0.x.y`) 단계: minor bump 시 wire schema additive / surface 변경 누적, patch bump 시 bug fix only.
|
||||
|
||||
**bump 시점 = release 시점 같은 commit**. 즉 commit `chore: bump version 0.x → 0.y` 직후 같은 commit 에 tag. v0.1.0 (`2319206`) 처럼 bump 없이 tag 만 찍는 패턴은 후속 release 가 대상 commit 을 헷갈리게 함 — pre-release snapshot 은 SHA reference 로 충분.
|
||||
|
||||
## Dogfood trigger
|
||||
|
||||
도그푸딩 = 새 binary 를 실제 KB / 실제 query 로 돌려보고 user-visible 동작이 spec 의 의도와 일치하는지 확인하는 종단 검증. unit / integration test 가 못 잡는 회귀 (UX 어색함, performance regression, 의외의 token 처리, embedding drift, RAG hallucination) 를 catch 함. PR 머지 전 또는 머지 직후 release notes 작성 전에 실시.
|
||||
|
||||
### 도그푸딩이 필요한 시점
|
||||
|
||||
다음 트리거 중 하나라도 hit 하면 도그푸딩 필수. **모두 release-level 또는 user-visible behavior 변경임**.
|
||||
|
||||
**Schema / migration**:
|
||||
- 신규 V00X migration (예: V007 trigram, V008 OCR mirror, V009 morphological) — `corpus_revision` cascade + auto-backfill 정책의 사용자 경험 확인.
|
||||
- frozen design contract 변경 (`docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §X 갱신) — verbatim CI diff-check 외의 user-visible side effect 확인.
|
||||
|
||||
**Wire schema / CLI surface**:
|
||||
- 신규 `--json` 필드, exit code 변경, 또는 schema major bump (v1 → v2) — agent / external integration 의 호환성 검증.
|
||||
- `kebab` 의 subcommand 또는 flag 추가/삭제/rename — agent skill / muscle memory 영향.
|
||||
|
||||
**Search / RAG behavior**:
|
||||
- FTS5 tokenizer / chunker / embedder 모델 / RAG prompt template 변경 — 같은 query 의 hit ordering, snippet, RAG citation 패턴이 자연스럽게 변화하는지.
|
||||
- score gate, RRF fusion ratio, NLI threshold 같은 ranking 파라미터 default 변경.
|
||||
|
||||
**Performance**:
|
||||
- ingest / search / ask latency 의 의도된 변화 (예: lindera tokenize, OCR 추가, multi-hop RAG) — actual wall-clock 측정 + release notes 에 명시.
|
||||
- 대용량 KB (수천 doc / 만 chunk) 의 first-boot eager backfill 시간이 사용자에게 hang 으로 인지되지 않는지.
|
||||
|
||||
**Language / locale**:
|
||||
- 한국어 / 일본어 / 중국어 lexical 동작 변경 (V007 trigram, V009 morphological, future N-gram).
|
||||
- 영어 substring 매칭 같은 ad-hoc 부산물의 회귀.
|
||||
|
||||
**File / asset surface**:
|
||||
- 신규 source 형식 (PDF OCR, audio, video) — extractor / chunker 의 실제 corpus 동작.
|
||||
- `.kebabignore` / `_external/` 같은 workspace 정책 변경.
|
||||
|
||||
**Release-level**: 위 트리거 중 하나가 hit 되어 `Cargo.toml` workspace `version` bump 가 필요하면, **bump commit 이전에 도그푸딩 evidence 가 HOTFIXES + release notes 에 명시** 되어 있어야 함. evidence 없는 release 는 사용자가 "왜 bump 했는지" 추적 불가.
|
||||
|
||||
### 도그푸딩 데이터 보관소
|
||||
|
||||
모든 도그푸딩 source 문서 + KB state + 로그는 `/build/dogfood/` 한 디렉토리에 누적 보관한다. **분류는 문서 의미 / 종류 / 형식 기준만** — kebab version, 생성 시점, scenario name 같은 prefix 금지 (`v0.20.1-dogfood/`, `dogfood-v018/` 같은 디렉토리 신설 X). 자세한 layout 은 `/build/dogfood/README.md` 참조.
|
||||
|
||||
- `/build/dogfood/corpus/` — source 문서 (read-only). format 별 분류 (`markdown/`, `code/`, `html/`, `images/`, `pdf/`, `manifest/`, `resources/`) + 각 format 내 category 별 (예: `markdown/{korean,english,bilingual,tech-docs,coding-md-corpus,topics,notes,edge-cases}`, `code/{rust,python,...}`). 새 fixture 는 적절한 category subdir 에 추가.
|
||||
- `/build/dogfood/kb/` — 도그푸딩 run 의 KB 출력 (SQLite + LanceDB + assets + models). 매 run 마다 reset 가능. 별 KB 디렉토리 신설 X.
|
||||
- `/build/dogfood/logs/` — 누적 실행 로그 (ndjson + stderr + summary).
|
||||
- `/build/dogfood/config.toml` — canonical 도그푸딩 config (없으면 `kebab init` 후 path override).
|
||||
- `/build/dogfood/_archive/` — regeneratable stale state (이전 run 의 sqlite/lancedb, XDG snapshot). 디스크 압박 시 wipe 가능.
|
||||
|
||||
`/tmp/kebab-smoke/`, `/tmp/kebab-*`, `/build/cache/dogfood*`, `/home/altair823/KnowledgeBase`, `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.local/state/kebab/` 같은 위치 신규 사용 금지 — 모두 `/build/dogfood/` 로 일관. ad-hoc fixture 가 필요하면 `corpus/<format>/<category>/` 에 추가.
|
||||
|
||||
### 도그푸딩 결과 기록
|
||||
|
||||
도그푸딩 evidence 는 두 곳에 cascade:
|
||||
|
||||
1. **`tasks/HOTFIXES.md` 의 dated entry** — 시나리오 별 hit count 표 + snippet evidence + known limitation. 미래에 spec drift 의심 시 git history 외 immediate reference 가 됨.
|
||||
2. **`docs/release-notes/v<X.Y.Z>-draft.md`** (또는 gitea release body) — 사용자 도그푸딩에 영향이 가는 surface 변경을 4 단락 (변경 사실 / trade-off / mitigation / upgrade 절차) 으로 풀어서 설명. evidence link.
|
||||
|
||||
도그푸딩 단계에서 *발견된 bug* (spec 과 실제 동작의 mismatch, performance regression, UX 어색함) 는 즉시 fix → re-dogfood. fix 가 별도 PR 로 빠지면 머지 후 HOTFIXES 에 dated entry 를 남긴다.
|
||||
|
||||
DOGFOOD scenario catalog (§1~§13) 는 `docs/DOGFOOD.md` 에 있다. 신규 release 마다 관련 § section 의 scenario list 갱신 + 신규 scenario 추가.
|
||||
|
||||
## Naming + paths
|
||||
|
||||
- Crate prefix: `kebab-` (kebab-case package, `kebab_` snake_case in Rust modules).
|
||||
|
||||
1771
Cargo.lock
generated
1771
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
133
Cargo.toml
133
Cargo.toml
@@ -2,17 +2,16 @@
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/kebab-core",
|
||||
"crates/kebab-parse-types",
|
||||
"crates/kebab-config",
|
||||
"crates/kebab-source-fs",
|
||||
"crates/kebab-parse-md",
|
||||
"crates/kebab-normalize",
|
||||
"crates/kebab-chunk",
|
||||
"crates/kebab-store-sqlite",
|
||||
"crates/kebab-store-vector",
|
||||
"crates/kebab-search",
|
||||
"crates/kebab-embed",
|
||||
"crates/kebab-embed-local",
|
||||
"crates/kebab-embed-candle",
|
||||
"crates/kebab-llm",
|
||||
"crates/kebab-llm-local",
|
||||
"crates/kebab-rag",
|
||||
@@ -23,6 +22,8 @@ members = [
|
||||
"crates/kebab-parse-pdf",
|
||||
"crates/kebab-tui",
|
||||
"crates/kebab-mcp",
|
||||
"crates/kebab-parse-code",
|
||||
"crates/kebab-nli",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -30,7 +31,95 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.6.0"
|
||||
version = "0.22.0" # v0.22.0 — candle 임베딩 provider (NUMA-안전, opt-in `provider=candle` + `num_threads`/KEBAB_EMBED_THREADS). fastembed default 불변, embedding_version 유지(재색인 0). — CLAUDE.md §Release 도그푸딩 트리거
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
# informational (function size), or carry intentional truncation we accept
|
||||
# (numeric casts in tokenizer/ONNX inputs, hash modular reduction, etc).
|
||||
[workspace.lints.clippy]
|
||||
pedantic = { level = "warn", priority = -1 }
|
||||
# Intentional u32 ↔ i64 casts in kebab-nli (ONNX i64 inputs from tokenizer u32 ids).
|
||||
# u64 ↔ usize across kebab-store-sqlite row counts. Wide truncation is auditable
|
||||
# at use site, not lint-wide.
|
||||
cast_possible_truncation = "allow"
|
||||
cast_possible_wrap = "allow"
|
||||
cast_sign_loss = "allow"
|
||||
cast_precision_loss = "allow"
|
||||
# Doc markdown style is cosmetic; we run rustdoc on demand.
|
||||
doc_markdown = "allow"
|
||||
missing_errors_doc = "allow"
|
||||
missing_panics_doc = "allow"
|
||||
# Informational only — splitting a long pipeline function isn't always cleaner.
|
||||
too_many_lines = "allow"
|
||||
# `Foo::default()` is concise and idiomatic here; `<Foo as Default>::default()`
|
||||
# adds noise without surfacing intent.
|
||||
default_trait_access = "allow"
|
||||
# Module name prefix on public items keeps the wire/log surface readable
|
||||
# (`refusal_reason::no_chunks` etc).
|
||||
module_name_repetitions = "allow"
|
||||
# We use `#[must_use]` deliberately on public results, not blanket.
|
||||
must_use_candidate = "allow"
|
||||
# `String` arg sometimes signals "I'll consume this" — let signature decide.
|
||||
needless_pass_by_value = "allow"
|
||||
# Idiomatic single-line bindings stay; let-else expansion isn't always clearer.
|
||||
manual_let_else = "allow"
|
||||
# `use` after `let` is a common kebab pattern (scoped imports next to use site).
|
||||
items_after_statements = "allow"
|
||||
# Naming pairs like `chunk_id` / `chunks_id` are intentional domain terms.
|
||||
similar_names = "allow"
|
||||
# `iter.map(format!).collect::<String>()` is idiomatic when the per-element
|
||||
# string is genuinely independent — `fold` only wins on accumulation patterns.
|
||||
format_collect = "allow"
|
||||
# Exhaustive `match` with explicit variant arms (vs `_`) catches future
|
||||
# variant additions at compile time (kebab core's `RefusalReason` pattern).
|
||||
match_wildcard_for_single_variants = "allow"
|
||||
# Copy types under `&self` keep call-site discipline; auto-deref noise > tiny perf gain.
|
||||
trivially_copy_pass_by_ref = "allow"
|
||||
# `unnecessary_wraps` flags helpers that could drop `Result`, but keeping the
|
||||
# Result allows future error variants without churning callers.
|
||||
unnecessary_wraps = "allow"
|
||||
# NLI score / RRF fusion / similarity threshold comparisons are intentional —
|
||||
# floats live in the `[0, 1]` band and are compared with explicit thresholds.
|
||||
float_cmp = "allow"
|
||||
# File-extension dispatch is keyed on ASCII conventions; case sensitivity
|
||||
# is part of the spec for `.md`, `.pdf`, etc.
|
||||
case_sensitive_file_extension_comparisons = "allow"
|
||||
# Config / opts structs intentionally bundle boolean flags (ingest options,
|
||||
# search modes, etc) — splitting them into enums would obscure the wire shape.
|
||||
struct_excessive_bools = "allow"
|
||||
# `bytecount` crate would be a new dep just for one-off ASCII counts.
|
||||
naive_bytecount = "allow"
|
||||
# `#[ignore]` annotations on tests document via the test name + nearby comment.
|
||||
ignore_without_reason = "allow"
|
||||
# `format!` push patterns are a hot path for kebab-tui's progressive rendering;
|
||||
# `write!` rewrite needs a verified-equal benchmark before swapping.
|
||||
format_push_string = "allow"
|
||||
# Builder-style `with_*` methods return `Self`; the existing `#[must_use]`
|
||||
# discipline lives on aggregate constructors, not every chainable setter.
|
||||
return_self_not_must_use = "allow"
|
||||
# Match arms grouped by side-effect over body equality (e.g. snake_case wire
|
||||
# label tables) — fanning them out keeps adding a new variant trivial.
|
||||
match_same_arms = "allow"
|
||||
# Remaining style-only warnings: trailing `continue` is sometimes clearer than
|
||||
# rewriting, `_x` underscored bindings document intent at the use site, and
|
||||
# `!(a == b)` reads better than `a != b` when paired with a complementary check.
|
||||
needless_continue = "allow"
|
||||
used_underscore_binding = "allow"
|
||||
nonminimal_bool = "allow"
|
||||
# Other one-off cosmetic items: large literal formatting, doc link quoting,
|
||||
# `Clone::clone_from` swap, `str::replace` chaining, `Iterator::any` ergonomics.
|
||||
unreadable_literal = "allow"
|
||||
many_single_char_names = "allow"
|
||||
doc_link_with_quotes = "allow"
|
||||
assigning_clones = "allow"
|
||||
collapsible_str_replace = "allow"
|
||||
trivial_regex = "allow"
|
||||
elidable_lifetime_names = "allow"
|
||||
range_plus_one = "allow"
|
||||
explicit_iter_loop = "allow"
|
||||
implicit_hasher = "allow"
|
||||
ref_option = "allow"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
@@ -53,6 +142,7 @@ proptest = "1"
|
||||
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
|
||||
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
|
||||
lru = "0.12"
|
||||
lopdf = "0.32"
|
||||
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
|
||||
# in its default set (which also pulls `hf-hub` for first-run model
|
||||
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
|
||||
@@ -81,6 +171,43 @@ rmcp = { version = "1.6", default-features = false, features = ["server"
|
||||
# sync via reqwest::blocking — wiremock is dev-only there).
|
||||
wiremock = "0.6"
|
||||
base64 = "0.22"
|
||||
# Pure-Rust git library for repo metadata detection (kebab-parse-code).
|
||||
# No `git` binary required. Default features are off; `revision` is enabled for the
|
||||
# object-reading capabilities needed for HEAD name + commit SHA queries.
|
||||
gix = { version = "0.70", default-features = false, features = ["revision"] }
|
||||
# Rust source parsing for code ingest (kebab-parse-code, p10-1A-2). The
|
||||
# chunker stays tree-sitter-free — AST work is parser-side per design §6.3.
|
||||
tree-sitter = "0.26"
|
||||
tree-sitter-rust = "0.24"
|
||||
# Python / TS / JS grammars for code ingest (kebab-parse-code, p10-1B).
|
||||
tree-sitter-python = "0.25.0"
|
||||
tree-sitter-typescript = "0.23.2"
|
||||
tree-sitter-javascript = "0.25.0"
|
||||
# Go grammar for code ingest (kebab-parse-code, p10-1C-Go).
|
||||
tree-sitter-go = "0.25.0"
|
||||
# JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK).
|
||||
tree-sitter-java = "0.23.5"
|
||||
tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1 (ts 0.26 compat)
|
||||
# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
|
||||
tree-sitter-c = "0.24.2"
|
||||
tree-sitter-cpp = "0.23.4"
|
||||
# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
|
||||
# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
|
||||
# stays single-versioned across the workspace. ort `default-features=false`
|
||||
# drops the bundled binary downloader (fastembed already provides one);
|
||||
# tokenizers `default-features=false, onig` swaps the default `esaxx` regex
|
||||
# backend for `onig` so the build doesn't need libstdc++ headers (verified
|
||||
# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
|
||||
# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
|
||||
# pure-Rust TLS stack.
|
||||
ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
|
||||
tokenizers = { version = "0.21", default-features = false, features = ["onig"] }
|
||||
hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
|
||||
ndarray = "0.16"
|
||||
# Korean morphological tokenizer (FTS v0.20.x, §6.1). lindera-ko-dic bundles
|
||||
# the KO-DIC dictionary as an embedded blob via the embed-ko-dic feature.
|
||||
lindera = "3"
|
||||
lindera-ko-dic = "3"
|
||||
|
||||
# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
|
||||
# behavior are unchanged — only DWARF debug info is reduced (line
|
||||
|
||||
101
HANDOFF.md
101
HANDOFF.md
@@ -4,7 +4,7 @@
|
||||
|
||||
## 한 줄 요약
|
||||
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF 모두 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공 — 사용자가 `?` 로 ask, `/` 로 search, Library Enter / Search `i` 로 inspect, Search `g` 로 editor jump. 다음 후보 = P9-5 (desktop tauri) 또는 보류 중인 P8 (audio) 의 시스템 dep brainstorm.
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체 머지 완료 (현재 **v0.18.0**). `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin / C / C++) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. **v0.17.0 cut (2026-05-24)**: 한국어 trigram FTS5 tokenizer (PR #159) + C typedef alias unit (PR #160) + `code_lang_chunk_breakdown` additive (PR #161). **v0.17.1 cut (2026-05-25)**: 확장 도그푸딩 후 `[models.llm] request_timeout_secs` config 노브 (PR #162) + sudo 없이 ollama 설치 + `kebab ask --stream` UX 권장 docs (PR #163). **v0.17.2 cut (2026-05-25)**: v0.17.1 post-dogfood polish — `[image.ocr] request_timeout_secs` 별 노브 (PR #164, v0.17.1 미진행 closure) + `heading_path` FTS5 column filter 로 text-only 매칭 + raw-mode escape hatch (PR #165, 2026-05-24 v0.17.0 trigram entry 의 JSON 노이즈 closure). **v0.18.0 cut (2026-05-26)**: fb-41 multi-hop RAG + NLI verification ship (PR #176-180) — `kebab ask --multi-hop` 의 decompose → decide → synthesize loop + mDeBERTa-v3 XNLI ONNX post-synthesize entailment 검사. dogfood S7 caffeine hallucination 의 silent LLM-self-judge ceiling 해결 (nli_score 0.0035 graceful refuse). 추가 `chore: workspace-wide cleanup + post-PR9 refactor` (PR #181) — clippy::pedantic baseline + H1 config wiring + 9 new tests. 자세한 영향은 [v0.17.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.0) + [v0.17.1 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.1) + [v0.17.2 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.2) + [v0.18.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.18.0). 구조적으로 남은 component 는 P9-5 (desktop tauri) 하나뿐, P8 (audio) 는 사용자 보류.
|
||||
|
||||
## Phase 로드맵
|
||||
|
||||
@@ -17,20 +17,37 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료.
|
||||
| **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
|
||||
| **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
|
||||
| **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
|
||||
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
|
||||
| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
|
||||
| **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
|
||||
| **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
|
||||
| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** |
|
||||
|
||||
P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
|
||||
|
||||
## Component 카운트
|
||||
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. v0.18.0 cut 시점에 fb-41 multi-hop RAG + NLI verification (PR-9 5 sub-PRs) 가 P9 추가 component 로 ship — `kebab-nli` 신규 crate (mDeBERTa-v3 XNLI ONNX verifier) + `kebab-rag::ask_multi_hop` (decompose/decide/synthesize loop + step 8.5 NLI hook). per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
|
||||
## 머지 후 발견된 버그 / 결정 (요약)
|
||||
|
||||
- **candle 임베딩 백엔드 다변화** (2026-06-01, Track 1, v0.22.0): `provider = "candle"` opt-in 추가 — 같은 `multilingual-e5-large` 모델을 순수 Rust(candle)로 돌려 듀얼소켓 NUMA 서버의 onnxruntime 48-스레드 double-free 를 회피. `[models.embedding].num_threads`(+env `KEBAB_EMBED_THREADS`)로 CPU 스레드 캡. fastembed default 동작·벡터 불변, `embedding_version` 유지(재색인 0). Phase 0 스파이크 패리티 cosine 1.000000. 상세 HOTFIXES 동일 일자.
|
||||
- **config 마이그레이션** (2026-05-31, PR #198): `kebab config migrate` 추가 — 기존 config.toml 에 빠진 섹션을 주석과 함께 채우고 deprecated 정리(멱등·`.bak`·dry-run, 값/주석 보존). `schema_version` 1→2, `init` 도 섹션 주석 포함, doctor 에 `config_migration` 체크. 상세 HOTFIXES 동일 일자.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 "누군가가 인수받을 때 알아두면 시간을 많이 절약하는" 항목만:
|
||||
|
||||
- **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`.
|
||||
- **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. (1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29).
|
||||
- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
|
||||
- **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
|
||||
- **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 의 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
|
||||
- **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).
|
||||
- **2026-05-25 v0.17.1 post-dogfood (PR #162 + #163)** — 확장 도그푸딩 (16 GB CPU only, gemma4:e4b 시도) 에서 발견된 두 follow-up 한 묶음. (1) `crates/kebab-llm-local/src/ollama.rs::REQUEST_TIMEOUT` hard 300s → `[models.llm] request_timeout_secs` config + env override (additive, default 300, `=0` 은 disable 이 아닌 "즉시 timeout" 이라 doc 에 명시). (2) README + SMOKE 에 sudo / systemd 없이 ollama 설치 + ≤4B Q4 권장 모델 + `kebab ask --stream` UX 권장 docs. additive only — 옛 config / wire 호환. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25).
|
||||
- **2026-05-24 v0.17.0 PR-C `code_lang_chunk_breakdown` additive (closure of 2026-05-22 LOW)** — `schema.v1.stats` 에 chunk 수 집계 신규 키. 기존 `code_lang_breakdown` (doc count) 와 sister. 또 기존 두 필드 JSON schema description 의 "chunk count" 오기재 → "doc count" 로 정정. wire additive — schema_version bump 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-C).
|
||||
- **2026-05-24 v0.17.0 PR-B C typedef alias unit (closure of 2026-05-21)** — `kebab-parse-code::c::extract_blocks` 의 `type_definition` 분기로 inner anonymous struct/enum/union → declarator 의 typedef alias 이름으로 synthetic unit 방출. `PARSER_VERSION code-c-v1` → `code-c-v2` bump + 같은-asset/다른-doc_id 케이스용 `purge_workspace_path_for_parser_bump` cascade (`stale_chunk_ids_for_workspace_path_except_doc_id` + `purge_document_at_workspace_path_except_doc_id` helper 신규). 사용자 작업 불필요 (다음 ingest 가 자동 재처리). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-B).
|
||||
- **2026-05-24 v0.17.0 PR-A 한국어 trigram tokenizer 채택 (closure of 2026-05-22 한국어 lexical)** — `chunks_fts` 가 FTS5 `unicode61` → `trigram` 으로 V007 migration (자동 backfill, re-ingest 불필요). `lexical.rs::build_match_string` trigram-aware 재설계 — multi-token 한국어 query (`해시 충돌`) 가 whole-phrase 후보로 hit, 한영 혼합 (`Rust 충돌은`) 도 OR-combined. 2자 이하 query 는 0-hit + CLI/TUI/wire `hint` 안내. 영어 lexical 도 substring 매칭으로 바뀜 (recall ↑ / 단어 경계 ↓). `kebab.sqlite` 크기 ~2-5배 증가 (trigram index). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24).
|
||||
- **2026-05-22 P10 종합 도그푸딩 round 2 (한국어 lexical 검색 한계)** — `kebab search --mode lexical` 의 한국어 query 가 FTS5 `unicode61` 토크나이저에서 거의 0 hit (어절 단위 토큰화 → 부분 매칭 불가). 기본 hybrid 모드는 `multilingual-e5-small` vector 가 carry 해 한국어 검색 정상. **closure**: 위 2026-05-24 v0.17.0 entry.
|
||||
- **2026-05-20 P10-1B (Rust 1A symbol path 비일관 + expression-level 함수 미방출)** — (a) Rust `code-rust-ast-v1` 은 file-scope nesting 만 (workspace path prefix 없음), 1B 의 Python/TypeScript/JavaScript 는 workspace 경로 → module path prefix 사용 (비일관 수용, retrofit = chunker_version bump + reindex 필요, 사용자 명시 요청까지 보류); (b) TS/JS 의 `const foo = () => {...}` 같은 expression-level 함수는 `<top-level>` glue 로 처리됨 (declaration-level 단위만 1B 1차 범위). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20) 두 항목.
|
||||
- **2026-05-19 P10-1A-2 (code_rust_ast_v1.rs + SourceType)** — `AST_CHUNK_MAX_LINES` 상수가 `IngestCodeCfg.ast_chunk_max_lines` 를 읽지 않고 모듈 상수 200 고정 (Chunker trait 이 per-medium config 미노출); `SourceType::Code` variant 부재로 code 파일이 `SourceType::Note` 로 분류됨 — 두 항목 모두 `tasks/HOTFIXES.md` (2026-05-19) 에 기록.
|
||||
- **2026-05-07 fb-26 (progress.rs)** — `Aborted` unconditional writeln (TTY duplicate) + `Completed` TTY no summary fixed; `KEBAB_PROGRESS=plain` env + quiet suppression added
|
||||
- **2026-05-07 fb-28 (main.rs)** — `--readonly` (KEBAB_READONLY) blocks Ingest/IngestFile/IngestStdin/Reset; `--quiet` suppresses progress stderr; error.v1 code: "readonly_mode"
|
||||
|
||||
@@ -78,13 +95,66 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
## 다음 task 후보
|
||||
|
||||
- **P9-2 TUI search** — `App.search` slot 채움. Library 의 `/` 가 enable 됨.
|
||||
- **P9-3 TUI ask** — `App.ask` slot 채움. `?` enable.
|
||||
- **P9-4 TUI inspect** — `App.inspect` slot 채움. `Enter` enable.
|
||||
- **P9-5 desktop tauri** — 별도 분기. PDF citation rendering UI 가치 큼.
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 후순위.
|
||||
구조적으로 미완인 component 는 P9-5 하나뿐. 나머지는 도그푸딩 follow-up (아래 "P10 dogfooding 백로그") 또는 사용자 결정 대기.
|
||||
|
||||
P9-2/3/4 는 P9-1 의 parallel-safety contract (sub-state slot 패턴) 덕에 병렬 진행 가능 — 같은 `App` 손대지 않음.
|
||||
- **P9-5 desktop tauri** — 마지막 남은 P9 component. `kebab-desktop` crate + Tauri 앱, 별도 분기. PDF citation rendering UI 가치 큼. 사용자 우선순위 (P9 우선 · 책/PDF 위주) 와 부합.
|
||||
- **P10 도그푸딩 round 2 follow-up** — ✅ v0.17.0 cut (2026-05-24) 으로 세 항목 모두 closure (한국어 trigram PR-A + C typedef alias PR-B + code_lang_chunk_breakdown additive PR-C). 상세 cross-link: 아래 "P10 dogfooding 백로그" 절 + `tasks/HOTFIXES.md` (2026-05-24 PR-A/B/C).
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 보류.
|
||||
- **fb-41 multi-hop reasoning** — ⏳ 미구현, XL, eval 인프라 선행 + brainstorm 필요.
|
||||
- **Rust symbol path retrofit** — Rust `code-rust-ast-v1` symbol 이 file-scope-only (1B+ 는 module prefix). `code-rust-ast-v2` bump + Rust corpus re-ingest 비용 → 사용자 명시 요청까지 보류. HOTFIXES `2026-05-20`.
|
||||
|
||||
### v0.20.0 sub-item 1 (PDF scanned OCR) 머지 후 priorities (2026-05-28, 사용자 결정)
|
||||
|
||||
PR #189 (2026-05-28 머지, commit `09333d0`) 으로 PDF scanned OCR (qwen2.5vl:3b vision LLM) + 4 round bugfix (#2/#3/#4/#6/#7/#9/#10/#11/#13/#14) + ingest log feature 가 main 으로 진입. 다음 작업 순서 = **C → B → A → G**.
|
||||
|
||||
- **C — 한국어 morphological tokenizer (Bug #8 follow-up)** ✅ **v0.20.1 머지 완료**.
|
||||
- V007 trigram 의 ≥3 char query 제약 (HOTFIXES `2026-05-22`) — '한국' 같은 2-char 한국어 query 0 hit → V009 migration + lindera-ko-dic tokenizer + tokenized_korean_text column + first-boot eager backfill 으로 해소. branch `feat/korean-morphological-tokenizer` (8 commit + 5 follow-up).
|
||||
- scope: search index 재빌드 cascade (corpus_revision bump) + V007 trigram 보존 (backward-compat).
|
||||
- 사용자 surface: `kebab search` 의 한국어 2자 query ('한국', '서울') 매칭. README + SKILL + release notes 반영.
|
||||
|
||||
- **B — OCR dense page coverage** ⏳ C 다음.
|
||||
- metro-korea.pdf page 8/13 timeout (180s, dense newspaper article). vision LLM 의 output token 과대 → 정상 timeout.
|
||||
- 가능한 path: (a) per-page `max_pixels` 동적 조정 (high-resolution page 만 축소), (b) column-level sub-region OCR (newspaper layout 분할 후 OCR call 분리), (c) model upgrade (qwen2.5vl:7b — Ollama 모델 변경 + max_pixels trade-off), (d) OCR timeout 점진 축소 (180s → 120s → 90s) — round 마다 p90 측정 후.
|
||||
- mojibake.pdf `pdf_ocr_pages: 0` (round 1 부터 동일) — text-detect path fallback 강화 검토.
|
||||
- 별 sub-item.
|
||||
|
||||
- **A — v0.20 의 deferred sub-items (frozen design contract)** ⏳ B 다음.
|
||||
- **sub-item 2** — Multi-region image dispatch (`OcrText.regions` bbox 분리) — image OCR + PDF column-aware OCR.
|
||||
- **sub-item 3** — PDF normalize integration (`ParsedPdfPage` production caller + `build_canonical_document_from_pdf_pages` + cross-page reference graph).
|
||||
- **TODO #4** — Per-page image / table extraction (PDF figure / table extract).
|
||||
- **TODO #5** — Enricher trait 도입 — OCR + caption 의 `Extractor` trait 통합 (post-extract enrichment 의 generalization).
|
||||
- 각 sub-item 별 spec/plan/executor cycle.
|
||||
|
||||
- **G — v0.20.1 patch release + release notes** ⏳ A 머지 후 (또는 C/B 시점에 따라 조기 cut).
|
||||
- CLAUDE.md release 룰 — sub-item 1 base + bugfix1-4 + log feature + logging r2 누적 → minor surface 변경 다수 + wire schema additive minor + config 신규 → **v0.20.1 patch bump + release notes**.
|
||||
- 핵심 surface (사용자 도그푸딩 가이드 형식):
|
||||
- **한국어 2자 query 지원** (`kebab search` 에서 '한국', '서울' 같은 2자 단어 매칭 — V009 morphological tokenizer).
|
||||
- OCR timeout default 180s (HOTFIXES 2026-05-28).
|
||||
- `[logging]` config section (default enabled) + `{state_dir}/logs/ingest-{run_id}.ndjson` 자동 생성.
|
||||
- `[logging] keep_recent_runs` (100) + `retention_days` (30) — OR-on-stale cleanup.
|
||||
- `ingest_progress.v1.pdf_ocr_finished` 의 4 추가 field (image_byte_size, image_width, image_height, failure_reason) — image_w/h 가 round 2 (PR #190) 에서 실제 capture.
|
||||
- `schema.v1.models` 의 `active_parsers` + `active_chunkers` (additive minor).
|
||||
- V008 migration — `pdf_ocr_events` table (per-OCR-call historical record).
|
||||
- 새 wire schemas — `ocr_stats.v1` + `ocr_failures.v1` (CLI inspect 의 emit).
|
||||
- CLI `kebab inspect ocr-stats` + `kebab inspect ocr-failures` — sweet-spot 점진 분석.
|
||||
- CLI `--media code` first-class, empty query → `invalid_input`, `--config` missing → `config_not_found` + exit 2.
|
||||
- capabilities.streaming_ask + single_file_ingest 가 true (이전에 false 로 잘못 보고되던 값 정정).
|
||||
- bump 작업: workspace `Cargo.toml` version → 0.20.1, tag, gitea-release.
|
||||
|
||||
### v0.20 후속 bug catalog (non-blocking known)
|
||||
|
||||
본 PR #189 dogfood 에서 **falsified** 또는 **design constraint** 로 분류 — fix 안 함:
|
||||
- Bug #8 (V007 trigram 2-char query 한계) → 위 C 항목.
|
||||
- Bug #12 (Code block wire `.code` field, `.text` 가 아닌 jq fallback artifact) — falsified.
|
||||
- ask 한국어 query phrasing-sensitive refusal — RAG corner case / NLI gate behavior. 별도 brainstorm.
|
||||
|
||||
### Logging feature enhancements — ✅ closed (PR #190, 2026-05-28 merged commit `7bbdc89a`)
|
||||
|
||||
logging round 2 (PR #190) 으로 4 enhancement 모두 closed:
|
||||
- ✅ `image_width` + `image_height` capture (raster JPEG decode).
|
||||
- ✅ SQLite mirror (V008 `pdf_ocr_events` table + dual-write).
|
||||
- ✅ CLI query (`kebab inspect ocr-stats` + `ocr-failures` — `ocr_stats.v1` + `ocr_failures.v1` wire schemas).
|
||||
- ✅ log retention (`keep_recent_runs` + `retention_days` — file + SQLite cleanup).
|
||||
|
||||
### P9 dogfooding 백로그 (fb-26 ~ fb-42) — release 분할
|
||||
|
||||
@@ -93,11 +163,20 @@ P9-2/3/4 는 P9-1 의 parallel-safety contract (sub-state slot 패턴) 덕에
|
||||
- **0.3.0 — agent foundation** ✅ cut 2026-05-07: fb-26 (log), fb-27 (introspection/error wire), fb-28 (readonly/quiet). ~~fb-29 (daemon)~~ → 🚫 **deferred** — fb-30 stdio MCP 가 동일 가치를 daemon 복잡도 없이 제공.
|
||||
- **0.4.0 — agent integration (MCP)** ✅ cut: fb-30 (MCP stdio), fb-31 (single-file/stdin ingest).
|
||||
- **0.5.0 — agent surface refinement (additive)** ✅ cut 2026-05-10: fb-32 (stale doc indicator), fb-33 (streaming ask), fb-34 (output budget controls), fb-35 (verbatim fetch), fb-36 (search filter args), fb-37 (trace + stats). 모두 wire schema additive minor.
|
||||
- **0.6.0 — RAG quality** 🟡 진행: fb-38 (score semantics) ✅ 머지 (2026-05-10), fb-40 (fact-grounded answer / rag-v2 prompt) ✅ 머지 (2026-05-10), fb-39 (retrieval precision tuning, embedding_version cascade) — 미진행 (eval golden set 선행 필요).
|
||||
- **0.7.0 또는 P+**: fb-41 (multi-hop reasoning, XL), fb-42 (bulk multi-query / rerank, Nice).
|
||||
- **0.6.0 — RAG quality** ✅ 대부분 머지 (2026-05-10): fb-38 (score semantics) ✅, fb-39 (eval foundation — `precision_at_k_chunk` metric) ✅, fb-39b (embedding upgrade — multilingual-e5-large default) ✅, fb-40 (fact-grounded answer / rag-v2 prompt) ✅. 잔여 = fb-39 의 retrieval precision lever 실제 적용 (eval golden set 확장 선행 필요).
|
||||
- **0.7.0 또는 P+**: fb-41 (multi-hop reasoning, XL) — ⏳ 미구현 · brainstorm 필요; fb-42 (bulk multi-query) ✅ 머지 (2026-05-10, bulk only — rerank hint 은 deferred).
|
||||
|
||||
각 fb spec frontmatter 의 `target_version` 필드가 source of truth. INDEX.md 의 release subheader 도 동일 grouping.
|
||||
|
||||
### P10 dogfooding 백로그 (2026-05-22 round 2)
|
||||
|
||||
P10 종합 도그푸딩 round 2 (`/build/cache/dogfood-p10b/`, OSS 8 repo + 한국어 위키 문서 10편) 에서 발견된 follow-up 후보. 자세한 내용 + 우선순위 근거는 `tasks/HOTFIXES.md` (2026-05-22).
|
||||
|
||||
- **한국어 lexical tokenizer** — ✅ v0.17.0 (2026-05-24) PR-A 머지 (#159). V007 trigram migration 자동 backfill + `build_match_string` 재설계 + CLI/TUI/wire hint. HOTFIXES `2026-05-24 PR-A` 참조.
|
||||
- **code_lang_chunk_breakdown chunk 단위 집계 (LOW)** — ✅ v0.17.0 (2026-05-24) PR-C 머지 (#161). `schema.v1.stats` additive 필드. HOTFIXES `2026-05-24 PR-C` 참조.
|
||||
- **C typedef-wrapped struct (LOW)** — ✅ v0.17.0 (2026-05-24) PR-B 머지 (#160). `type_definition` 분기 + `PARSER_VERSION code-c-v2` bump + orphan purge cascade. HOTFIXES `2026-05-24 PR-B` 참조.
|
||||
- **ranking glue chunk 편향 (deferred)** — 자동 heuristic 은 user intent misalignment 위험. 사용자 명시 요청 전까지 surface 변경 0 유지. 1주+ 실사용 후 재 brainstorm.
|
||||
|
||||
## 검증된 운영 동작 (release binary, fastembed enabled)
|
||||
|
||||
P7-3 머지 직후 25 시나리오 smoke 통과 — markdown + image + PDF 5 자산 워크스페이스에서 doctor / ingest / list / inspect / search (lex/vec/hybrid) / re-ingest / byte-edit re-ingest / corrupt PDF / RAG ask + page citation 모두. 자세한 시나리오 표는 conversation 기록 참조; 워크스페이스에 직접 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md).
|
||||
|
||||
257
README.md
257
README.md
@@ -1,121 +1,141 @@
|
||||
# kebab — Local-first Knowledge Base
|
||||
# kebab — Local-first Knowledge Base + RAG
|
||||
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown / PDF / 이미지를 한 곳에 색인하고, 의미 검색 + page-단위 citation 포함 LLM 답변을 단일 binary 로 제공한다. 모든 추론은 로컬 (Ollama / fastembed) 에서 돌아간다. 대상 하드웨어: M4 48GB MacBook 1대, 사용자 1명.
|
||||
|
||||
## 사전 요구
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 + resolver 3 사용). [rustup](https://rustup.rs) 권장.
|
||||
- **Ollama** — `kebab ask` 와 이미지 OCR/caption 가 사용. `https://ollama.com/download` 에서 설치 후 `ollama serve` 실행. 기본 LLM 은 gemma4 계열 (`ollama pull gemma4:e4b`) — OCR / caption 도 같은 family 라 모델 하나만 pull 하면 됨. 더 큰 variant 원하면 `gemma4:26b` 등으로 config override. config 의 `[models.llm].endpoint` 에 host:port 명시.
|
||||
- **빌드 디스크** — 첫 빌드 시 `target/` 가 6–10 GB (Lance + DataFusion + fastembed). 여유 확인.
|
||||
- **fastembed 모델** — 첫 `kebab ingest` 시 `multilingual-e5-large` (~1.3 GB, fb-39b) 자동 다운로드. `config.toml` 에서 `model = "multilingual-e5-small"` 로 명시하면 이전 모델 사용.
|
||||
|
||||
## 설치
|
||||
|
||||
표준 경로는 `cargo install` — `~/.cargo/bin/kebab` 가 PATH 에 있는지만 확인하면 끝.
|
||||
|
||||
```bash
|
||||
# 1) repo clone
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
|
||||
# 2) binary 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 3) PATH 확인 (아직 추가 안 했으면 ~/.bashrc / ~/.zshrc 에 추가)
|
||||
which kebab # → /Users/<you>/.cargo/bin/kebab 같은 경로
|
||||
kebab --version # → kebab 0.1.0
|
||||
```
|
||||
|
||||
git URL 직접 install 도 가능 (clone 없이):
|
||||
|
||||
```bash
|
||||
cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked
|
||||
```
|
||||
|
||||
업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`.
|
||||
|
||||
제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed) 등.
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown · PDF · 이미지 · 소스코드를 한 곳에 색인하고, 하이브리드 의미 검색과 근거 인용을 포함한 LLM 답변을 **단일 binary** 로 제공한다. 모든 추론은 로컬 (Ollama + fastembed) 에서 돌아간다.
|
||||
|
||||
## Quick start
|
||||
|
||||
사전 요구는 두 가지뿐이다.
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 사용). [rustup](https://rustup.rs).
|
||||
- **Ollama** — `kebab ask` 와 이미지/PDF OCR 가 사용. [공식 설치 안내](https://ollama.com/download) 참고 후 `ollama serve` 실행. 기본 LLM family 는 gemma4 (`ollama pull gemma4:e4b`) — OCR/caption 도 같은 family 라 모델 하나면 된다. CPU-only 환경이면 소형 모델 (예: `gemma3:4b`) 을 권장.
|
||||
|
||||
```bash
|
||||
# 첫 실행 — XDG 경로에 데이터 디렉토리 + config.toml 생성
|
||||
# 1) 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 2) 데이터 디렉토리 + config.toml 생성 (XDG 경로)
|
||||
kebab init
|
||||
|
||||
# config 손보고 — workspace.root, 모델 endpoint 등 설정 (지원 형식은 md / png / jpg / pdf 로 고정)
|
||||
# 3) config 최소 손보기 — workspace.root (색인할 폴더) 와 LLM endpoint
|
||||
${EDITOR:-vi} ~/.config/kebab/config.toml
|
||||
|
||||
# 색인 (Markdown / 이미지 / PDF 모두 한 번에)
|
||||
# 4) 색인 (Markdown · PDF · 이미지 · 소스코드 한 번에)
|
||||
kebab ingest
|
||||
|
||||
# 검색 (citation 의 source_span 이 매체별로 line / region / page)
|
||||
kebab search "Markdown chunking 규칙" --mode hybrid
|
||||
# 5) 검색 (hybrid = lexical + vector RRF, citation 포함)
|
||||
kebab search "Markdown chunking 규칙"
|
||||
|
||||
# 질문 (Ollama 필요, PDF 인용 시 page 번호 surface)
|
||||
# 6) 질문 (RAG 답변 + 근거 인용, Ollama 필요)
|
||||
kebab ask "내 KB 설계에서 저장소 전략은?"
|
||||
|
||||
# Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중)
|
||||
kebab tui
|
||||
|
||||
# 헬스 체크 (config 경로 / 데이터 디렉토리 쓰기 가능 여부)
|
||||
kebab doctor
|
||||
```
|
||||
|
||||
격리된 임시 워크스페이스로 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md) — `--config <path>` 로 분리. 이미지 / PDF fixture 가 필요하면 두 example 바이너리 (`cargo run --release --example gen_smoke_pdf -p kebab-parse-pdf` / `gen_smoke_png -p kebab-parse-image`) 로 시스템 dep 없이 in-tree 생성 가능.
|
||||
clone 없이 git URL 로 바로 설치할 수도 있다: `cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked`. 업데이트는 동일 명령에 `--force`. 제거는 `cargo uninstall kebab-cli` (데이터는 보존 — 데이터까지 지우려면 `kebab reset --all --yes`).
|
||||
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>` 또는 `cargo build --release && ./target/release/kebab <subcommand>`.
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>`. 격리된 임시 워크스페이스로 검증하는 절차는 [docs/SMOKE.md](docs/SMOKE.md) (`--config <path>` 로 분리).
|
||||
|
||||
## 핵심 기능
|
||||
|
||||
### 하이브리드 검색 + citation
|
||||
|
||||
lexical (FTS5 BM25) 과 vector (cosine) 두 채널을 **RRF fusion** 으로 합쳐 검색한다. 모든 hit 은 출처 위치를 매체별로 정확히 담는다 — Markdown/코드는 line, 이미지는 region, PDF 는 page. `--tag` · `--media` · `--lang` · `--path-glob` 등 다양한 필터와 `--max-tokens` · `--cursor` 같은 agent budget flag 를 지원한다.
|
||||
|
||||
### doc-side expansion 별칭 (opt-in)
|
||||
|
||||
색인 시 각 청크에 대해 "같은 의미의 다른 표현"(동의어 · 약어 · 한↔영 번역 · 풀어쓴 설명) 별칭을 LLM 으로 생성해 별도 dense 벡터로 색인한다. 설명형 query 나 cross-lingual query 의 검색 일관성을 높인다 (나무위키 ~1000 문서 CS corpus 측정: 변형 일관성 14/18 → 16/18, 대조군 false-positive 미유발). 청크당 LLM 호출이 들어 비용이 크므로 **default off** — `[ingest.expansion] enabled = true` 로 opt-in.
|
||||
|
||||
### 파생물 캐시 (자동)
|
||||
|
||||
embedding 벡터와 별칭 LLM 결과를 청크 **내용 해시** 로 캐싱한다 (`derivation_cache`). 재색인·갱신 시 내용이 같은 청크는 재계산을 건너뛴다 (측정: cold 1879s → warm 13s ≈ 145배). 캐시 키에 모델·프롬프트·차원 버전이 포함돼 버전 변경 시 자동 무효화된다 (cascade 안전). 별도 설정 없이 투명하게 동작한다. (현재 TTL/LRU 자동 정리는 미구현 — 누적된 캐시는 `kebab reset` 으로만 정리.)
|
||||
|
||||
### 외부 계산 + 로컬 검색 워크플로
|
||||
|
||||
search/ask 는 asset 파일 없이 `kebab.sqlite` + `lancedb` 만으로 동작한다. 비싼 색인(임베딩·OCR·별칭 생성)을 성능 좋은 서버에서 수행한 뒤, 이 두 산출물만 로컬로 복사하면 그대로 검색·질문할 수 있다.
|
||||
|
||||
### 멀티미디어 색인
|
||||
|
||||
Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go/Java/Kotlin/C/C++ AST) · 리소스(YAML/Dockerfile/TOML/JSON/XML 등)를 확장자에 따라 자동으로 적절한 chunker 에 라우팅한다. embedded text 가 없는 scanned PDF 는 `[pdf.ocr]` 로 page-단위 OCR (opt-in). 전체 확장자→chunker 매핑은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
|
||||
### RAG (근거 인용 + 거절)
|
||||
|
||||
검색 결과를 근거로 LLM 답변을 생성하고 [#번호] 인용을 단다. 근거가 부족하면 답을 지어내지 않고 거절한다. compound 질문은 `--multi-hop` 으로 분해→synthesize. 답변의 groundedness 는 mDeBERTa XNLI 로 검증할 수 있다 (`[rag] nli_threshold`, default off).
|
||||
|
||||
### TUI
|
||||
|
||||
`kebab tui` 는 Ratatui 셸 — Library / Search / Ask / Inspect 패널을 vim-style 모드로 다룬다. 키 매핑은 앱 내 `F1` cheatsheet 가 권위 소스다.
|
||||
|
||||
## 명령
|
||||
|
||||
| 명령 | 동작 |
|
||||
|------|------|
|
||||
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
|
||||
| `kebab ingest [<path>]` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor <opaque>] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. |
|
||||
| `kebab ingest [<path>]` | 워크스페이스 스캔 후 새/변경 문서 색인 (idempotent · incremental, `--force-reingest` 로 강제 재처리). 미지원 확장자는 자동 skip |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능 — `_external/` 로 deterministic copy) |
|
||||
| `kebab ingest-stdin --title <T>` | stdin 의 markdown 본문 ingest |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 필터/budget flag 는 `--help` |
|
||||
| `kebab ask "<query>" [flags]` | RAG 답변 + 근거 인용 (Ollama 필요). `--session` (multi-turn) · `--stream` · `--multi-hop` |
|
||||
| `kebab list docs` | 색인된 문서 목록 |
|
||||
| `kebab inspect doc <id>` / `kebab inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab fetch chunk <id> [--context N]` / `kebab fetch doc <id> [--max-tokens N]` / `kebab fetch span <doc_id> <ls> <le> [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. |
|
||||
| `kebab ask "<query>" [--show-citations / --hide-citations] [--session <id>] [--stream]` | RAG 답변 + 근거 인용. 답변 후 `근거:` block 으로 full path / line range / score 한 줄씩 (default ON — `--hide-citations` 로 끄기, pipe 시 유용). 근거 부족 시 거절. Ollama 필요. `--session <id>` 로 multi-turn — 첫 호출에서 SQLite `chat_sessions` 에 자동 생성, 이후 호출은 prior turns 를 history 로 받아 follow-up. session id 는 사용자 지정 (e.g. `kb-rust-async-2026-05`) — `kebab reset --data-only` 로 모든 session wipe. **`--stream` (p9-fb-33)** 로 ndjson `answer_event.v1` event (retrieval_done → token* → final) 를 stderr 에 흘리고 stdout 마지막 줄에 기존 `answer.v1` — agent 가 token 즉시 소비 가능 |
|
||||
| `kebab doctor` | 설정/모델/DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중). Library 에서 `r` 키로 background ingest 시작 — 화면 하단 status bar 가 진행 표시, 완료/abort 시 final 라인 잠시 유지 후 자동 hide. ingest 진행 중 `Esc` / `Ctrl-C` 가 cancel signal (그 외에는 quit). vim-style mode (header 우측 `-- NORMAL --` / `-- INSERT --`) — Library/Inspect 는 자동 NORMAL, Search/Ask 는 자동 INSERT. `i` 로 Normal→Insert (모든 pane — p9-fb-21), `Esc` 로 Insert→Normal 어디서나. mode-authoritative dispatch — Search 의 `j/k/o/g`, Ask 의 `e/j/k` 는 NORMAL 모드에서만 명령으로 동작, INSERT 에서는 입력 문자로 typing. (Search 의 chunk inspect 키는 `i`→`o` 로 rebind — `i` 가 universal Insert toggle.) **`F1` 로 cheatsheet popup** (현재 pane 의 키 매핑 + global 토글 표) — `Esc` / `F1` 로 닫기. Search 패널은 200ms debounce 후 background worker 가 검색 — 키 입력으로 UI freeze 안 됨, 사용자가 계속 타이핑하면 stale 결과 자동 폐기 (generation counter). Ask 패널은 multi-turn — 같은 conversation 안에서 Q1/A1, Q2/A2 transcript 누적, 다음 질문이 이전 턴을 history 로 받아 답변. 답변 본문은 markdown 렌더 (bold/italic/inline code/heading/list/code fence/table/blockquote, raw `**bold**` 가 실제 굵게 표시). `Ctrl-L` 로 새 conversation 시작. Search 의 `g` 키가 `$EDITOR` (기본 `vi`) 로 hit 의 citation 위치 열기 — 종료 후 TUI 화면이 자동으로 깨끗이 redraw. CLI `kebab ask` 는 raw markdown 그대로 (terminal 호환성 위해). Library 의 doc-list 가 한글 / 일본어 / 중국어 (CJK) 제목을 wide-char 정확한 column width 로 truncate — 한글 제목이 한 줄을 넘기지 않음 (CJK 1 자 = 2 col). Search/Ask/Filter 입력의 cursor 가 wide char 위에서 column 단위로 정렬 — 한글 입력 시 caret 이 글자 옆에 정확히 놓임. `← / →` 로 입력 문자열 중간 cursor 이동 (한글 한 글자 = 2 column 이라도 한 번에 이동), `Home / End` 로 양 끝 점프, `Delete` 로 cursor 위치 char 삭제 — 모든 input pane (Ask / Search / Library filter overlay) 동일 (p9-fb-22). Ask 트랜스크립트는 새 답변이 viewport 아래로 누적될 때 자동으로 tail 을 따라감 (auto-scroll); `j` / `k` 로 위로 스크롤하면 freeze, `Shift-G` 로 다시 bottom + auto-tail 재개. 화면 하단 hint line 은 한국어 동사구로 (`"위로"` / `"아래로"` / `"필터"` / `"타이핑 검색어"` / `"Esc 로 NORMAL 모드"` / `"i 입력모드"` 등) + 현재 (pane, mode) 조합에 맞춰 자동 분기, **첫 fragment 가 항상 `F1 도움말`** (cheatsheet 발견성 보장). 
모든 모드에서 항상 떠 있는 상태바 — `kebab v<version> │ <pane> │ <docs> docs │ <state>` (state: streaming/searching/indexing/idle, ingest 진행 중에는 progress 가 같은 자리에 흡수됨). Ask 진입 시 conversation id 8 자 prefix 도 함께 표시. Ask 트랜스크립트와 Inspect 양쪽에서 `PgUp / PgDn` 으로 10 줄씩 페이지 스크롤. Library 의 doc list 위에는 `TITLE / TAGS / UPDATED / CHUNKS` 컬럼 헤더 행 표시 (display-width 정렬, Hangul / CJK 안전). |
|
||||
| `kebab reset [--all / --data-only / --vector-only / --config-only] [--yes]` | XDG 데이터 wipe. **Irreversible.** TTY 면 confirm prompt, 아니면 `--yes` 필수. `--vector-only` 는 SQLite `embedding_records` 도 함께 truncate (orphan 방지) |
|
||||
| `kebab eval run / compare` | golden query 회귀 측정 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats 한 번에. `--json` 은 `schema.v1` wire; 사람 모드는 서식 출력. **stats 에 (p9-fb-37) `media_breakdown` (5 keys: markdown / pdf / image / audio / other) + `lang_breakdown` (BCP-47 코드, NULL 은 literal `"null"`) + `index_bytes` (sqlite + lancedb on-disk 합계) + `stale_doc_count` (`config.search.stale_threshold_days` 초과 doc 수) 추가.** |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능). 바이트는 `<workspace.root>/_external/<hash12>.<ext>` 로 copy. `.kebabignore` 매치 시 stderr warn 후 진행 (explicit ingest 가 bypass intent). |
|
||||
| `kebab ingest-stdin --title <T> [--source-uri <URI>]` | stdin 의 markdown 본문 ingest. frontmatter (title + source_uri) 자동 prepend. v1 markdown only. |
|
||||
| `kebab mcp` | MCP (Model Context Protocol) stdio server. agent host (Claude Code / Cursor / OpenAI Agents) 가 spawn 하여 tool 호출 (`search` / `bulk_search` / `ask` / `fetch` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`). `--config` honor. |
|
||||
| `kebab inspect doc <id>` / `inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab fetch chunk\|doc\|span <id> [flags]` | indexed corpus 에서 verbatim text fetch |
|
||||
| `kebab eval run \| aggregate \| compare \| variants` | golden query 회귀 측정 + 변형 일관성 진단 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats |
|
||||
| `kebab doctor` | 설정 / 모델 / DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library / Search / Ask / Inspect) |
|
||||
| `kebab mcp` | MCP stdio server (`search` / `bulk_search` / `ask` / `fetch` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`) |
|
||||
| `kebab reset [--all \| --data-only \| --vector-only \| --config-only \| --orphans-only] [--yes]` | XDG 데이터 wipe (**irreversible**) |
|
||||
|
||||
모든 명령에 `--json` 플래그. 출력은 frozen wire schema v1 (`schema_version` 항상 포함, 예: `ingest_report.v1`, `ingest_progress.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`, `reset_report.v1`, `schema.v1`). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 unchanged).
|
||||
모든 명령에 `--json` 플래그가 있고, 출력은 frozen **wire schema v1** 을 따른다 (`schema_version` 항상 포함). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 불변). 글로벌 flag: `--readonly` (write-path 비활성화), `--quiet` (human stderr 억제), env `KEBAB_PROGRESS=plain`. 전체 flag·wire 의미는 `kebab <cmd> --help` 와 [docs/wire-schema/v1/](docs/wire-schema/v1/). 외부 agent 통합(Claude Code skill / MCP)은 [docs/mcp-usage.md](docs/mcp-usage.md) 와 [integrations/](integrations/).
|
||||
|
||||
글로벌 플래그: `--readonly` (또는 `KEBAB_READONLY=1`) — 모든 write-path 명령 (`ingest` / `ingest-file` / `ingest-stdin` / `reset`) 을 비활성화, exit 1. `--quiet` — 진행 바 / hint 등 human-readable stderr 억제 (exit code / stdout 출력은 그대로). `KEBAB_PROGRESS=plain` — TTY 가 없는 환경에서도 진행 상황을 plain-text 한 줄씩 stderr 로 출력 (spinner 대신).
|
||||
## Configuration
|
||||
|
||||
### Score 해석 (fb-38)
|
||||
`~/.config/kebab/config.toml` 은 `kebab init` 가 XDG 경로에 생성한다. 핵심 노브만 정리한다 (전체 절은 생성된 파일 주석 참고, 예시는 [docs/SMOKE.md](docs/SMOKE.md)).
|
||||
|
||||
`search_hit.v1.score` 는 **ranking signal** 이지 confidence 가 아니다. `score_kind` 필드로 의미 선언:
|
||||
```toml
|
||||
[workspace]
|
||||
root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경로 가능.
|
||||
# 상대 경로의 base 는 config.toml 위치 (cwd 무관).
|
||||
|
||||
| `score_kind` | 의미 | 범위 |
|
||||
|--------------|------|------|
|
||||
| `rrf` (hybrid) | RRF normalized | `[0, 1]`, ceiling = 1.0 (양 채널 rank=1) |
|
||||
| `bm25` (lexical) | raw BM25 | unbounded (≥ 0) |
|
||||
| `cosine` (vector) | cosine sim | `[-1, 1]` |
|
||||
[models.embedding]
|
||||
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)
|
||||
# / "none"(lexical-only). candle 는 같은 모델·같은 벡터를
|
||||
# 순수 Rust 로 돌려 NUMA 서버의 onnxruntime 48-스레드
|
||||
# double-free 를 피하는 opt-in 백엔드 (재색인 불필요).
|
||||
model = "multilingual-e5-large" # 다국어 sentence embedding (1024-dim).
|
||||
# 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드.
|
||||
# candle provider 는 safetensors (~2GB) 다운로드.
|
||||
dimensions = 1024 # config 와 LanceDB stored dim 불일치 시 검색 0건.
|
||||
num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto=#cores).
|
||||
# env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은
|
||||
# numactl 과 조합. fastembed provider 는 무시.
|
||||
|
||||
#### RRF 수식 (hybrid mode)
|
||||
[models.llm]
|
||||
endpoint = "http://localhost:11434" # Ollama host:port
|
||||
model = "gemma4:e4b"
|
||||
# request_timeout_secs = 300 # 큰 모델은 늘림. 0 은 disable 이 아니라 "즉시 timeout".
|
||||
|
||||
```
|
||||
chunk c 의 raw RRF = Σ_m 1 / (k_rrf + rank_m(c))
|
||||
[ingest.expansion] # doc-side expansion 별칭 (opt-in)
|
||||
enabled = false # true 면 청크당 LLM 호출로 별칭 생성 — 비용 큼.
|
||||
embed_aliases = true # 별칭을 줄별 개별 dense 벡터로 색인.
|
||||
max_aliases_per_chunk = 8
|
||||
|
||||
여기서 m ∈ {lexical, vector}, k_rrf = config.search.rrf_k (default 60).
|
||||
양 채널 모두 rank=1 일 때 raw RRF = 2 / (k_rrf + 1) ≈ 0.0328.
|
||||
[search]
|
||||
stale_threshold_days = 30 # search hit / citation 의 stale 플래그 기준 (0 = off).
|
||||
|
||||
normalize: rrf_score = raw_rrf / (2 / (k_rrf + 1))
|
||||
→ rrf_score ∈ [0, 1]. 양쪽 rank=1 → 1.0, 한 쪽만 등장 → ≈ 0.5 천장.
|
||||
[rag]
|
||||
prompt_template_version = "rag-v3" # 답변 언어 = 질문 언어. rag-v1/v2 는 legacy.
|
||||
nli_threshold = 0.0 # >0 (예: 0.5) 면 mDeBERTa XNLI groundedness 검증.
|
||||
```
|
||||
|
||||
`rrf_score = 0.5` 의 의미: chunk 가 한 채널 (lexical 또는 vector) 에서만 rank 1 로 등장. confidence 50% 가 아님 — RRF 수식의 산술적 천장.
|
||||
- **파생물 캐시** — embedding·별칭 결과를 내용 해시로 자동 캐싱한다 (위 「핵심 기능」 참고). 설정 항목 없음.
|
||||
- **`[ingest.code]`** — code ingest 의 skip 정책 (`skip_generated_header`, `max_file_bytes`, `extra_skip_globs`). `.gitignore` 자동 honor, `.kebabignore` 는 추가 layer.
|
||||
- **`[pdf.ocr]`** — scanned PDF 의 page-단위 OCR (default off / opt-in, page 당 ~수십 초 cost). 활성화 후 v0.19 시절 색인분은 `kebab ingest --force-reingest` 로 재처리.
|
||||
- **`--config <path>`** — 임시 워크스페이스 / 격리 테스트용 (CLI · TUI 모두 honor).
|
||||
- **`kebab config migrate`** — 새 버전에서 추가된 config 섹션을 기존 `config.toml` 에 설명 주석과 함께 채워 넣는다 (사용자가 손본 값·주석·순서는 보존, 멱등, 변경 시 자동 `.bak` 백업). `--dry-run` 으로 변경 미리보기. `kebab doctor` 가 갱신 필요 시 안내한다. `kebab init` 으로 새로 생성되는 config.toml 도 섹션별 주석을 포함한다.
|
||||
- **`KEBAB_*` env** — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN` 등).
|
||||
- **XDG layout**: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
|
||||
agent 가 trust threshold 가 필요하면 top-level `score` 가 아닌 nested `retrieval.lexical_score` (BM25 raw) / `retrieval.vector_score` (cosine raw) 사용.
|
||||
|
||||
## 논리 아키텍처
|
||||
## 아키텍처
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
@@ -131,8 +151,8 @@ flowchart TB
|
||||
end
|
||||
|
||||
subgraph Pipeline["도메인 + 파이프라인"]
|
||||
parse["parse-md / parse-pdf / parse-image"]
|
||||
chunker["chunker (md-heading-v1, pdf-page-v1)"]
|
||||
parse["parse-md / parse-pdf / parse-image / parse-code"]
|
||||
chunker["chunker (md / pdf / code-AST / manifest)"]
|
||||
embedder["embedder (fastembed multilingual-e5-large)"]
|
||||
retriever["retriever (lexical / vector / hybrid RRF)"]
|
||||
rag["RAG pipeline"]
|
||||
@@ -174,65 +194,22 @@ flowchart TB
|
||||
rag --> ollama
|
||||
```
|
||||
|
||||
`kebab-app` 가 facade — UI binary 가 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 자세한 crate-level 의존성 + 디렉토리 + 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
v0.21.0 기준 핵심 설계:
|
||||
|
||||
## Configuration
|
||||
- **crate facade** — `kebab-app` 가 유일한 facade다. UI binary (`kebab-cli` / `kebab-tui`) 는 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 각 user-facing 엔트리는 `*_with_config(cfg, …)` 동반 함수로 explicit config 를 thread 한다.
|
||||
- **chunk_id 는 위치 기반** — chunk 의 정체성은 문서 내 위치(ordinal + span)다. 반면 파생물 캐시 키는 **내용 해시**라, 내용이 같으면 위치·문서가 달라도 동일 캐시를 재사용한다.
|
||||
- **wire schema v1** — 모든 `--json` 출력은 `schema_version` 을 담는 frozen contract다. 깨는 변경은 `*.v2` major bump을 요구한다.
|
||||
- **versioning cascade** — `parser_version` / `chunker_version` / `embedding_version` / `prompt_template_version` / `index_version` 변경은 downstream record(청크·임베딩·캐시·eval)를 무효화한다.
|
||||
|
||||
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절.
|
||||
- `[models.embedding]` —
|
||||
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
|
||||
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
|
||||
- `[ui] theme = "dark" | "light"` 로 TUI 팔레트 선택 (default `"dark"`, 알 수 없는 값은 dark fallback).
|
||||
- `[search] stale_threshold_days = 30` (p9-fb-32) — search hit / RAG citation 의 `stale` 플래그 기준 (default 30 일, `0` 으로 비활성화). 옛 config 의 `workspace.include = [...]` 은 silently 무시 + 단발 deprecation warning (p9-fb-25).
|
||||
- `[rag] prompt_template_version` (default `"rag-v2"`) — RAG system prompt version. `"rag-v1"` 은 legacy backwards-compat (사용자 명시 시 유지). v2 강화 규칙: (1) fact 인용 시 [#번호] 앞에 chunk 속 원문 큰따옴표 표기, (2) 학습 지식 동원 금지, (3) 근거 모호 시 "확실하지 않다" 명시.
|
||||
- `--config <path>` flag — 임시 워크스페이스 / 격리 테스트 시 사용. CLI / TUI 모두 honor.
|
||||
- `KEBAB_*` env — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN`, `KEBAB_COMMIT_HASH` 등).
|
||||
- XDG layout: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
- `workspace.root` 경로 형식: 절대 (`/foo/bar`) / tilde (`~/KnowledgeBase`, default) / env (`${XDG_DATA_HOME}/kebab`) / 상대 (`./notes`, `notes`, `../shared/x`) 모두 가능. **상대 경로의 base 는 config.toml 자체가 위치한 디렉토리** — 사용자의 `cwd` 와 무관 (`--config /tmp/cfg.toml` + `root = "kb"` → `/tmp/kb`). p9-fb-05 정책.
|
||||
|
||||
config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.
|
||||
|
||||
## 외부 AI 통합
|
||||
|
||||
`--json` 출력 + frozen wire schema v1 이 stable contract. 통합 옵션:
|
||||
|
||||
- **Claude Code skill** — repo 의 [`integrations/claude-code/`](integrations/claude-code/) 가 ship-ready skill. `cp -r integrations/claude-code/kebab ~/.claude/skills/` 한 번이면 새 Claude Code 세션부터 자동 trigger (내부 시스템 / 위키 lookup / 사내 runbook 질문). multi-turn 은 `kebab ask --session <id> --json` 으로 영속 — skill 이 conversation id 관리하면 외부 agent 도 `--repl` 없이 stateful 대화 가능 (p9-fb-18).
|
||||
- **Codex / 기타 agent host** — `--json` + frozen wire schema v1 이 stable contract. 동일 패턴으로 ~50줄 wrapper 작성 가능. `integrations/<host>/` 에 추가 PR 환영.
|
||||
- **MCP server** — stdio JSON-RPC 로 `kebab-app` facade 1:1 노출. `kebab mcp` 참조.
|
||||
- **HTTP wrapper** — `kebab serve --bind 127.0.0.1:7711` (P+, local-only 가치 신중).
|
||||
|
||||
## MCP 사용
|
||||
|
||||
`kebab mcp` 가 stdio MCP server. 8 tool: `search` / `bulk_search` (p9-fb-42 — N query 한 번에) / `ask` / `fetch` (p9-fb-35) / `schema` / `doctor` / `ingest_file` / `ingest_stdin`.
|
||||
|
||||
Claude Code 빠른 등록 (`~/.claude/mcp.json` 또는 host 동등 위치):
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"kebab": {
|
||||
"command": "kebab",
|
||||
"args": ["mcp"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
자세한 사용법 (Cursor / OpenAI Agents / Copilot CLI config, per-tool 입출력 예시, troubleshooting, multi-turn ask + session 관리, performance / security) — **[docs/mcp-usage.md](docs/mcp-usage.md)** 참조.
|
||||
crate-level 의존성 그래프 · 디렉토리 트리 · 확장자→chunker 전체 매핑 · 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md), 진척도는 [HANDOFF.md](HANDOFF.md).
|
||||
|
||||
## 비-목표
|
||||
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / 모든 파일 포맷의 완벽한 parsing / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding / `kebab://` protocol handler — frozen 설계 §11 / §0 참조.
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding — frozen 설계 §0 / §11 참조.
|
||||
|
||||
## 라이선스
|
||||
## 버전 / 라이선스 / 참고
|
||||
|
||||
`MIT OR Apache-2.0` (workspace `Cargo.toml` 의 `license` 필드).
|
||||
|
||||
## 참고
|
||||
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md)
|
||||
- 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)
|
||||
- Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md)
|
||||
- 머지 후 hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md)
|
||||
- Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md)
|
||||
- **버전**: v0.21.0 (`kebab --version` 으로 확인).
|
||||
- **라이선스**: `MIT OR Apache-2.0`.
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md) · 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) · Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md) · Hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md) · Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md) · MCP 사용: [docs/mcp-usage.md](docs/mcp-usage.md)
|
||||
|
||||
@@ -12,17 +12,21 @@ kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-source-fs = { path = "../kebab-source-fs" }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-types = { path = "../kebab-parse-types" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
kebab-chunk = { path = "../kebab-chunk" }
|
||||
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
|
||||
kebab-store-vector = { path = "../kebab-store-vector" }
|
||||
kebab-search = { path = "../kebab-search" }
|
||||
kebab-embed = { path = "../kebab-embed" }
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
kebab-embed-candle = { path = "../kebab-embed-candle" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
kebab-rag = { path = "../kebab-rag" }
|
||||
# p9-fb-41 PR-9c-2: facade construction of OnnxNliVerifier when
|
||||
# `[rag] nli_threshold > 0`. Trait-only consumption via kebab-rag's
|
||||
# `with_verifier`; no kebab-nli internals leak into kebab-app code
|
||||
# beyond the construction site in `open_with_config`.
|
||||
kebab-nli = { path = "../kebab-nli" }
|
||||
# P6-4: image extractor + OCR + caption adapters live here. App
|
||||
# threads them into the per-asset dispatch (see `ingest_one_asset`
|
||||
# image branch). Trait-only consumption — no `kebab-parse-image`
|
||||
@@ -32,11 +36,21 @@ kebab-parse-image = { path = "../kebab-parse-image" }
|
||||
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
|
||||
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
|
||||
lopdf = { workspace = true }
|
||||
# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
|
||||
# jpeg feature added explicitly (F3 closure-r1) rather than relying on
|
||||
# feature unification via kebab-parse-image.
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# p10-1A-2: Rust AST extractor lives here. App threads it into the
|
||||
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
|
||||
kebab-parse-code = { path = "../kebab-parse-code" }
|
||||
anyhow = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
@@ -54,21 +68,38 @@ unicode-normalization = "0.1"
|
||||
ignore = "0.4"
|
||||
# p9-fb-34: opaque pagination cursor encodes payload as base64.
|
||||
base64 = { workspace = true }
|
||||
# Enhancement 3 (v0.20.x r2): direct SQL queries for inspect_ocr_stats/failures.
|
||||
rusqlite = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
# doc-side expansion (Phase 2) Task 4: ExpansionGenerator unit tests build
|
||||
# MockLanguageModel (gated behind kebab-llm's `mock` feature, default OFF in
|
||||
# [dependencies]). Enabling it here turns it on for the test build only.
|
||||
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
|
||||
rusqlite = { workspace = true }
|
||||
filetime = "0.2"
|
||||
tempfile = { workspace = true }
|
||||
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
|
||||
# / caption HTTP calls. Async runtime to host the mock server only;
|
||||
# the kebab-app code under test stays sync.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
|
||||
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
|
||||
# to the same major (0.32) so byte output is identical between the two
|
||||
# fixture surfaces.
|
||||
lopdf = "0.32"
|
||||
lopdf = { workspace = true }
|
||||
# error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
|
||||
# reqwest::Error (private constructor) — built from a connect-refused call.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
|
||||
[features]
|
||||
# Marker feature — spec §6.3 Option A (단순): lindera 는 kebab-chunk 가 default dep 으로 소유.
|
||||
# 비활성화 경로는 없다; 이 feature 는 spec §6.3 의 명시를 따르기 위한 marker 역할만 한다.
|
||||
default = ["fts_korean_morphological"]
|
||||
fts_korean_morphological = []
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
@@ -40,11 +40,18 @@ use anyhow::{Context, Result, anyhow};
|
||||
use lru::LruCache;
|
||||
|
||||
use kebab_core::{
|
||||
Answer, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit, SearchMode,
|
||||
SearchOpts, SearchQuery, VectorStore,
|
||||
Answer, DocumentStore, Embedder, ExtractContext, Extractor, IndexVersion, LanguageModel,
|
||||
MediaType, Retriever, SearchHit, SearchMode, SearchOpts, SearchQuery, VectorStore,
|
||||
};
|
||||
use kebab_embed_candle::CandleEmbedder;
|
||||
use kebab_embed_local::FastembedEmbedder;
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_code::{
|
||||
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
|
||||
KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor,
|
||||
};
|
||||
use kebab_parse_image::ImageExtractor;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use kebab_rag::{AskOpts, RagPipeline};
|
||||
use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
@@ -73,6 +80,14 @@ pub struct SearchResponse {
|
||||
/// p9-fb-37: present when caller passed `SearchOpts.trace = true`.
|
||||
/// Consumers that ignore trace should leave this `None`.
|
||||
pub trace: Option<kebab_core::SearchTrace>,
|
||||
/// v0.17.0 A5 Step 4b: human / agent-readable advisory string set
|
||||
/// when the empty hit list is likely due to a query shorter than the
|
||||
/// FTS5 trigram tokenizer's 3-char minimum. `None` otherwise. CLI
|
||||
/// surfaces it on stderr (text mode); MCP / `--json` consumers
|
||||
/// surface it however they prefer. See
|
||||
/// `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`
|
||||
/// §3.3.
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// Facade state — see module docs for lifetime rules.
|
||||
@@ -84,6 +99,12 @@ pub struct SearchResponse {
|
||||
pub struct App {
|
||||
pub(crate) config: kebab_config::Config,
|
||||
pub(crate) sqlite: Arc<SqliteStore>,
|
||||
/// post-v0.18.0 extractor-dispatch-unification: polymorphic Extractor
|
||||
/// registry. App init 시 1회 등록되어 `extract_for(...)` 가 lookup
|
||||
/// 한다. 현재 11 entry (ImageExtractor + PdfTextExtractor + 9 AST).
|
||||
/// MarkdownExtractor 는 별 PR 에서 추가 — markdown ingest path 는
|
||||
/// 본 PR 에서 free-function 그대로 유지.
|
||||
pub(crate) extractors: Vec<Box<dyn Extractor + Send + Sync>>,
|
||||
/// Memoized embedder — built lazily on first `embedder()` call when
|
||||
/// embeddings are enabled. `OnceLock` keeps the struct `Sync` and
|
||||
/// the build path cold-only-once.
|
||||
@@ -102,6 +123,17 @@ pub struct App {
|
||||
/// `corpus_revision` snapshot embedded in `SearchCacheKey`
|
||||
/// invalidates every entry the moment a new ingest commit lands.
|
||||
search_cache: Option<Mutex<LruCache<SearchCacheKey, Vec<SearchHit>>>>,
|
||||
/// p9-fb-41 PR-9c-2: NLI verifier built eagerly at
|
||||
/// `open_with_config` time when `config.rag.nli_threshold > 0`,
|
||||
/// consumed by `RagPipeline::with_verifier` on every `ask` /
|
||||
/// `ask_with_session` call. `None` when the gate is disabled
|
||||
/// (default, threshold = 0) — multi-hop skips step 8.5 entirely
|
||||
/// and single-pass never touches the verifier.
|
||||
///
|
||||
/// Built eagerly (not lazy) so the `open_with_config` `?`
|
||||
/// propagation surfaces NLI model construction errors at App
|
||||
/// boot time, before any user query runs.
|
||||
pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>>,
|
||||
}
|
||||
|
||||
/// p9-fb-19: cache key for `App::search`. Includes every field that
|
||||
@@ -158,20 +190,108 @@ impl App {
|
||||
sqlite
|
||||
.run_migrations()
|
||||
.context("kb-app: run SqliteStore migrations")?;
|
||||
// V009 의 tokenized_korean_text column 의 first-boot eager backfill.
|
||||
// 신규 ingest 의 chunks_ai trigger 가 이미 채우므로 NULL row 가 없으면 즉시 0 반환 (idempotent).
|
||||
// V007 → V009 업그레이드 시 KB 크기 비례 (~10000 chunk 당 ~30-60s).
|
||||
let backfill_count = sqlite
|
||||
.backfill_tokenized_korean_text(
|
||||
|done, total| {
|
||||
if total > 0 && done % 500 == 0 {
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill: {done}/{total}"
|
||||
);
|
||||
}
|
||||
},
|
||||
kebab_chunk::tokenize_korean_morphological,
|
||||
)
|
||||
.unwrap_or_else(|e| {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill failed: {e}"
|
||||
);
|
||||
0
|
||||
});
|
||||
if backfill_count > 0 {
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill complete: {backfill_count} chunks updated"
|
||||
);
|
||||
}
|
||||
// p9-fb-19: build the LRU cache from config. Capacity 0 →
|
||||
// `None` (cache disabled — every search hits the retrievers).
|
||||
let search_cache = NonZeroUsize::new(config.search.cache_capacity)
|
||||
.map(|cap| Mutex::new(LruCache::new(cap)));
|
||||
// post-v0.18.0 extractor-dispatch-unification: build the 11-entry
|
||||
// Extractor registry. All entries are state-less unit structs with
|
||||
// zero-cost `new()`, so init cost is effectively 0 and side effects
|
||||
// are 0 — `pipeline_verifier` fallible `?` below may bail but the
|
||||
// already-constructed `extractors` Vec drops without cost. Markdown
|
||||
// is NOT registered (see field doc).
|
||||
let extractors: Vec<Box<dyn Extractor + Send + Sync>> = vec![
|
||||
Box::new(ImageExtractor::new()),
|
||||
Box::new(PdfTextExtractor::new()),
|
||||
Box::new(RustAstExtractor::new()),
|
||||
Box::new(PythonAstExtractor::new()),
|
||||
Box::new(TypescriptAstExtractor::new()),
|
||||
Box::new(JavascriptAstExtractor::new()),
|
||||
Box::new(GoAstExtractor::new()),
|
||||
Box::new(JavaAstExtractor::new()),
|
||||
Box::new(KotlinAstExtractor::new()),
|
||||
Box::new(CAstExtractor::new()),
|
||||
Box::new(CppAstExtractor::new()),
|
||||
];
|
||||
// p9-fb-41 PR-9c-2: build the NLI verifier when the gate is
|
||||
// enabled. App carries it on `RagPipeline` via
|
||||
// `with_verifier` so the rag crate doesn't have to know about
|
||||
// kebab-nli construction. Failure (`?`) surfaces as a user-
|
||||
// facing error at App boot — never a panic in the pipeline's
|
||||
// `expect("verifier must be Some when nli_threshold > 0.0")`.
|
||||
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> = if config.rag.nli_threshold
|
||||
> 0.0
|
||||
{
|
||||
let v = kebab_nli::OnnxNliVerifier::new(&config)
|
||||
.context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?;
|
||||
Some(Arc::new(v))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
config,
|
||||
sqlite: Arc::new(sqlite),
|
||||
extractors,
|
||||
embedder: OnceLock::new(),
|
||||
vector: OnceLock::new(),
|
||||
llm: OnceLock::new(),
|
||||
search_cache,
|
||||
pipeline_verifier,
|
||||
})
|
||||
}
|
||||
|
||||
/// Polymorphic dispatcher for the [`Extractor`] trait. Looks up the
|
||||
/// first Extractor whose `supports(media)` returns true and invokes
|
||||
/// `extract(ctx, bytes)` on it.
|
||||
///
|
||||
/// Errors with `anyhow!("no Extractor for media_type {media:?}")`
|
||||
/// when no matching Extractor is registered. Callers in
|
||||
/// `ingest_one_*_asset` reach this only after the outer 4-arm
|
||||
/// dispatch (`MediaType::Markdown` / `Image` / `Pdf` / `Code(lang)`)
|
||||
/// has matched, so a miss is a programming error — NOT a user-
|
||||
/// facing skip.
|
||||
pub(crate) fn extract_for(
|
||||
&self,
|
||||
media: &MediaType,
|
||||
ctx: &ExtractContext<'_>,
|
||||
bytes: &[u8],
|
||||
) -> Result<kebab_core::CanonicalDocument> {
|
||||
// Scan the registry in its stored order; the first extractor whose
// `supports(media)` returns true is the one used.
let extractor = self
|
||||
.extractors
|
||||
.iter()
|
||||
.find(|e| e.supports(media))
|
||||
// No registered extractor matched: return an error that names the
// media type instead of panicking.
.ok_or_else(|| anyhow!("no Extractor for media_type {media:?}"))?;
|
||||
// Delegate to the selected extractor and hand its result back unchanged.
extractor.extract(ctx, bytes)
|
||||
}
|
||||
|
||||
/// Run a [`SearchQuery`] through the configured retriever stack and
|
||||
/// return the top-k hits. p9-fb-19: result is served from the
|
||||
/// in-process LRU cache when the same `(query_norm, mode, k,
|
||||
@@ -235,7 +355,9 @@ impl App {
|
||||
// so other in-flight searches can use the cache concurrently.
|
||||
drop(guard);
|
||||
let hits = self.search_uncached(query)?;
|
||||
let mut guard = cache.lock().unwrap_or_else(|e| e.into_inner());
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.put(key, hits.clone());
|
||||
Ok(hits)
|
||||
}
|
||||
@@ -296,6 +418,15 @@ impl App {
|
||||
now,
|
||||
self.config.search.stale_threshold_days,
|
||||
);
|
||||
// p10-1A-2: backfill `code_lang` from the Citation::Code `lang`
|
||||
// field. The search layer (kebab-search) constructs SearchHit with
|
||||
// `code_lang: None`; we own the post-processing here in kebab-app
|
||||
// and can fill it cheaply from data already present in the hit.
|
||||
backfill_code_lang(&mut hits);
|
||||
// p10-1A-2 Task 8b: backfill `repo` from the document's
|
||||
// `Metadata.repo`. Unlike `code_lang`, this cannot be derived from
|
||||
// the Citation alone — it requires a store lookup by `doc_id`.
|
||||
self.backfill_repo(&mut hits);
|
||||
Ok(hits)
|
||||
}
|
||||
|
||||
@@ -306,11 +437,7 @@ impl App {
|
||||
///
|
||||
/// `SearchResponse.next_cursor` and `truncated` are independent
|
||||
/// signals — see `SearchResponse` doc for details.
|
||||
pub fn search_with_opts(
|
||||
&self,
|
||||
query: SearchQuery,
|
||||
opts: SearchOpts,
|
||||
) -> Result<SearchResponse> {
|
||||
pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result<SearchResponse> {
|
||||
use crate::cursor;
|
||||
|
||||
let corpus_revision = self.sqlite.corpus_revision().to_string();
|
||||
@@ -387,16 +514,19 @@ impl App {
|
||||
now,
|
||||
self.config.search.stale_threshold_days,
|
||||
);
|
||||
// p10-1A-2: backfill code_lang — same as search_uncached.
|
||||
backfill_code_lang(&mut traced_hits);
|
||||
// p10-1A-2 Task 8b: backfill repo — same as search_uncached.
|
||||
self.backfill_repo(&mut traced_hits);
|
||||
|
||||
// Apply offset + k_effective truncation (mirrors non-trace path).
|
||||
let drop_n = offset.min(traced_hits.len());
|
||||
traced_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
traced_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = traced_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Snippet truncation if opts.snippet_chars set (mirror non-trace path).
|
||||
if opts.snippet_chars.is_some() {
|
||||
for h in hits.iter_mut() {
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > snippet_chars {
|
||||
h.snippet = trim_to_chars(&h.snippet, snippet_chars);
|
||||
}
|
||||
@@ -405,28 +535,32 @@ impl App {
|
||||
|
||||
// Trace path skips the budget loop. Caller will inspect
|
||||
// `hits.len()` and `trace.timing` rather than paginate.
|
||||
let hint: Option<String> = None;
|
||||
return Ok(SearchResponse {
|
||||
hits,
|
||||
next_cursor: None,
|
||||
truncated: false,
|
||||
trace: Some(trace),
|
||||
hint,
|
||||
});
|
||||
}
|
||||
|
||||
// backfill_code_lang + backfill_repo are applied inside `search`
|
||||
// via `search_uncached` — no explicit call needed here. Trace
|
||||
// branch above calls them directly because it bypasses `search`.
|
||||
let mut all_hits = self.search(fetch_query)?;
|
||||
|
||||
// Skip offset.
|
||||
let drop_n = offset.min(all_hits.len());
|
||||
all_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
all_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = all_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Apply snippet_chars override if shorter than what the
|
||||
// retriever returned (retriever already honored
|
||||
// `config.search.snippet_chars`; this only kicks in when the
|
||||
// caller asked for *less*).
|
||||
if opts.snippet_chars.is_some() {
|
||||
for h in hits.iter_mut() {
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > snippet_chars {
|
||||
h.snippet = trim_to_chars(&h.snippet, snippet_chars);
|
||||
}
|
||||
@@ -440,15 +574,11 @@ impl App {
|
||||
// Step 1: shorten snippets progressively to a 60-char floor.
|
||||
const SNIPPET_FLOOR: usize = 60;
|
||||
let mut current_snippet_cap = snippet_chars;
|
||||
while estimate_chars(&hits) > max_chars
|
||||
&& current_snippet_cap > SNIPPET_FLOOR
|
||||
{
|
||||
current_snippet_cap =
|
||||
(current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
for h in hits.iter_mut() {
|
||||
while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR {
|
||||
current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > current_snippet_cap {
|
||||
h.snippet =
|
||||
trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
h.snippet = trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
@@ -489,11 +619,13 @@ impl App {
|
||||
None
|
||||
};
|
||||
|
||||
let hint: Option<String> = None;
|
||||
Ok(SearchResponse {
|
||||
hits,
|
||||
next_cursor,
|
||||
truncated,
|
||||
trace: None,
|
||||
hint,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -502,11 +634,27 @@ impl App {
|
||||
pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
let retriever = self.build_retriever(opts.mode)?;
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
let pipeline = self.build_pipeline(retriever, llm);
|
||||
pipeline.ask(query, opts)
|
||||
}
|
||||
|
||||
/// p9-fb-41 PR-9c-2: shared pipeline builder used by [`Self::ask`]
|
||||
/// and [`Self::ask_with_session`]. Attaches the App-built NLI
|
||||
/// verifier (when `cfg.rag.nli_threshold > 0`) via
|
||||
/// `RagPipeline::with_verifier`, keeping the construction site in
|
||||
/// a single place so the two call paths can't drift.
|
||||
fn build_pipeline(
|
||||
&self,
|
||||
retriever: Arc<dyn Retriever>,
|
||||
llm: Arc<dyn LanguageModel>,
|
||||
) -> RagPipeline {
|
||||
// A new pipeline is built on every call from a clone of the App config
// and a clone of the `Arc<SqliteStore>` handle.
let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
// Attach the verifier only when the App holds one; otherwise return the
// pipeline exactly as constructed above.
match &self.pipeline_verifier {
|
||||
Some(v) => pipeline.with_verifier(v.clone()),
|
||||
None => pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-18: shared retriever-stack builder used by [`Self::ask`]
|
||||
/// and [`Self::ask_with_session`]. Lexical mode uses the FTS5
|
||||
/// retriever directly; vector / hybrid require embeddings (and
|
||||
@@ -571,12 +719,7 @@ impl App {
|
||||
/// returns; on persistence error, the answer is still returned
|
||||
/// (don't lose the user's compute) but the error is logged so
|
||||
/// the operator notices.
|
||||
pub fn ask_with_session(
|
||||
&self,
|
||||
session_id: &str,
|
||||
query: &str,
|
||||
opts: AskOpts,
|
||||
) -> Result<Answer> {
|
||||
pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
@@ -609,17 +752,13 @@ impl App {
|
||||
|
||||
// p9-fb-18 R1: shared retriever builder removes the prior
|
||||
// copy of `ask`'s 35-line stack — see [`Self::build_retriever`].
|
||||
// p9-fb-41 PR-9c-2: shared `build_pipeline` attaches the NLI
|
||||
// verifier when the gate is enabled.
|
||||
let retriever = self.build_retriever(opts.mode)?;
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
let answer = pipeline.ask_with_history(
|
||||
query,
|
||||
history,
|
||||
session_id.to_string(),
|
||||
next_index,
|
||||
opts,
|
||||
)?;
|
||||
let pipeline = self.build_pipeline(retriever, llm);
|
||||
let answer =
|
||||
pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?;
|
||||
|
||||
// Auto-create the session header on first use. Title from
|
||||
// the first question (≤40 chars after trim).
|
||||
@@ -660,7 +799,8 @@ impl App {
|
||||
turn_index: next_index,
|
||||
question: query.to_string(),
|
||||
answer: answer.answer.clone(),
|
||||
citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()),
|
||||
citations_json: serde_json::to_string(&answer.citations)
|
||||
.unwrap_or_else(|_| "[]".to_string()),
|
||||
created_at: now_unix,
|
||||
};
|
||||
if let Err(e) = self.sqlite.append_turn(&turn_row) {
|
||||
@@ -694,10 +834,26 @@ impl App {
|
||||
if let Some(e) = self.embedder.get() {
|
||||
return Ok(Some(e.clone()));
|
||||
}
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
|
||||
FastembedEmbedder::new(&self.config)
|
||||
.context("kb-app: load FastembedEmbedder")?,
|
||||
);
|
||||
// Provider branch (Track 1 spec §3). `embeddings_disabled()` above
|
||||
// already handled `"none"`; here we route the live providers.
|
||||
// `fastembed`/`onnx`/(empty) keep the default onnxruntime path
|
||||
// (vectors unchanged — `embedding_version` is preserved); `candle`
|
||||
// selects the pure-Rust NUMA-safe backend.
|
||||
let provider = self.config.models.embedding.provider.as_str();
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = match provider {
|
||||
"fastembed" | "onnx" | "" => Arc::new(
|
||||
FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?,
|
||||
),
|
||||
"candle" => Arc::new(
|
||||
CandleEmbedder::new(&self.config).context("kb-app: load CandleEmbedder")?,
|
||||
),
|
||||
other => {
|
||||
return Err(anyhow!(
|
||||
"kb-app: unknown embedding provider {other:?}; expected one of \
|
||||
`fastembed` (default), `candle`, or `none` (lexical-only)"
|
||||
));
|
||||
}
|
||||
};
|
||||
// `set` returns Err if another thread won the race; in that case
|
||||
// the loser still returns the (now-cached) winner via `get()`.
|
||||
let _ = self.embedder.set(emb.clone());
|
||||
@@ -772,19 +928,68 @@ impl App {
|
||||
/// clear` admin command). No-op when the cache is disabled.
|
||||
pub fn clear_search_cache(&self) {
|
||||
if let Some(cache) = self.search_cache.as_ref() {
|
||||
let mut guard = cache.lock().unwrap_or_else(|e| e.into_inner());
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// p10-1A-2 Task 8b: back-fill `SearchHit.repo` from the originating
|
||||
/// document's `Metadata.repo` for every hit whose `repo` field is
|
||||
/// currently `None`. The search layer (kebab-search) constructs hits
|
||||
/// with `repo: None` because it has no store access; we fill it here
|
||||
/// in kebab-app post-retrieval via a per-distinct-`doc_id` store lookup.
|
||||
///
|
||||
/// Deduplication: a small `HashMap` accumulates the
|
||||
/// `(doc_id → Option<String>)` mapping so each unique document is
|
||||
/// fetched at most once. Search result sets are small (default k ≤ 20),
|
||||
/// so the map overhead is negligible. A `None` entry is cached too
|
||||
/// (document not found or no repo in metadata) to avoid re-querying.
|
||||
///
|
||||
/// Non-repo documents (markdown, PDF, plain text, code files outside a
|
||||
/// git tree) correctly keep `repo: None` — `Metadata.repo` is already
|
||||
/// `None` for those, so the assignment is a no-op.
|
||||
fn backfill_repo(&self, hits: &mut [SearchHit]) {
|
||||
use kebab_core::DocumentId;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// doc_id → Option<String> where None means "not found / no repo"
|
||||
let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
|
||||
|
||||
for hit in hits.iter_mut() {
|
||||
// Hits that already carry a repo value are left untouched.
if hit.repo.is_some() {
|
||||
continue;
|
||||
}
|
||||
// The closure runs only the first time a doc_id is seen; later hits
// from the same document reuse the memoized entry, including a
// memoized `None` (so a failed lookup is not retried in this call).
let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| {
|
||||
// Deliberately non-aborting: a failed store lookup for
|
||||
// one hit must not abort the whole search response. Log
|
||||
// the error so it's observable rather than silently
|
||||
// dropped (review #140 round 1).
|
||||
match self.sqlite.get_document(&hit.doc_id) {
|
||||
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
doc_id = %hit.doc_id,
|
||||
error = %e,
|
||||
"backfill_repo: get_document failed; leaving hit.repo = None"
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
});
|
||||
// Copy the memoized repo onto the hit; a `None` entry leaves
// `hit.repo` as `None`.
if let Some(r) = repo_val {
|
||||
hit.repo = Some(r.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the embedder + vector store, surfacing the user-friendly
|
||||
/// "switch to --mode lexical" error when embeddings are disabled.
|
||||
fn require_embeddings(
|
||||
&self,
|
||||
) -> Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<LanceVectorStore>,
|
||||
)> {
|
||||
) -> Result<(Arc<dyn Embedder + Send + Sync>, Arc<LanceVectorStore>)> {
|
||||
let emb = self.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
@@ -806,8 +1011,16 @@ impl App {
|
||||
/// the active config. This token surfaces in `SearchHit.index_version`
|
||||
/// and on snapshot tests; including the chunker version pins it to
|
||||
/// the chunking policy in effect.
|
||||
///
|
||||
/// V009 (2026-05-28): the FTS5 tokenizer changed from trigram to
/// unicode61 plus a Korean morphological-decomposition column. The
/// `fts5-v009-korean-morphological` suffix distinguishes this version
/// from the V007 baseline, so the change is picked up by the eval
/// runner's config snapshot and by search-cache invalidation.
|
||||
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
|
||||
IndexVersion(format!(
|
||||
"lex:{}:fts5-v009-korean-morphological",
|
||||
config.chunking.chunker_version
|
||||
))
|
||||
}
|
||||
|
||||
/// p9-fb-37: stand-in for the vector retriever in the trace path when
|
||||
@@ -896,6 +1109,238 @@ fn estimate_chars(hits: &[SearchHit]) -> usize {
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// p10-1A-2: back-fill `SearchHit.code_lang` from `Citation::Code.lang`
|
||||
/// for every code hit in the list. The search layer (kebab-search)
|
||||
/// constructs hits with `code_lang: None`; we fill it here in kebab-app
|
||||
/// post-retrieval so callers see the correct language identifier without
|
||||
/// requiring a second SQL query.
|
||||
fn backfill_code_lang(hits: &mut [SearchHit]) {
|
||||
for hit in hits.iter_mut() {
|
||||
if let kebab_core::Citation::Code { lang, .. } = &hit.citation {
|
||||
if hit.code_lang.is_none() {
|
||||
hit.code_lang = lang.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── v0.20.x r2 Enhancement 3: OCR stats + failures inspect ──────────────
|
||||
|
||||
/// Wire type for `kebab inspect ocr-stats --json` (`ocr_stats.v1`).
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrStatsV1 {
|
||||
pub schema_version: &'static str,
|
||||
pub total_events: u64,
|
||||
pub total_runs: u64,
|
||||
pub success_count: u64,
|
||||
pub failure_count: u64,
|
||||
pub success_rate: f64,
|
||||
pub p50_ms: Option<u64>,
|
||||
pub p90_ms: Option<u64>,
|
||||
pub p99_ms: Option<u64>,
|
||||
pub max_ms: Option<u64>,
|
||||
pub by_engine: std::collections::BTreeMap<String, u64>,
|
||||
pub by_doc: Vec<OcrStatsByDoc>,
|
||||
}
|
||||
|
||||
/// Per-doc breakdown row inside `OcrStatsV1`.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrStatsByDoc {
|
||||
pub doc_id: String,
|
||||
pub failure_count: u64,
|
||||
pub success_count: u64,
|
||||
pub p90_ms: Option<u64>,
|
||||
}
|
||||
|
||||
/// Wire type for `kebab inspect ocr-failures --json` (`ocr_failures.v1`).
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrFailuresV1 {
|
||||
pub schema_version: &'static str,
|
||||
pub doc_id: Option<String>,
|
||||
pub failure_count: u64,
|
||||
pub failures: Vec<OcrFailureRow>,
|
||||
}
|
||||
|
||||
/// Single failure row inside `OcrFailuresV1`.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrFailureRow {
|
||||
pub ts: String,
|
||||
pub page: u32,
|
||||
pub ms: u64,
|
||||
pub reason: String,
|
||||
pub image_byte_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
/// Corpus-wide OCR statistics from the `pdf_ocr_events` SQLite mirror.
|
||||
pub fn inspect_ocr_stats(&self) -> Result<OcrStatsV1> {
|
||||
self.inspect_ocr_stats_with_config(&self.config)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_ocr_stats_with_config(&self, _cfg: &kebab_config::Config) -> Result<OcrStatsV1> {
|
||||
use crate::ingest_log::percentiles;
|
||||
let conn = self.sqlite.read_conn();
|
||||
|
||||
// 1. Aggregate counters
|
||||
let (total_events, success_count, failure_count, total_runs): (u64, u64, u64, u64) = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*), \
|
||||
SUM(CASE WHEN success=1 THEN 1 ELSE 0 END), \
|
||||
SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \
|
||||
COUNT(DISTINCT run_id) \
|
||||
FROM pdf_ocr_events",
|
||||
[],
|
||||
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
|
||||
)
|
||||
.unwrap_or((0, 0, 0, 0));
|
||||
|
||||
let success_rate = if total_events == 0 {
|
||||
0.0
|
||||
} else {
|
||||
success_count as f64 / total_events as f64
|
||||
};
|
||||
|
||||
// 2. Latency percentiles from successful events
|
||||
let samples: Vec<u64> = {
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms")
|
||||
.context("prepare ms query")?;
|
||||
stmt.query_map([], |r| r.get::<_, u64>(0))
|
||||
.context("query ms")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
let (p50_ms, p90_ms, p99_ms, max_ms) = percentiles(&samples);
|
||||
|
||||
// 3. Engine breakdown
|
||||
let mut by_engine = std::collections::BTreeMap::new();
|
||||
{
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine")
|
||||
.context("prepare engine query")?;
|
||||
let rows = stmt
|
||||
.query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)))
|
||||
.context("query engine")?;
|
||||
for row in rows.filter_map(Result::ok) {
|
||||
by_engine.insert(row.0, row.1);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Top-10 docs by failure count
|
||||
let by_doc: Vec<OcrStatsByDoc> = {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT doc_id, \
|
||||
SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \
|
||||
SUM(CASE WHEN success=1 THEN 1 ELSE 0 END) \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE doc_id IS NOT NULL \
|
||||
GROUP BY doc_id \
|
||||
ORDER BY 2 DESC \
|
||||
LIMIT 10",
|
||||
)
|
||||
.context("prepare by_doc query")?;
|
||||
stmt.query_map([], |r| {
|
||||
Ok(OcrStatsByDoc {
|
||||
doc_id: r.get(0)?,
|
||||
failure_count: r.get(1)?,
|
||||
success_count: r.get(2)?,
|
||||
p90_ms: None, // per-doc p90 deferred (open question #3)
|
||||
})
|
||||
})
|
||||
.context("query by_doc")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
|
||||
Ok(OcrStatsV1 {
|
||||
schema_version: "ocr_stats.v1",
|
||||
total_events,
|
||||
total_runs,
|
||||
success_count,
|
||||
failure_count,
|
||||
success_rate,
|
||||
p50_ms,
|
||||
p90_ms,
|
||||
p99_ms,
|
||||
max_ms,
|
||||
by_engine,
|
||||
by_doc,
|
||||
})
|
||||
}
|
||||
|
||||
/// Recent OCR failure rows, optionally filtered by `doc_id`.
|
||||
pub fn inspect_ocr_failures(
|
||||
&self,
|
||||
doc_id: Option<&str>,
|
||||
limit: usize,
|
||||
) -> Result<OcrFailuresV1> {
|
||||
self.inspect_ocr_failures_with_config(&self.config, doc_id, limit)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_ocr_failures_with_config(
|
||||
&self,
|
||||
_cfg: &kebab_config::Config,
|
||||
doc_id: Option<&str>,
|
||||
limit: usize,
|
||||
) -> Result<OcrFailuresV1> {
|
||||
let conn = self.sqlite.read_conn();
|
||||
let failures: Vec<OcrFailureRow> = if let Some(did) = doc_id {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE success=0 AND doc_id=? \
|
||||
ORDER BY ts DESC \
|
||||
LIMIT ?",
|
||||
)
|
||||
.context("prepare failures by doc_id")?;
|
||||
stmt.query_map(rusqlite::params![did, limit as i64], |r| {
|
||||
Ok(OcrFailureRow {
|
||||
ts: r.get(0)?,
|
||||
page: r.get(1)?,
|
||||
ms: r.get(2)?,
|
||||
reason: r.get(3)?,
|
||||
image_byte_size: r.get(4)?,
|
||||
})
|
||||
})
|
||||
.context("query failures by doc_id")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
} else {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE success=0 \
|
||||
ORDER BY ts DESC \
|
||||
LIMIT ?",
|
||||
)
|
||||
.context("prepare failures corpus-wide")?;
|
||||
stmt.query_map(rusqlite::params![limit as i64], |r| {
|
||||
Ok(OcrFailureRow {
|
||||
ts: r.get(0)?,
|
||||
page: r.get(1)?,
|
||||
ms: r.get(2)?,
|
||||
reason: r.get(3)?,
|
||||
image_byte_size: r.get(4)?,
|
||||
})
|
||||
})
|
||||
.context("query failures corpus-wide")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
Ok(OcrFailuresV1 {
|
||||
schema_version: "ocr_failures.v1",
|
||||
doc_id: doc_id.map(String::from),
|
||||
failure_count: failures.len() as u64,
|
||||
failures,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -994,3 +1439,128 @@ mod tests_trace {
|
||||
assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
|
||||
}
|
||||
}
|
||||
|
||||
/// post-v0.18.0 extractor-dispatch-unification: in-crate unit tests for the
/// `App.extractors` registry and the polymorphic `App::extract_for`
/// dispatch. They live in-crate (not under `tests/`) because `extractors`
/// and `extract_for` are `pub(crate)` and unreachable from integration tests.
///
/// Spec §5.1 + plan §2 Step 10 — three test classes:
///   1. the registry holds 11 extractors (image + pdf + 9 AST);
///   2. `supports()` is mutually exclusive over 16 sample MediaTypes;
///   3. `extract_for` returns `Err("no Extractor ...")` for a MediaType the
///      registry does not cover (Audio).
#[cfg(test)]
mod tests_extractor_dispatch {
    use super::*;
    use kebab_core::{AudioType, ExtractConfig, ImageType};

    /// Opens an `App` whose data dir is a fresh temp dir (same pattern as
    /// `tests_trace`'s `open_app_with_temp_dir`). The `TempDir` is returned
    /// so the caller keeps the directory alive for the test's duration.
    fn open_app_with_temp_dir() -> (tempfile::TempDir, App) {
        let dir = tempfile::tempdir().unwrap();
        let mut cfg = kebab_config::Config::defaults();
        cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
        {
            // Run migrations, then release the store before the App opens.
            let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
            store.run_migrations().unwrap();
        }
        let app = App::open_with_config(cfg).unwrap();
        (dir, app)
    }

    /// Registry length invariant: 11 extractors (image + pdf + 9 AST).
    /// Markdown is NOT registered (free-function path — deferred to a
    /// separate PR per spec §3.4).
    #[test]
    fn registry_has_eleven_extractors() {
        let (_dir, app) = open_app_with_temp_dir();
        let registered = app.extractors.len();
        assert_eq!(
            registered,
            11,
            "registry must hold 11 Extractors (image + pdf + 9 AST). \
             markdown 은 별 PR."
        );
    }

    /// `supports()` across the 11 extractors is mutually exclusive over the
    /// 16 sample MediaTypes: no two extractors may both return true for the
    /// same MediaType.
    #[test]
    fn supports_grid_is_mutually_exclusive() {
        let (_dir, app) = open_app_with_temp_dir();
        let code = |lang: &str| MediaType::Code(lang.into());
        let samples = [
            MediaType::Markdown,
            MediaType::Pdf,
            MediaType::Image(ImageType::Png),
            MediaType::Image(ImageType::Jpeg),
            code("rust"),
            code("python"),
            code("typescript"),
            code("javascript"),
            code("go"),
            code("java"),
            code("kotlin"),
            code("c"),
            code("cpp"),
            code("yaml"),                     // not covered by the registry
            code("shell"),                    // not covered by the registry
            MediaType::Audio(AudioType::Wav), // not covered by the registry
        ];
        for sample in &samples {
            let claimants = app
                .extractors
                .iter()
                .filter(|e| e.supports(sample))
                .count();
            assert!(
                claimants <= 1,
                "mutually exclusive violated for {sample:?}: {} hits",
                claimants
            );
        }
    }

    /// `extract_for` returns `Err("no Extractor for media_type ...")` for a
    /// MediaType the registry does not cover (Audio). Using Audio avoids any
    /// dependency on the asset's actual content: no extractor matches, so
    /// the call fails immediately.
    #[test]
    fn extract_for_unsupported_media_errors() {
        let (_dir, app) = open_app_with_temp_dir();

        // Minimal RawAsset. Its content is never read — Audio is not covered
        // by the registry, so `extract_for` errors out of the dispatch loop
        // right away. The field set matches
        // `crates/kebab-core/src/asset.rs:62-73` (8 fields).
        let media = MediaType::Audio(AudioType::Wav);
        let asset = kebab_core::RawAsset {
            asset_id: kebab_core::AssetId("00".repeat(16)),
            source_uri: kebab_core::SourceUri::File("/tmp/dummy.wav".into()),
            workspace_path: kebab_core::WorkspacePath("dummy.wav".to_string()),
            media_type: MediaType::Audio(AudioType::Wav),
            byte_len: 0,
            checksum: kebab_core::Checksum("00".repeat(32)),
            discovered_at: time::OffsetDateTime::now_utc(),
            // There is no `AssetStorage::Inline`; the existing variant
            // `Copied { path }` is used (kebab-core/src/asset.rs:55-60).
            stored: kebab_core::AssetStorage::Copied {
                path: std::path::PathBuf::from("/tmp/dummy.wav"),
            },
        };

        let workspace_root: std::path::PathBuf = std::path::PathBuf::from("/tmp");
        let extract_cfg = ExtractConfig::default();
        let ctx = ExtractContext {
            asset: &asset,
            workspace_root: &workspace_root,
            config: &extract_cfg,
        };

        let outcome = app.extract_for(&media, &ctx, &[]);
        assert!(outcome.is_err(), "Audio 는 registry 미포함 → Err 기대");
        let err_msg = format!("{:#}", outcome.unwrap_err());
        assert!(
            err_msg.contains("no Extractor"),
            "unexpected err: {err_msg}"
        );
    }
}
|
||||
|
||||
@@ -96,6 +96,11 @@ fn serialize_search_response(r: &SearchResponse) -> Value {
|
||||
None => Value::Null,
|
||||
};
|
||||
map.insert("trace".to_string(), trace_v);
|
||||
// v0.17.0 A5 Step 4b: only emit `hint` when set — matches
|
||||
// the CLI wire wrapper's additive emit pattern.
|
||||
if let Some(hint) = &r.hint {
|
||||
map.insert("hint".to_string(), Value::String(hint.clone()));
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
@@ -121,7 +126,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
let text = obj
|
||||
.get("query")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing required field: query")?
|
||||
.ok_or(
|
||||
"missing required field: query \
|
||||
(expected {\"query\":\"<text>\",\"mode\":\"lexical|vector|hybrid\",\"k\":3,...})",
|
||||
)?
|
||||
.to_string();
|
||||
|
||||
let mode = match obj.get("mode").and_then(|v| v.as_str()) {
|
||||
@@ -134,9 +142,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
|
||||
let k = obj
|
||||
.get("k")
|
||||
.and_then(|v| v.as_u64())
|
||||
.map(|n| n as usize)
|
||||
.unwrap_or(0); // 0 → use config default in app
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map_or(0, |n| n as usize); // 0 → use config default in app
|
||||
|
||||
let trust_min = match obj.get("trust_min").and_then(|v| v.as_str()) {
|
||||
None => None,
|
||||
@@ -197,19 +204,24 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
media,
|
||||
ingested_after,
|
||||
doc_id,
|
||||
repo: vec![],
|
||||
code_lang: vec![],
|
||||
};
|
||||
|
||||
let opts = SearchOpts {
|
||||
max_tokens: obj
|
||||
.get("max_tokens")
|
||||
.and_then(|v| v.as_u64())
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
snippet_chars: obj
|
||||
.get("snippet_chars")
|
||||
.and_then(|v| v.as_u64())
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
|
||||
trace: obj.get("trace").and_then(|v| v.as_bool()).unwrap_or(false),
|
||||
trace: obj
|
||||
.get("trace")
|
||||
.and_then(serde_json::Value::as_bool)
|
||||
.unwrap_or(false),
|
||||
};
|
||||
|
||||
Ok((
|
||||
@@ -293,4 +305,17 @@ mod tests {
|
||||
assert!(items[1].error.is_some());
|
||||
assert_eq!(items[1].error.as_ref().unwrap()["code"], "invalid_input");
|
||||
}
|
||||
|
||||
/// A request object without `query` must fail with a message that names
/// both `query` and `mode`, i.e. it carries the expected-shape hint.
#[test]
fn missing_query_error_message_includes_shape_hint() {
    let cfg = open_temp();
    let request_without_query = serde_json::json!({"mode": "lexical"});
    let (items, _summary) = bulk_search_with_config(cfg, vec![request_without_query]).unwrap();

    let msg = items[0].error.as_ref().unwrap()["message"].as_str().unwrap();
    let has_shape_hint = msg.contains("query") && msg.contains("mode");
    assert!(
        has_shape_hint,
        "missing shape hint in error message: {msg}"
    );
}
|
||||
}
|
||||
|
||||
61
crates/kebab-app/src/derivation_payload.rs
Normal file
61
crates/kebab-app/src/derivation_payload.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Derivation-cache payload encoding helpers (design 2026-05-31 §3.3).
|
||||
//!
|
||||
//! - embedding: `dimensions × f32` little-endian bytes (1024×4 = 4096 B/chunk).
|
||||
//! - alias / korean_tokens: UTF-8 as-is (handled inline by the caller — no
|
||||
//! helper needed, `String::as_bytes` / `String::from_utf8`).
|
||||
|
||||
/// Serialises an embedding as the concatenation of each component's
/// little-endian `f32` bytes (§3.3). The output is `4 * vector.len()` bytes
/// long; an empty slice yields an empty buffer.
pub fn encode_embedding(vector: &[f32]) -> Vec<u8> {
    // `size_of_val` on the slice is exactly the number of bytes to emit.
    let mut bytes = Vec::with_capacity(std::mem::size_of_val(vector));
    bytes.extend(vector.iter().flat_map(|component| component.to_le_bytes()));
    bytes
}
|
||||
|
||||
/// Parses a byte string of consecutive little-endian `f32` values back into
/// a vector (§3.3).
///
/// Yields `None` when the payload length is not a multiple of 4, which
/// marks a corrupt entry. Callers are expected to treat that as a cache miss
/// and recompute, so a malformed payload never turns into a wrong vector.
/// An empty payload decodes to an empty vector.
pub fn decode_embedding(payload: &[u8]) -> Option<Vec<f32>> {
    const WIDTH: usize = std::mem::size_of::<f32>();

    let aligned = payload.len() % WIDTH == 0;
    aligned.then(|| {
        payload
            .chunks_exact(WIDTH)
            .map(|word| {
                let mut le = [0_u8; WIDTH];
                le.copy_from_slice(word);
                f32::from_le_bytes(le)
            })
            .collect()
    })
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Encoding then decoding returns the original components, including
    /// the extreme finite values.
    #[test]
    fn roundtrips_vector() {
        let original = vec![0.0_f32, 1.5, -2.25, 3.125e10, f32::MIN, f32::MAX];
        let encoded = encode_embedding(&original);
        assert_eq!(encoded.len(), original.len() * 4);
        let decoded = decode_embedding(&encoded);
        assert_eq!(decoded, Some(original));
    }

    /// The empty vector maps to the empty payload and back.
    #[test]
    fn empty_vector_roundtrips() {
        let no_bytes: Vec<u8> = Vec::new();
        assert_eq!(encode_embedding(&[]), no_bytes);
        assert_eq!(decode_embedding(&[]), Some(vec![]));
    }

    /// A payload whose length is not a multiple of 4 is rejected.
    #[test]
    fn misaligned_payload_is_none() {
        let three_bytes = [1_u8, 2, 3];
        assert_eq!(decode_embedding(&three_bytes), None);
    }

    /// Pins the byte order of the encoding.
    #[test]
    fn little_endian_layout_is_fixed() {
        // 1.0_f32 == 0x3F800000, little-endian bytes [0x00,0x00,0x80,0x3F].
        let expected = vec![0x00, 0x00, 0x80, 0x3F];
        assert_eq!(encode_embedding(&[1.0]), expected);
    }
}
|
||||
@@ -10,6 +10,6 @@
|
||||
|
||||
pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
|
||||
|
||||
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
|
||||
pub use kebab_llm_local::LlmError;
|
||||
pub use kebab_config::ConfigInvalid;
|
||||
pub use kebab_store_sqlite::NotIndexed;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
|
||||
use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};
|
||||
|
||||
// p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
|
||||
// and surfaced through `StructuredError` (an anyhow-friendly wrapper
|
||||
@@ -65,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
hint: Some("check `--config <path>` and TOML syntax".to_string()),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "config_not_found".to_string(),
|
||||
message: s.to_string(),
|
||||
details: json!({
|
||||
"path": s.path.to_string_lossy(),
|
||||
}),
|
||||
hint: Some(
|
||||
"verify --config <path>; pass an existing toml file or omit --config to use XDG default"
|
||||
.to_string(),
|
||||
),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<NotIndexed>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
@@ -91,7 +105,7 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
}
|
||||
let mut details = json!({});
|
||||
if verbose {
|
||||
let chain: Vec<String> = err.chain().map(|c| c.to_string()).collect();
|
||||
let chain: Vec<String> = err.chain().map(std::string::ToString::to_string).collect();
|
||||
details = json!({"chain": chain});
|
||||
}
|
||||
ErrorV1 {
|
||||
@@ -158,7 +172,10 @@ mod tests {
|
||||
});
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "config_invalid");
|
||||
assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
|
||||
assert_eq!(
|
||||
v1.details.get("path").and_then(|p| p.as_str()),
|
||||
Some("/tmp/x.toml")
|
||||
);
|
||||
assert!(v1.hint.is_some());
|
||||
}
|
||||
|
||||
@@ -182,7 +199,8 @@ mod tests {
|
||||
// the resulting LlmError::Unreachable maps to "model_unreachable".
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(std::time::Duration::from_millis(500))
|
||||
.build().unwrap();
|
||||
.build()
|
||||
.unwrap();
|
||||
let err = client.get("http://127.0.0.1:1").send().unwrap_err();
|
||||
let llm = LlmError::Unreachable {
|
||||
endpoint: "http://127.0.0.1:1".to_string(),
|
||||
@@ -198,7 +216,10 @@ mod tests {
|
||||
let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
|
||||
let v1 = classify(&anyhow::Error::new(llm), false);
|
||||
assert_eq!(v1.code, "model_not_pulled");
|
||||
assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
|
||||
assert_eq!(
|
||||
v1.details.get("model").and_then(|p| p.as_str()),
|
||||
Some("gemma4:e4b")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -235,7 +256,10 @@ mod tests {
|
||||
// (single source of truth). classify must not pattern-match on
|
||||
// anyhow string contents — that would create two sources of
|
||||
// truth. The bare anyhow string falls through to "generic".
|
||||
assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string");
|
||||
assert_ne!(
|
||||
v1.code, "stale_cursor",
|
||||
"classify must not produce stale_cursor from bare anyhow string"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
274
crates/kebab-app/src/expansion.rs
Normal file
274
crates/kebab-app/src/expansion.rs
Normal file
@@ -0,0 +1,274 @@
|
||||
//! Index-time doc-side expansion (Phase 2): generates per-chunk "search aliases".
//!
//! Design spec: docs/superpowers/specs/2026-05-30-doc-side-expansion-design.md §3.2 / §5.
|
||||
|
||||
use kebab_core::{Chunk, GenerateRequest, LanguageModel};
|
||||
|
||||
/// Maximum number of characters in one alias line; anything longer is
/// treated as sentence-like output or hallucination and dropped.
|
||||
const MAX_ALIAS_CHARS: usize = 120;
|
||||
|
||||
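/// Version of the alias prompt template. It is part of the derivation
/// cache's alias version_key (§3.1), so bumping it after a prompt change
/// invalidates the cache (every entry misses and is regenerated).
/// Paired with the gemma prompt in `build_request` — bump this whenever
/// that prompt is edited.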
|
||||
pub const PROMPT_VERSION: &str = "expansion-v1";
|
||||
|
||||
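/// Generates search aliases for a chunk.
///
/// `generate` returns the validated, capped aliases joined with newlines.
/// It returns `None` when no alias is produced, the LLM call fails, or the
/// output is empty (the caller then leaves `chunk.aliases` as `None` and
/// carries on).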
|
||||
pub struct ExpansionGenerator<'a> {
|
||||
llm: &'a dyn LanguageModel,
|
||||
max_aliases: usize,
|
||||
}
|
||||
|
||||
impl<'a> ExpansionGenerator<'a> {
|
||||
pub fn new(llm: &'a dyn LanguageModel, max_aliases: usize) -> Self {
|
||||
Self { llm, max_aliases }
|
||||
}
|
||||
|
||||
/// gemma 프롬프트(expansion-v1)를 구성한다. (self 미사용 — associated fn.)
|
||||
fn build_request(chunk: &Chunk) -> GenerateRequest {
|
||||
let heading = chunk.heading_path.join(" > ");
|
||||
let system = "당신은 검색 색인용 별칭 생성기다. 주어진 문단을 찾을 사용자가 \
|
||||
입력할 법한 짧은 검색어/질문을 생성한다. 동의어·풀어쓴 표현을 포함하라. \
|
||||
문단이 한국어면 영어 표현도, 영어면 한국어 표현도 섞어라. \
|
||||
한 줄에 하나씩, 설명·번호·머리기호 없이 검색어만 출력하라."
|
||||
.to_string();
|
||||
let user = format!(
|
||||
"제목 경로: {heading}\n\n문단:\n{}\n\n검색 별칭(한 줄에 하나):",
|
||||
chunk.text
|
||||
);
|
||||
GenerateRequest {
|
||||
system,
|
||||
user,
|
||||
stop: vec![],
|
||||
max_tokens: 256,
|
||||
temperature: 0.0,
|
||||
seed: Some(0),
|
||||
images: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate(&self, chunk: &Chunk) -> Option<String> {
|
||||
// 나무위키 네비게이션 boilerplate 청크는 LLM 호출 없이 skip — 별칭
|
||||
// 생성 가치가 없고 노이즈 sentinel 벡터만 만든다.
|
||||
if is_nav_boilerplate(chunk) {
|
||||
return None;
|
||||
}
|
||||
let req = Self::build_request(chunk);
|
||||
let raw = match self.llm.generate_stream(req) {
|
||||
Ok(iter) => {
|
||||
let mut acc = String::new();
|
||||
for ch in iter {
|
||||
match ch {
|
||||
Ok(kebab_core::TokenChunk::Token(t)) => acc.push_str(&t),
|
||||
Ok(kebab_core::TokenChunk::Done { .. }) => {}
|
||||
Err(_) => return None, // fail-soft
|
||||
}
|
||||
}
|
||||
acc
|
||||
}
|
||||
Err(_) => return None, // fail-soft (connection refused 등)
|
||||
};
|
||||
let aliases = parse_aliases(&raw, self.max_aliases);
|
||||
if aliases.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(aliases.join("\n"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// 나무위키 네비게이션 boilerplate 청크 판정.
|
||||
///
|
||||
/// heading_path 가 비어 있고(문서 본문 섹션이 아닌 머리/꼬리 nav), text 앞부분에
|
||||
/// nav 키워드("최근 변경" 등)가 하나라도 있으면 boilerplate 로 본다. 둘 다
|
||||
/// 만족할 때만 true — 정상 본문(heading 있음, 또는 nav 키워드 없음)은 false.
|
||||
pub fn is_nav_boilerplate(chunk: &Chunk) -> bool {
|
||||
const NAV_KEYWORDS: [&str; 5] = [
|
||||
"최근 변경",
|
||||
"Recent changes",
|
||||
"최근 토론",
|
||||
"특수 기능",
|
||||
"편집 토론 역사",
|
||||
];
|
||||
if !chunk.heading_path.is_empty() {
|
||||
return false;
|
||||
}
|
||||
let head: String = chunk.text.chars().take(200).collect();
|
||||
NAV_KEYWORDS.iter().any(|kw| head.contains(kw))
|
||||
}
|
||||
|
||||
/// Removes a single leading list marker from `s`, if there is one.
///
/// **The marker must be followed by a space.** Aliases whose content merely
/// starts with a digit, hyphen or asterisk (e.g. "3D 렌더링", "-fast",
/// "2단계") are returned untouched.
/// (Task 4 review MAJOR-1: fixes a bug where a greedy `trim_start_matches`
/// damaged legitimate aliases.)
fn strip_list_marker(s: &str) -> &str {
    // 1) Bullet followed by a space ("- " / "* " / "• ").
    let after_bullet = ["- ", "* ", "• "]
        .iter()
        .find_map(|marker| s.strip_prefix(*marker));
    if let Some(rest) = after_bullet {
        return rest.trim_start();
    }

    // 2) Digits, then '.' or ')', then a space ("1. " / "2) "). Without the
    //    trailing space ("3D", "2단계") the digits are content, not a number
    //    marker, and the input is kept as is.
    let after_digits = s.trim_start_matches(|c: char| c.is_ascii_digit());
    if after_digits.len() == s.len() {
        // No leading digits at all.
        return s;
    }
    match after_digits
        .strip_prefix(". ")
        .or_else(|| after_digits.strip_prefix(") "))
    {
        Some(rest) => rest.trim_start(),
        None => s,
    }
}
|
||||
|
||||
/// LLM 출력 문자열 → 검증된 별칭 리스트.
|
||||
/// 줄 단위 split → trim → 목록 마커 1회 제거 → 빈 줄·과길이 drop →
|
||||
/// 중복 제거 → 상한 N.
|
||||
fn parse_aliases(raw: &str, max_aliases: usize) -> Vec<String> {
|
||||
let mut out: Vec<String> = Vec::new();
|
||||
for line in raw.lines() {
|
||||
let t = strip_list_marker(line.trim());
|
||||
if t.is_empty() || t.chars().count() > MAX_ALIAS_CHARS {
|
||||
continue;
|
||||
}
|
||||
let s = t.to_string();
|
||||
if !out.contains(&s) {
|
||||
out.push(s);
|
||||
}
|
||||
if out.len() >= max_aliases {
|
||||
break;
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{ChunkId, ChunkerVersion, DocumentId, FinishReason, TokenUsage};
|
||||
use kebab_llm::MockLanguageModel;
|
||||
|
||||
fn mk_chunk(text: &str) -> Chunk {
|
||||
Chunk {
|
||||
chunk_id: ChunkId("c1".into()),
|
||||
doc_id: DocumentId("d1".into()),
|
||||
block_ids: vec![],
|
||||
text: text.into(),
|
||||
heading_path: vec!["Guide".into()],
|
||||
source_spans: vec![],
|
||||
token_estimate: 3,
|
||||
chunker_version: ChunkerVersion("md-heading-v1".into()),
|
||||
policy_hash: "h".into(),
|
||||
tokenized_korean_text: None,
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn mock(resp: &str) -> MockLanguageModel {
|
||||
MockLanguageModel {
|
||||
model_id: "gemma4:e4b".into(),
|
||||
provider: "ollama".into(),
|
||||
context_tokens: 32768,
|
||||
canned_response: resp.into(),
|
||||
canned_finish: FinishReason::Stop,
|
||||
canned_usage: TokenUsage {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
latency_ms: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parses_lines_strips_bullets_and_caps() {
|
||||
let llm = mock("- 메모리 안전성\n1. who owns the value\nborrow checker\n\n* 소유권");
|
||||
let generator = ExpansionGenerator::new(&llm, 2);
|
||||
let out = generator.generate(&mk_chunk("Rust ownership")).unwrap();
|
||||
// 상한 2 → 앞 2개만, 접두 제거됨.
|
||||
assert_eq!(out, "메모리 안전성\nwho owns the value");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn drops_overlong_lines() {
|
||||
let long = "x".repeat(200);
|
||||
let llm = mock(&format!("{long}\n짧은 별칭"));
|
||||
let generator = ExpansionGenerator::new(&llm, 8);
|
||||
let out = generator.generate(&mk_chunk("t")).unwrap();
|
||||
assert_eq!(out, "짧은 별칭", "120자 초과 줄은 drop");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_output_returns_none() {
|
||||
let llm = mock(" \n\n");
|
||||
let generator = ExpansionGenerator::new(&llm, 8);
|
||||
assert_eq!(generator.generate(&mk_chunk("t")), None);
|
||||
}
|
||||
|
||||
/// Task 4 리뷰 MAJOR-1 회귀: 숫자/하이픈/별표로 시작하는 정당한 별칭은
|
||||
/// 손상 없이 보존돼야 한다(목록 마커는 마커 뒤 공백이 있을 때만 제거).
|
||||
#[test]
|
||||
fn preserves_numeric_and_dash_leading_aliases() {
|
||||
let llm = mock("3D 렌더링\n2단계 커밋\n-fast 플래그\n- 메모리 안전성\n1. 첫 항목");
|
||||
let generator = ExpansionGenerator::new(&llm, 8);
|
||||
let out = generator.generate(&mk_chunk("graphics")).unwrap();
|
||||
// 마커 없는 선두 숫자/하이픈은 보존; "- "/"1. " 만 마커로 제거.
|
||||
assert_eq!(out, "3D 렌더링\n2단계 커밋\n-fast 플래그\n메모리 안전성\n첫 항목");
|
||||
}
|
||||
|
||||
fn mk_chunk_nav(text: &str, heading: Vec<String>) -> Chunk {
|
||||
let mut c = mk_chunk(text);
|
||||
c.heading_path = heading;
|
||||
c
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn nav_boilerplate_skips_alias_generation() {
|
||||
// heading 없음 + nav 키워드 → boilerplate → LLM 호출 전에 None.
|
||||
let llm = mock("별칭1\n별칭2");
|
||||
let generator = ExpansionGenerator::new(&llm, 8);
|
||||
let chunk = mk_chunk_nav("최근 변경 최근 토론 특수 기능", vec![]);
|
||||
assert_eq!(generator.generate(&chunk), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normal_body_chunk_generates_aliases() {
|
||||
// heading 없지만 nav 키워드도 없음 → 정상 본문 → 별칭 생성.
|
||||
let llm = mock("별칭1\n별칭2");
|
||||
let generator = ExpansionGenerator::new(&llm, 8);
|
||||
let chunk = mk_chunk_nav("러스트의 소유권과 빌림 검사기 개요", vec![]);
|
||||
assert_eq!(generator.generate(&chunk).unwrap(), "별칭1\n별칭2");
|
||||
}
|
||||
|
||||
#[test]
fn nav_keyword_with_heading_is_not_boilerplate() {
    // Navigation keywords are present, but the chunk has a heading, so it is
    // treated as a body section and aliases are generated.
    let heading = vec![String::from("문서 변경사항")];
    let chunk = mk_chunk_nav("최근 변경 내역 설명", heading);
    let llm = mock("별칭1");
    let generator = ExpansionGenerator::new(&llm, 8);
    let aliases = generator.generate(&chunk).unwrap();
    assert_eq!(aliases, "별칭1");
}
|
||||
|
||||
/// Direct checks of `is_nav_boilerplate` over a small table of
/// (chunk text, heading path, expected classification) cases.
#[test]
fn is_nav_boilerplate_unit() {
    let cases: [(&str, Vec<String>, bool); 4] = [
        ("Recent changes list", vec![], true),
        ("편집 토론 역사", vec![], true),
        ("일반 본문 텍스트", vec![], false),
        // Same kind of navigation keyword, but a heading is present.
        ("최근 변경", vec!["섹션".into()], false),
    ];
    for (text, heading, expected) in cases {
        let chunk = mk_chunk_nav(text, heading);
        assert_eq!(is_nav_boilerplate(&chunk), expected, "text: {text:?}");
    }
}
|
||||
|
||||
/// Direct checks of `strip_list_marker` over (input, expected) pairs.
#[test]
fn strip_list_marker_unit() {
    let cases = [
        // Marker followed by whitespace => marker removed.
        ("- 메모리", "메모리"),
        ("* 소유권", "소유권"),
        ("1. who owns", "who owns"),
        ("2) 항목", "항목"),
        // No whitespace after the would-be marker => input preserved.
        ("3D 렌더링", "3D 렌더링"),
        ("-fast", "-fast"),
        ("2단계", "2단계"),
        ("2.0 릴리스", "2.0 릴리스"),
    ];
    for (input, expected) in cases {
        assert_eq!(strip_list_marker(input), expected, "input: {input:?}");
    }
}
|
||||
}
|
||||
@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let already = existing
|
||||
.lines()
|
||||
.any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
if already {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -50,18 +48,14 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
if !existing.is_empty() && !existing.ends_with('\n') {
|
||||
file.write_all(b"\n")?;
|
||||
}
|
||||
writeln!(file, "{}", KEBABIGNORE_LINE)?;
|
||||
writeln!(file, "{KEBABIGNORE_LINE}")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
|
||||
/// destination file already exists with the expected hash, the existing
|
||||
/// file is reused (no second write). Returns the destination path.
|
||||
pub fn copy_to_external(
|
||||
external_dir: &Path,
|
||||
bytes: &[u8],
|
||||
ext: &str,
|
||||
) -> Result<PathBuf> {
|
||||
pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
|
||||
let hash = blake3::hash(bytes);
|
||||
let hex = hash.to_hex();
|
||||
let prefix = &hex.as_str()[..12];
|
||||
@@ -82,11 +76,7 @@ pub fn copy_to_external(
|
||||
/// Internal `yaml_quote` always uses double-quoted YAML form with backslash
|
||||
/// escapes for `"` / `\` / control chars — agent-supplied titles with
|
||||
/// special characters are safe.
|
||||
pub fn inject_frontmatter(
|
||||
body: &str,
|
||||
title: &str,
|
||||
source_uri: Option<&str>,
|
||||
) -> Result<String> {
|
||||
pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
|
||||
let head = body.trim_start();
|
||||
if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
|
||||
anyhow::bail!(
|
||||
|
||||
@@ -50,14 +50,14 @@ impl App {
|
||||
fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let doc_id = target.doc_id.clone();
|
||||
let doc =
|
||||
@@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let mut text = fmt_canonical_to_markdown(&doc);
|
||||
let mut truncated = false;
|
||||
@@ -176,23 +176,25 @@ fn fetch_span(
|
||||
) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
// Reject line-incompatible media types (PDF / audio). `SourceType`
|
||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||
// category, not the rendering format — the actual byte-level format
|
||||
// lives on the source `RawAsset.media_type`. Look it up via
|
||||
// workspace_path (unique key per asset).
|
||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset_by_workspace_path(
|
||||
// doc.source_asset_id (PRIMARY KEY) so twin files (identical content
|
||||
// at different paths) always read *this* document's own asset row,
|
||||
// not whichever twin last wrote `assets.workspace_path`.
|
||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset(
|
||||
&app.sqlite,
|
||||
&doc.workspace_path,
|
||||
&doc.source_asset_id,
|
||||
)? {
|
||||
if matches!(
|
||||
asset.media_type,
|
||||
|
||||
446
crates/kebab-app/src/ingest_log.rs
Normal file
446
crates/kebab-app/src/ingest_log.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
|
||||
//!
|
||||
//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
|
||||
//! `config.logging.ingest_log_dir`. Records are appended line by line; the
|
||||
//! last record is always `kind="summary"`. `IngestLogWriter::open` returns
|
||||
//! `Ok(None)` when `ingest_log_enabled = false` so callers need not branch.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
/// Writer for one ingest run's ndjson log file.
///
/// Constructed by `IngestLogWriter::open`; records are appended one JSON
/// object per line through a buffered file handle.
pub struct IngestLogWriter {
    /// Buffered handle to the log file created in `open`.
    file: BufWriter<File>,
    /// Full path of the log file (`<log_dir>/ingest-{run_id}.ndjson`).
    path: PathBuf,
    /// Identifier of this run, as produced by `generate_run_id`.
    run_id: String,
    /// Wall-clock time captured when the writer was opened.
    started_at: SystemTime,
}
|
||||
|
||||
impl IngestLogWriter {
|
||||
/// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
|
||||
pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
|
||||
if !cfg.ingest_log_enabled {
|
||||
return Ok(None);
|
||||
}
|
||||
let run_id = generate_run_id();
|
||||
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
// Cleanup before creating the new file (non-critical: warn on error).
|
||||
if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) {
|
||||
tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}");
|
||||
}
|
||||
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
|
||||
let file = BufWriter::new(File::create(&path)?);
|
||||
Ok(Some(Self {
|
||||
file,
|
||||
path,
|
||||
run_id,
|
||||
started_at: SystemTime::now(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, event)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, summary)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
self.file.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_id(&self) -> &str {
|
||||
&self.run_id
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
pub fn started_at(&self) -> SystemTime {
|
||||
self.started_at
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IngestLogWriter {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.file.flush();
|
||||
}
|
||||
}
|
||||
|
||||
/// ISO 8601 compact timestamp + uuid v7 suffix: `20260528T013000Z-abc123de`.
|
||||
/// uuid v7 is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
|
||||
fn generate_run_id() -> String {
|
||||
use time::macros::format_description;
|
||||
let now = time::OffsetDateTime::now_utc();
|
||||
let ts = now
|
||||
.format(format_description!(
|
||||
"[year][month][day]T[hour][minute][second]Z"
|
||||
))
|
||||
.unwrap_or_else(|_| "19700101T000000Z".to_string());
|
||||
let uid = uuid::Uuid::now_v7().simple().to_string();
|
||||
let suffix = &uid[uid.len() - 8..];
|
||||
format!("{ts}-{suffix}")
|
||||
}
|
||||
|
||||
/// Expand `{state_dir}` placeholder → XDG state dir (spec §6 R-3).
|
||||
/// Other tilde/env expansion is delegated to `kebab_config::expand_path`.
|
||||
fn expand_log_dir(path: &Path) -> PathBuf {
|
||||
let path_str = path.to_string_lossy();
|
||||
if path_str.contains("{state_dir}") {
|
||||
let state_dir = kebab_config::Config::xdg_state_dir();
|
||||
PathBuf::from(path_str.replace("{state_dir}", &state_dir.to_string_lossy()))
|
||||
} else {
|
||||
path.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// RFC 3339 UTC timestamp for log records.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn now_ts() -> String {
|
||||
time::OffsetDateTime::now_utc()
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
|
||||
}
|
||||
|
||||
/// Ingest event record (ndjson line). `kind` is the discriminator.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum LogEvent<'a> {
|
||||
Ocr {
|
||||
ts: String,
|
||||
/// v0.20.x r2: additive field — doc_id for dual-write SQLite correlation.
|
||||
/// Round 1 ndjson logs deserialize with doc_id=None (Serde Option default).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
doc_id: Option<&'a str>,
|
||||
doc_path: &'a str,
|
||||
page: u32,
|
||||
image_byte_size: Option<u64>,
|
||||
image_width: Option<u32>,
|
||||
image_height: Option<u32>,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
success: bool,
|
||||
reason: Option<&'a str>,
|
||||
ocr_engine: &'a str,
|
||||
},
|
||||
ParseError {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
Skip {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
detail: Option<&'a str>,
|
||||
},
|
||||
Error {
|
||||
ts: String,
|
||||
code: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
}
|
||||
|
||||
/// Final summary record — always the last line of the log file.
|
||||
/// Explicit `kind` field serializes to `"kind": "summary"`.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct IngestSummary {
|
||||
pub kind: String,
|
||||
pub ts: String,
|
||||
pub run_id: String,
|
||||
pub scanned: u32,
|
||||
pub new: u32,
|
||||
pub errors: u32,
|
||||
pub ocr_pages: u32,
|
||||
pub ocr_failures: u32,
|
||||
pub ocr_p50_ms: Option<u64>,
|
||||
pub ocr_p90_ms: Option<u64>,
|
||||
pub ocr_max_ms: Option<u64>,
|
||||
pub duration_ms: u64,
|
||||
}
|
||||
|
||||
impl IngestSummary {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
ts: String,
|
||||
run_id: String,
|
||||
scanned: u32,
|
||||
new: u32,
|
||||
errors: u32,
|
||||
ocr_pages: u32,
|
||||
ocr_failures: u32,
|
||||
ocr_ms_samples: &[u64],
|
||||
duration_ms: u64,
|
||||
) -> Self {
|
||||
let (p50, p90, _p99, max) = percentiles(ocr_ms_samples);
|
||||
Self {
|
||||
kind: "summary".to_string(),
|
||||
ts,
|
||||
run_id,
|
||||
scanned,
|
||||
new,
|
||||
errors,
|
||||
ocr_pages,
|
||||
ocr_failures,
|
||||
ocr_p50_ms: p50,
|
||||
ocr_p90_ms: p90,
|
||||
ocr_max_ms: max,
|
||||
duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Percentile lookup on a sorted copy of `samples`.
///
/// Returns `(p50, p90, p99, max)`; every element is `None` when `samples`
/// is empty. Each percentile is the element at index `(len - 1) * pct / 100`
/// (integer division) of the ascending-sorted copy; the input slice is not
/// modified. p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses
/// p50/p90/max only.
pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>, Option<u64>) {
    let mut ordered: Vec<u64> = samples.to_owned();
    ordered.sort_unstable();

    // `last()` is `None` exactly when there are no samples.
    let Some(&largest) = ordered.last() else {
        return (None, None, None, None);
    };

    let top_idx = ordered.len() - 1;
    let at = |pct: usize| Some(ordered[top_idx * pct / 100]);
    (at(50), at(90), at(99), Some(largest))
}
|
||||
|
||||
/// Delete old ingest log files from `log_dir`.
|
||||
///
|
||||
/// **Retention rule (§3.4 OR-on-stale semantics):**
|
||||
/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff).
|
||||
/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition
|
||||
/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent.
|
||||
pub(crate) fn cleanup_old_logs(
|
||||
log_dir: &Path,
|
||||
keep_recent: u32,
|
||||
retention_days: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entries: Vec<_> = std::fs::read_dir(log_dir)?
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
e.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.is_some_and(|s| s.starts_with("ingest-") && s.ends_with(".ndjson"))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort newest-first by mtime (files without mtime go to the end).
|
||||
entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok())));
|
||||
|
||||
let cutoff = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(
|
||||
u64::from(retention_days) * 86400,
|
||||
))
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
|
||||
for (idx, entry) in entries.into_iter().enumerate() {
|
||||
let modified = entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
// Keep iff (idx < keep_recent) AND (modified > cutoff).
|
||||
if (idx as u32) < keep_recent && modified > cutoff {
|
||||
continue;
|
||||
}
|
||||
if let Err(e) = std::fs::remove_file(entry.path()) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"failed to remove old log {}: {e}",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_config::LoggingCfg;
    use std::time::SystemTime;
    use tempfile::TempDir;

    /// Logging config with the ingest log enabled and written under `dir`.
    fn enabled_cfg(dir: &Path) -> LoggingCfg {
        LoggingCfg {
            ingest_log_enabled: true,
            ingest_log_dir: dir.to_path_buf(),
            ..Default::default()
        }
    }

    /// Create a one-byte file at `path` and set its mtime `age_days` days back.
    fn write_aged_log(path: &Path, age_days: u64) {
        std::fs::write(path, b"x").unwrap();
        let mtime = SystemTime::now()
            .checked_sub(std::time::Duration::from_secs(age_days * 86400))
            .unwrap();
        filetime::set_file_mtime(path, filetime::FileTime::from_system_time(mtime)).unwrap();
    }

    /// Number of readable entries currently in `dir`.
    fn entry_count(dir: &Path) -> usize {
        std::fs::read_dir(dir).unwrap().filter_map(Result::ok).count()
    }

    #[test]
    fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
        let id = generate_run_id();
        // Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
        assert_eq!(id.len(), 25, "run_id len should be 25: {id}");

        let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");
        assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
        assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
        assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");
        assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");

        let all_hex = suffix.chars().all(|c| c.is_ascii_hexdigit());
        assert!(all_hex, "suffix should be hex: {suffix}");
    }

    #[test]
    fn expand_log_dir_substitutes_state_dir_placeholder() {
        let expected = kebab_config::Config::xdg_state_dir().join("logs");
        let expanded = expand_log_dir(&PathBuf::from("{state_dir}/logs"));
        assert_eq!(expanded, expected);
        assert!(!expanded.to_string_lossy().contains("{state_dir}"));
    }

    #[test]
    fn writer_disabled_returns_none() {
        let cfg = LoggingCfg {
            ingest_log_enabled: false,
            ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
            ..Default::default()
        };
        let opened = IngestLogWriter::open(&cfg).expect("open should not error");
        assert!(opened.is_none(), "disabled writer should return None");
    }

    #[test]
    fn writer_writes_one_event_per_line_with_kind_discriminator() {
        let tmp = TempDir::new().unwrap();
        let mut writer = IngestLogWriter::open(&enabled_cfg(tmp.path()))
            .unwrap()
            .unwrap();
        let path = writer.path().to_path_buf();

        let events = [
            LogEvent::Skip {
                ts: now_ts(),
                doc_path: "a.zip",
                reason: "builtin_blacklist",
                detail: Some(".zip extension"),
            },
            LogEvent::Error {
                ts: now_ts(),
                code: "ingest_fatal",
                message: "something bad",
            },
            LogEvent::ParseError {
                ts: now_ts(),
                doc_path: "weird.pdf",
                reason: "lopdf_error",
                message: "unexpected EOF",
            },
        ];
        for event in &events {
            writer.write_event(event).unwrap();
        }
        writer.flush().unwrap();

        let contents = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = contents.lines().collect();
        assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
        for line in &lines {
            assert!(
                line.starts_with('{'),
                "each line should be JSON object: {line}"
            );
            assert!(
                line.contains("\"kind\""),
                "each line should have 'kind': {line}"
            );
        }
    }

    #[test]
    fn drop_flushes_pending_buffer() {
        let tmp = TempDir::new().unwrap();
        let mut writer = IngestLogWriter::open(&enabled_cfg(tmp.path()))
            .unwrap()
            .unwrap();
        let path = writer.path().to_path_buf();
        writer
            .write_event(&LogEvent::Error {
                ts: now_ts(),
                code: "test",
                message: "drop flush test",
            })
            .unwrap();

        // No explicit flush: dropping the writer is expected to flush the buffer.
        drop(writer);

        let contents = std::fs::read_to_string(&path).unwrap();
        let line_count = contents.lines().count();
        assert!(
            line_count >= 1,
            "file should have at least 1 line after drop"
        );
    }

    /// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted.
    #[test]
    fn cleanup_keeps_recent_n_drops_old() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();

        // Five files aged 0, 15, 30, 45 and 60 days (file 0 is the newest).
        for i in 0..5u64 {
            write_aged_log(&dir.join(format!("ingest-file{i}.ndjson")), i * 15);
        }

        // keep_recent=3, retention_days=90 (no time-based deletion)
        cleanup_old_logs(dir, 3, 90).unwrap();
        assert_eq!(entry_count(dir), 3, "expected 3 files after cleanup");
    }

    /// F5 OR-on-stale: files within keep_recent count but older than retention_days
    /// must still be deleted.
    #[test]
    fn cleanup_drops_stale_even_within_count() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();

        // 2 files, both 90 days old — well past retention_days=30
        for i in 0..2u64 {
            write_aged_log(&dir.join(format!("ingest-old{i}.ndjson")), 90);
        }

        // keep_recent=10 (both within count) but retention_days=30 → both stale
        cleanup_old_logs(dir, 10, 30).unwrap();
        assert_eq!(
            entry_count(dir),
            0,
            "stale files must be deleted even within keep_recent"
        );
    }
}
|
||||
@@ -46,10 +46,13 @@ pub struct AggregateCounts {
|
||||
/// Ordering invariant per design §2.4a:
|
||||
///
|
||||
/// ```text
|
||||
/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ScanStarted < ScanCompleted
|
||||
/// < (AssetStarted [< (PdfOcrStarted < PdfOcrFinished)*] < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ```
|
||||
///
|
||||
/// `[]` = optional, per-PDF asset only (v0.20.0 sub-item 1).
|
||||
///
|
||||
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
|
||||
/// in §2.4a) are reserved for a future iteration and are not emitted
|
||||
/// by this task; the spec calls them out as "임의 위치" (optional).
|
||||
@@ -85,6 +88,30 @@ pub enum IngestEvent {
|
||||
/// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
|
||||
/// task never produces `Aborted`.
|
||||
Aborted { counts: AggregateCounts },
|
||||
/// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1.
|
||||
PdfOcrStarted { page: u32 },
|
||||
/// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1.
|
||||
/// `skipped` = `true` 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패).
|
||||
/// `chars = 0` 만으로는 "skip" 과 "0-char OCR result" 구분 불가, `skipped` field 가 명시적.
|
||||
PdfOcrFinished {
|
||||
page: u32,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
ocr_engine: String,
|
||||
skipped: bool,
|
||||
/// v0.20.x ingest log: raster image byte size (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_byte_size: Option<u64>,
|
||||
/// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_width: Option<u32>,
|
||||
/// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_height: Option<u32>,
|
||||
/// v0.20.x ingest log: OCR failure reason (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
|
||||
@@ -96,6 +123,7 @@ pub fn media_label(media: &kebab_core::MediaType) -> &'static str {
|
||||
kebab_core::MediaType::Pdf => "pdf",
|
||||
kebab_core::MediaType::Image(_) => "image",
|
||||
kebab_core::MediaType::Audio(_) => "audio",
|
||||
kebab_core::MediaType::Code(_) => "code",
|
||||
kebab_core::MediaType::Other(_) => "other",
|
||||
}
|
||||
}
|
||||
@@ -117,10 +145,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
|
||||
/// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
|
||||
/// is silently absorbed — the ingest hot path must not stall on a slow
|
||||
/// consumer. Logged at `trace` for diagnostics.
|
||||
pub(crate) fn emit(
|
||||
progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
|
||||
event: IngestEvent,
|
||||
) {
|
||||
pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
|
||||
if let Some(tx) = progress {
|
||||
if tx.send(event).is_err() {
|
||||
tracing::trace!(
|
||||
@@ -148,6 +173,7 @@ mod tests {
|
||||
media_label(&MediaType::Audio(kebab_core::AudioType::Wav)),
|
||||
"audio"
|
||||
);
|
||||
assert_eq!(media_label(&MediaType::Code("rust".into())), "code");
|
||||
assert_eq!(media_label(&MediaType::Other("x".into())), "other");
|
||||
}
|
||||
|
||||
@@ -163,9 +189,12 @@ mod tests {
|
||||
media: "markdown".into(),
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
|
||||
assert_eq!(v.get("idx").and_then(|n| n.as_u64()), Some(1));
|
||||
assert_eq!(v.get("total").and_then(|n| n.as_u64()), Some(10));
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_started")
|
||||
);
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
|
||||
assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
|
||||
assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
|
||||
assert_eq!(v.get("media").and_then(|s| s.as_str()), Some("markdown"));
|
||||
}
|
||||
@@ -182,8 +211,14 @@ mod tests {
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
|
||||
let counts = v.get("counts").unwrap();
|
||||
assert_eq!(counts.get("scanned").and_then(|n| n.as_u64()), Some(5));
|
||||
assert_eq!(counts.get("new").and_then(|n| n.as_u64()), Some(2));
|
||||
assert_eq!(
|
||||
counts.get("scanned").and_then(serde_json::Value::as_u64),
|
||||
Some(5)
|
||||
);
|
||||
assert_eq!(
|
||||
counts.get("new").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Default => {
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
}
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,362 @@
|
||||
// crates/kebab-app/src/pdf_ocr_apply.rs
|
||||
//
|
||||
// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가
|
||||
// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠.
|
||||
// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의
|
||||
// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는
|
||||
// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는
|
||||
// 새 Block::Paragraph 를 push (always_on dual-block).
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
|
||||
SourceSpan, TextBlock, id_for_block,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
|
||||
use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Extract width/height from a JPEG (or any image format) byte slice.
|
||||
/// Returns `None` on corrupt / unsupported data — callers fall back to
|
||||
/// `(None, None)` so OCR results remain valid (R-4 mitigation).
|
||||
fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
|
||||
use image::ImageReader;
|
||||
ImageReader::new(std::io::Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.ok()?
|
||||
.into_dimensions()
|
||||
.ok()
|
||||
}
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
|
||||
pub struct PdfOcrOpts {
|
||||
/// Master switch. `false` short-circuits to
|
||||
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
|
||||
pub enabled: bool,
|
||||
/// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push).
|
||||
/// `false` → text-detect block 의 `min_char_count` 또는
|
||||
/// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate).
|
||||
pub always_on: bool,
|
||||
/// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가
|
||||
/// 본 임계 미만이면 OCR fallback. Default `0.5`.
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback.
|
||||
/// empty page (cover, blank separator) 자동 skip. Default `20`.
|
||||
pub min_char_count: u32,
|
||||
/// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`).
|
||||
/// `None` → no hint passed to engine.
|
||||
pub lang_hint: Option<Lang>,
|
||||
/// Optional per-page cancellation handle. checked at start of each page
|
||||
/// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4
|
||||
/// + verifier LOW L-1 resolution + spec §4.8 line 1159 명시.
|
||||
pub cancel: Option<Arc<AtomicBool>>,
|
||||
}
|
||||
|
||||
/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's
/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2).
#[derive(Debug)]
pub struct PdfOcrSummary {
    /// Number of pages that actually went through the OCR pipeline
    /// (skipped pages are not counted).
    pub pages_ocrd: u32,
    /// Cumulative wall-clock duration of successful OCR engine calls (ms).
    /// Per the original note the total is accumulated with `saturating_add`
    /// (the accumulation itself is outside this block).
    pub ms_total: u64,
}
|
||||
|
||||
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
|
||||
/// classifies each page via `text_quality::compute_valid_char_ratio` +
|
||||
/// `min_char_count`, and either:
|
||||
/// - skips (vector PDF + sufficient text + `always_on=false`),
|
||||
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
|
||||
/// (scanned/mojibake page), or
|
||||
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
|
||||
/// vector page).
|
||||
///
|
||||
/// Errors:
|
||||
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
|
||||
/// - lopdf re-parse failure → `Err(...)`.
|
||||
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
|
||||
/// event push + `emit_progress(Finished { skipped: true })` + continue
|
||||
/// (no `Err` propagation).
|
||||
///
|
||||
/// See spec §4.1 + §4.4 for the full pipeline.
|
||||
pub fn apply_ocr_to_pdf_pages<F>(
|
||||
canonical: &mut CanonicalDocument,
|
||||
engine: &dyn OcrEngine,
|
||||
pdf_bytes: &[u8],
|
||||
opts: &PdfOcrOpts,
|
||||
mut emit_progress: F,
|
||||
) -> Result<PdfOcrSummary>
|
||||
where
|
||||
F: FnMut(PdfOcrProgress),
|
||||
{
|
||||
if !opts.enabled {
|
||||
return Ok(PdfOcrSummary {
|
||||
pages_ocrd: 0,
|
||||
ms_total: 0,
|
||||
});
|
||||
}
|
||||
let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
|
||||
.context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
|
||||
let page_count = pdf_doc.get_pages().len() as u32;
|
||||
|
||||
let mut new_events: Vec<ProvenanceEvent> = Vec::new();
|
||||
let mut ocr_blocks: Vec<Block> = Vec::new();
|
||||
let mut pages_ocrd: u32 = 0;
|
||||
let mut ms_total: u64 = 0;
|
||||
|
||||
// canonical.blocks 의 page → block index map (text-detect block 의 in-place
|
||||
// mutate 또는 dual-block push 결정용).
|
||||
// PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를
|
||||
// 생성 (§1.4) — 그 invariant 사용.
|
||||
for page_num in 1..=page_count {
|
||||
if let Some(cancel) = &opts.cancel {
|
||||
if cancel.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
|
||||
}
|
||||
}
|
||||
|
||||
let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
|
||||
let text = match &canonical.blocks[text_block_idx] {
|
||||
Block::Paragraph(tb) => tb.text.clone(),
|
||||
_ => String::new(),
|
||||
};
|
||||
let chars = text.chars().count() as u32;
|
||||
let valid_ratio = compute_valid_char_ratio(&text);
|
||||
let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
|
||||
|
||||
// 결정 matrix:
|
||||
// always_on=true → 모든 page OCR (dual-block).
|
||||
// always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
|
||||
// needs_ocr=false → skip.
|
||||
let do_ocr = opts.always_on || needs_ocr;
|
||||
if !do_ocr {
|
||||
continue;
|
||||
}
|
||||
|
||||
emit_progress(PdfOcrProgress::Started { page: page_num });
|
||||
|
||||
let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
|
||||
b
|
||||
} else {
|
||||
let note = format!(
|
||||
"page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: 0,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: None,
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: None,
|
||||
});
|
||||
continue;
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
// OCR failure: warning event + skip (text-detect block 그대로).
|
||||
let note = format!(
|
||||
"page={} OCR failed engine={} version={} err={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
e
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: start.elapsed().as_millis() as u64,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: Some("ocr_error".to_string()),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed_ms = start.elapsed().as_millis() as u64;
|
||||
let chars_ocr = ocr.joined.chars().count() as u32;
|
||||
|
||||
pages_ocrd = pages_ocrd.saturating_add(1);
|
||||
ms_total = ms_total.saturating_add(elapsed_ms);
|
||||
|
||||
if opts.always_on && !needs_ocr {
|
||||
// dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count.
|
||||
let ocr_ordinal = (page_num - 1) + page_count;
|
||||
let span_ocr = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(0),
|
||||
char_end: Some(chars_ocr),
|
||||
};
|
||||
let block_id =
|
||||
id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span_ocr,
|
||||
};
|
||||
ocr_blocks.push(Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text: ocr.joined.clone(),
|
||||
inlines: if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
},
|
||||
}));
|
||||
} else {
|
||||
// in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체.
|
||||
// block_id / ordinal 보존 — span 의 char_end 만 갱신.
|
||||
if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
|
||||
tb.text = ocr.joined.clone();
|
||||
tb.inlines = if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
};
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(chars_ocr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::OcrApplied,
|
||||
note: Some(format!(
|
||||
"page={} engine={} version={} regions={} ms={} chars={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
ocr.regions.len(),
|
||||
elapsed_ms,
|
||||
chars_ocr
|
||||
)),
|
||||
});
|
||||
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: elapsed_ms,
|
||||
chars: chars_ocr,
|
||||
skipped: false,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: None,
|
||||
});
|
||||
}
|
||||
|
||||
canonical.blocks.extend(ocr_blocks);
|
||||
canonical.provenance.events.extend(new_events);
|
||||
Ok(PdfOcrSummary {
|
||||
pages_ocrd,
|
||||
ms_total,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
|
||||
blocks
|
||||
.iter()
|
||||
.position(|b| match b {
|
||||
Block::Paragraph(tb) => matches!(
|
||||
tb.common.source_span,
|
||||
SourceSpan::Page { page, .. } if page == page_num
|
||||
),
|
||||
_ => false,
|
||||
})
|
||||
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
|
||||
}
|
||||
|
||||
/// Per-page OCR progress event handed to the caller's `emit_progress` closure.
/// Step 6's `ingest_one_pdf_asset` carries these as
/// `IngestEvent::PdfOcrStarted` / `PdfOcrFinished` (spec §4.6.1 wire schema).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfOcrProgress {
    /// Emitted once a page has been selected for OCR, before its raster image
    /// is extracted and before `engine.recognize` is called. Always followed
    /// by a `Finished` event for the same page.
    Started {
        /// 1-based PDF page number.
        page: u32,
    },
    /// Emitted when per-page OCR ends (success, skip, and failure alike).
    Finished {
        /// 1-based PDF page number.
        page: u32,
        /// `engine.recognize` wall-clock duration in ms. On skip paths the
        /// meaning is mixed: `0` when no DCTDecode image was found, the actual
        /// latency before the error when the OCR engine failed.
        ms: u64,
        /// Char count of the OCR result text. `0` when skipped.
        chars: u32,
        /// `true` = skipped because no DCTDecode image was found or the OCR
        /// engine failed. `false` = OCR completed normally.
        skipped: bool,
        /// v0.20.x ingest log: raster image byte size (additive, optional).
        image_byte_size: Option<u64>,
        /// v0.20.x ingest log: raster image width in pixels (additive, optional).
        image_width: Option<u32>,
        /// v0.20.x ingest log: raster image height in pixels (additive, optional).
        image_height: Option<u32>,
        /// v0.20.x ingest log: failure reason string when OCR failed (additive, optional).
        /// Values: "timeout" | "ocr_error" | "network_error" | None.
        /// `None` is used on success and also on the no-DCTDecode-image skip.
        failure_reason: Option<String>,
    },
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly encoded 16x12 JPEG must report exactly those dimensions.
    #[test]
    fn extract_image_dimensions_valid_jpeg() {
        // Encode into an owned in-memory cursor, then take the buffer back out.
        let mut sink = std::io::Cursor::new(Vec::new());
        let blank = image::DynamicImage::from(image::RgbImage::new(16, 12));
        blank
            .write_to(&mut sink, image::ImageFormat::Jpeg)
            .expect("encode jpeg");
        let jpeg = sink.into_inner();

        assert_eq!(extract_image_dimensions(&jpeg), Some((16, 12)));
    }

    /// Bytes that are not an image at all yield `None` rather than an error.
    #[test]
    fn extract_image_dimensions_corrupt_returns_none() {
        let garbage: &[u8] = b"not a jpeg";
        assert_eq!(extract_image_dimensions(garbage), None);
    }
}
|
||||
@@ -9,13 +9,19 @@
|
||||
//!
|
||||
//! `--vector-only` additionally truncates `embedding_records` in SQLite
|
||||
//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
|
||||
//!
|
||||
//! `--orphans-only` purges stored docs that are outside the current walker
|
||||
//! scope (config narrowing / removed sub-directory). No filesystem paths are
|
||||
//! removed — this is purely a store-level reconciliation.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_config::{Config, expand_path};
|
||||
use kebab_core::WorkspacePath;
|
||||
|
||||
/// What the user asked to remove. Mutually exclusive — picked by the CLI
|
||||
/// from a clap `ArgGroup`.
|
||||
@@ -32,6 +38,13 @@ pub enum ResetScope {
|
||||
VectorOnly,
|
||||
/// Wipe only the config dir.
|
||||
ConfigOnly,
|
||||
/// Purge stored docs that are outside the current walker scope (no
|
||||
/// filesystem paths are removed). Filesystem existence is NOT checked —
|
||||
/// anything the current walker would not visit is considered an orphan.
|
||||
/// The explicit complement to the conservative `sweep_deleted_files`
|
||||
/// that runs during ingest (which leaves on-disk-but-out-of-scope docs
|
||||
/// alone for data safety).
|
||||
OrphansOnly,
|
||||
}
|
||||
|
||||
/// Result of a successful wipe — emitted as `reset_report.v1` by the
|
||||
@@ -41,6 +54,16 @@ pub struct ResetReport {
|
||||
pub scope: ResetScope,
|
||||
pub removed_paths: Vec<PathBuf>,
|
||||
pub embedding_rows_truncated: u64,
|
||||
/// Number of stored docs purged because they are outside the current
|
||||
/// walker scope. Non-zero only when `scope == OrphansOnly`.
|
||||
/// `#[serde(default)]` preserves back-compat with older callers that
|
||||
/// do not include this field.
|
||||
#[serde(default)]
|
||||
pub orphans_purged: u32,
|
||||
/// Paths of the orphaned docs that were purged. Sorted for deterministic
|
||||
/// output. Non-empty only when `scope == OrphansOnly`.
|
||||
#[serde(default)]
|
||||
pub purged_paths: Vec<WorkspacePath>,
|
||||
}
|
||||
|
||||
/// Compute the absolute on-disk paths a given scope will wipe, given a
|
||||
@@ -62,11 +85,14 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
|
||||
ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
|
||||
ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
|
||||
ResetScope::VectorOnly => {
|
||||
let vector_dir =
|
||||
expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
vec![vector_dir]
|
||||
}
|
||||
ResetScope::ConfigOnly => vec![cfg_dir],
|
||||
// OrphansOnly operates purely at the store level — no filesystem paths
|
||||
// are removed. Return empty so `estimate_size_bytes` stays zero and
|
||||
// the existing confirm UI path for directory wipes is skipped.
|
||||
ResetScope::OrphansOnly => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,16 +122,79 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
||||
paths.iter().map(|p| walk(p)).sum()
|
||||
}
|
||||
|
||||
/// Compute the workspace paths stored in SQLite that are NOT visited by
|
||||
/// the current walker scope (i.e. they are "orphans" — on disk but
|
||||
/// outside the configured include/exclude rules, or from a sub-directory
|
||||
/// that has since been removed from the workspace).
|
||||
///
|
||||
/// Does NOT check filesystem existence — `OrphansOnly` is the explicit
|
||||
/// "I know what I'm doing" variant; callers that want the conservative
|
||||
/// fs-aware sweep should use `sweep_deleted_files` inside ingest.
|
||||
///
|
||||
/// Returns the list sorted for deterministic output. Called twice by the
|
||||
/// CLI path (once for the confirm UI preview, once inside `execute`);
|
||||
/// the double scan is acceptable for a rare destructive operation.
|
||||
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
use kebab_core::DocumentStore as _;
|
||||
use kebab_core::SourceScope;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("enumerate_orphans: open SqliteStore")?;
|
||||
|
||||
let stored = store
|
||||
.all_workspace_paths()
|
||||
.context("enumerate_orphans: all_workspace_paths")?;
|
||||
|
||||
if stored.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Build the same SourceScope the CLI's ingest path uses: root from
|
||||
// config, exclude list from config, no include override (full scope).
|
||||
let root = cfg.resolve_workspace_root();
|
||||
let scope = SourceScope {
|
||||
root: root.clone(),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let connector =
|
||||
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let (assets, _skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("enumerate_orphans: scan workspace")?;
|
||||
|
||||
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
|
||||
|
||||
let mut orphans: Vec<WorkspacePath> = stored
|
||||
.into_iter()
|
||||
.filter(|p| !scanned.contains(p))
|
||||
.collect();
|
||||
orphans.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(orphans)
|
||||
}
|
||||
|
||||
/// Wipe every path from `enumerate_paths(scope, cfg)`. For
|
||||
/// `ResetScope::VectorOnly`, also truncates the SQLite
|
||||
/// `embedding_records` table so the store doesn't point at the Lance
|
||||
/// rows we just removed off-disk.
|
||||
///
|
||||
/// For `ResetScope::OrphansOnly`, no filesystem directories are removed.
|
||||
/// Instead the store is reconciled: stored docs outside the current walker
|
||||
/// scope are purged from SQLite (+ vector store when configured). The
|
||||
/// caller is expected to have already shown the confirm UI using
|
||||
/// `enumerate_orphans`.
|
||||
///
|
||||
/// Idempotent: a missing path is treated as already-removed (success).
|
||||
/// Returns a `ResetReport` listing exactly what was removed (paths that
|
||||
/// existed before the call) so `--json` callers see the truth, not the
|
||||
/// request.
|
||||
pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if matches!(scope, ResetScope::OrphansOnly) {
|
||||
return execute_orphans_only(cfg);
|
||||
}
|
||||
|
||||
let paths = enumerate_paths(scope, cfg);
|
||||
let mut removed = Vec::new();
|
||||
|
||||
@@ -113,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if !p.exists() {
|
||||
continue;
|
||||
}
|
||||
std::fs::remove_dir_all(p)
|
||||
.with_context(|| format!("remove {}", p.display()))?;
|
||||
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
|
||||
removed.push(p.clone());
|
||||
}
|
||||
|
||||
@@ -128,9 +216,99 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
scope,
|
||||
removed_paths: removed,
|
||||
embedding_rows_truncated,
|
||||
orphans_purged: 0,
|
||||
purged_paths: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
|
||||
/// current walker scope without touching any filesystem directory.
|
||||
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
|
||||
let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
|
||||
|
||||
if orphans.is_empty() {
|
||||
return Ok(ResetReport {
|
||||
scope: ResetScope::OrphansOnly,
|
||||
removed_paths: Vec::new(),
|
||||
embedding_rows_truncated: 0,
|
||||
orphans_purged: 0,
|
||||
purged_paths: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
let store = std::sync::Arc::new(
|
||||
kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("execute_orphans_only: open SqliteStore")?,
|
||||
);
|
||||
|
||||
// Open vector store if configured. Mirror the same guard the ingest
|
||||
// path uses: only construct when the provider is not "none" / dims > 0.
|
||||
let vector_store: Option<kebab_store_vector::LanceVectorStore> =
|
||||
open_vector_store_if_configured(cfg, store.clone())?;
|
||||
|
||||
let mut purged_paths: Vec<WorkspacePath> = Vec::new();
|
||||
|
||||
for path in &orphans {
|
||||
let chunk_ids = kebab_store_sqlite::purge_deleted_workspace_path(&store, path)
|
||||
.with_context(|| format!("execute_orphans_only: purge {}", path.0))?;
|
||||
|
||||
if let Some(ref vs) = vector_store {
|
||||
if !chunk_ids.is_empty() {
|
||||
use kebab_core::VectorStore as _;
|
||||
if let Err(e) = vs.delete_by_chunk_ids(&chunk_ids) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
path = %path.0,
|
||||
count = chunk_ids.len(),
|
||||
error = %e,
|
||||
"reset --orphans-only: vector delete failed; SQLite side already cleaned"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
path = %path.0,
|
||||
"reset --orphans-only: purged orphan document"
|
||||
);
|
||||
purged_paths.push(path.clone());
|
||||
}
|
||||
|
||||
let orphans_purged = u32::try_from(purged_paths.len()).unwrap_or(u32::MAX);
|
||||
|
||||
Ok(ResetReport {
|
||||
scope: ResetScope::OrphansOnly,
|
||||
removed_paths: Vec::new(),
|
||||
embedding_rows_truncated: 0,
|
||||
orphans_purged,
|
||||
purged_paths,
|
||||
})
|
||||
}
|
||||
|
||||
/// Open the Lance vector store if the configured embedding provider is
|
||||
/// active (non-"none", dimensions > 0). Returns `None` for lexical-only
|
||||
/// configs. Mirrors the guard in `App::vector`.
|
||||
fn open_vector_store_if_configured(
|
||||
cfg: &Config,
|
||||
store: std::sync::Arc<kebab_store_sqlite::SqliteStore>,
|
||||
) -> Result<Option<kebab_store_vector::LanceVectorStore>> {
|
||||
if cfg.models.embedding.provider == "none" || cfg.models.embedding.dimensions == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
match kebab_store_vector::LanceVectorStore::new(cfg, store) {
|
||||
Ok(vs) => Ok(Some(vs)),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
error = %e,
|
||||
"reset --orphans-only: could not open vector store; skipping vector delete"
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Open the SQLite store at the configured path and run
|
||||
/// `truncate_embedding_records`. Returns the count of truncated rows
|
||||
/// (the helper itself reports `DELETE` rowcount). If the SQLite file
|
||||
@@ -200,4 +378,14 @@ mod tests {
|
||||
let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]);
|
||||
assert_eq!(bytes, 5 + 6);
|
||||
}
|
||||
|
||||
#[test]
fn enumerate_orphans_only_returns_empty_paths() {
    // OrphansOnly is a store-level reconciliation: it plans no directory wipes.
    let planned = enumerate_paths(ResetScope::OrphansOnly, &Config::defaults());
    assert!(
        planned.is_empty(),
        "OrphansOnly must return empty vec from enumerate_paths"
    );
}
|
||||
}
|
||||
|
||||
@@ -39,13 +39,21 @@ pub struct Capabilities {
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
||||
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
||||
#[serde(default)]
|
||||
pub active_parsers: Vec<String>,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
||||
/// 빈 corpus → empty Vec.
|
||||
#[serde(default)]
|
||||
pub active_chunkers: Vec<String>,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
pub corpus_revision: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
pub doc_count: u64,
|
||||
pub chunk_count: u64,
|
||||
@@ -63,6 +71,26 @@ pub struct Stats {
|
||||
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
|
||||
#[serde(default)]
|
||||
pub stale_doc_count: u64,
|
||||
/// p10-1A-1: code language breakdown (**doc** counts by canonical
|
||||
/// lowercase language identifier). Empty until 1A-2 produces code
|
||||
/// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
|
||||
/// previous "chunk counts" wording was a longstanding mis-label —
|
||||
/// implementation has always been `COUNT(*) FROM documents
|
||||
/// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
|
||||
/// chunk-level companion.
|
||||
#[serde(default)]
|
||||
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
|
||||
/// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
|
||||
/// doc-count wording corrected (mirror of code_lang_breakdown).
|
||||
#[serde(default)]
|
||||
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
|
||||
/// chunk counts instead of doc counts. Indexing-pressure metric —
|
||||
/// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
|
||||
/// here in a way `code_lang_breakdown` (doc count) hides.
|
||||
#[serde(default)]
|
||||
pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
}
|
||||
|
||||
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
@@ -80,6 +108,7 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
"doc_summary.v1",
|
||||
"chunk_inspection.v1",
|
||||
"doctor.v1",
|
||||
"config_migration.v1",
|
||||
"ingest_report.v1",
|
||||
"ingest_progress.v1",
|
||||
"reset_report.v1",
|
||||
@@ -88,6 +117,9 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
"error.v1",
|
||||
"bulk_search_item.v1",
|
||||
"bulk_search_response.v1",
|
||||
// v0.20.x r2 Enhancement 3: OCR statistics + failures introspection.
|
||||
"ocr_stats.v1",
|
||||
"ocr_failures.v1",
|
||||
];
|
||||
|
||||
/// Build a [`SchemaV1`] introspection report for the given config.
|
||||
@@ -122,10 +154,10 @@ fn capabilities_snapshot() -> Capabilities {
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
streaming_ask: true,
|
||||
http_daemon: false,
|
||||
mcp_server: true,
|
||||
single_file_ingest: false,
|
||||
single_file_ingest: true,
|
||||
bulk_search: true,
|
||||
}
|
||||
}
|
||||
@@ -140,12 +172,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(
|
||||
cfg: &Config,
|
||||
store: &kebab_store_sqlite::SqliteStore,
|
||||
) -> anyhow::Result<Stats> {
|
||||
let counts = store
|
||||
.count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?;
|
||||
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
||||
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
||||
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
||||
@@ -158,16 +186,28 @@ fn collect_stats(
|
||||
lang_breakdown: counts.lang_breakdown,
|
||||
index_bytes,
|
||||
stale_doc_count: counts.stale_doc_count,
|
||||
// p10-1A-2: populated by the store query added in this task.
|
||||
code_lang_breakdown: store.code_lang_breakdown()?,
|
||||
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
|
||||
// placeholder — mirror of code_lang_breakdown for the repo field.
|
||||
repo_breakdown: store.repo_breakdown()?,
|
||||
// v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
|
||||
// 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
|
||||
code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
||||
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
||||
Models {
|
||||
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
@@ -182,6 +222,41 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
|
||||
mod tests_stats_ext {
|
||||
use super::*;
|
||||
|
||||
/// p10-1A-1: a serialized `Stats` always carries `code_lang_breakdown` and
/// `repo_breakdown` (plus the v0.17.0 PR-C chunk-level companion), so
/// downstream consumers (MCP skill, Claude Code) can branch on their
/// presence.
#[test]
fn stats_includes_code_lang_and_repo_breakdown_fields() {
    let v = serde_json::to_value(Stats::default()).unwrap();

    // Presence: every breakdown key must exist in the JSON.
    assert!(
        matches!(v.get("code_lang_breakdown"), Some(_)),
        "Stats JSON must include code_lang_breakdown: {v}"
    );
    assert!(
        matches!(v.get("repo_breakdown"), Some(_)),
        "Stats JSON must include repo_breakdown: {v}"
    );
    // v0.17.0 PR-C: chunk-level companion field.
    assert!(
        matches!(v.get("code_lang_chunk_breakdown"), Some(_)),
        "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
    );

    // Shape: an empty BTreeMap serializes as `{}`, i.e. an object, not null.
    assert!(
        v["code_lang_breakdown"].as_object().is_some(),
        "code_lang_breakdown must be an object: {v}"
    );
    assert!(
        v["repo_breakdown"].as_object().is_some(),
        "repo_breakdown must be an object: {v}"
    );
    assert!(
        v["code_lang_chunk_breakdown"].as_object().is_some(),
        "code_lang_chunk_breakdown must be an object: {v}"
    );
}
|
||||
|
||||
#[test]
|
||||
fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
@@ -205,3 +280,27 @@ mod tests_stats_ext {
|
||||
assert_eq!(s.stats.stale_doc_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests_capabilities {
    use super::*;

    /// Bug #9: `kebab ask --stream` emits `answer_event.v1` ndjson events, so
    /// the advertised `streaming_ask` capability has to be `true`.
    #[test]
    fn capabilities_streaming_ask_matches_cli_surface() {
        let streaming_ask = capabilities_snapshot().streaming_ask;
        assert!(streaming_ask, "streaming_ask must be true (Bug #9)");
    }

    /// Bug #9: both `kebab ingest-file <path>` and
    /// `kebab ingest-stdin --title <T>` emit `ingest_report.v1`, so the
    /// advertised `single_file_ingest` capability has to be `true`.
    #[test]
    fn capabilities_single_file_ingest_matches_cli_surface() {
        let single_file_ingest = capabilities_snapshot().single_file_ingest;
        assert!(
            single_file_ingest,
            "single_file_ingest must be true (Bug #9)"
        );
    }
}
|
||||
|
||||
@@ -10,11 +10,7 @@ use kebab_core::SearchHit;
|
||||
///
|
||||
/// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
|
||||
/// rule prevents `kebab-rag → kebab-app`). Update both together.
|
||||
pub fn compute_stale(
|
||||
indexed_at: OffsetDateTime,
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) -> bool {
|
||||
pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
|
||||
if threshold_days == 0 {
|
||||
return false;
|
||||
}
|
||||
@@ -23,11 +19,7 @@ pub fn compute_stale(
|
||||
}
|
||||
|
||||
/// Sets `stale` on each hit in place using `compute_stale`.
|
||||
pub fn mark_stale_in_place(
|
||||
hits: &mut [SearchHit],
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) {
|
||||
pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
|
||||
for h in hits {
|
||||
h.stale = compute_stale(h.indexed_at, now, threshold_days);
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ fn ask_lexical_smoke() {
|
||||
history: Vec::new(),
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
multi_hop: false,
|
||||
};
|
||||
// The fixture workspace contains "ownership" content; the model's
|
||||
// citation behavior depends on its training, so we don't assert on
|
||||
|
||||
1393
crates/kebab-app/tests/code_ingest_smoke.rs
Normal file
1393
crates/kebab-app/tests/code_ingest_smoke.rs
Normal file
File diff suppressed because it is too large
Load Diff
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Lang, OcrText};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
|
||||
/// Test double for `OcrEngine`: replays canned texts, or fails every call.
pub struct MockOcrEngine {
    /// Texts handed out by successive `recognize` calls; once exhausted, the
    /// last entry is repeated.
    expected_texts: Vec<String>,
    /// Index of the next text to return. Behind a `Mutex` because
    /// `recognize` only receives `&self`.
    call_index: Mutex<usize>,
    /// When `true`, every `recognize` call returns an error.
    fail: bool,
}
|
||||
|
||||
impl MockOcrEngine {
|
||||
/// Single text (backward-compat ctor for pdf_ocr_apply.rs 10 sites).
|
||||
pub fn single(text: impl Into<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: vec![text.into()],
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page texts (cursor advances per recognize call).
|
||||
pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: texts,
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrEngine for MockOcrEngine {
|
||||
fn engine_name(&self) -> &'static str {
|
||||
"mock-ocr"
|
||||
}
|
||||
|
||||
fn engine_version(&self) -> String {
|
||||
"mock-v1".to_string()
|
||||
}
|
||||
|
||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||
if self.fail {
|
||||
anyhow::bail!("mock failure");
|
||||
}
|
||||
let mut idx = self.call_index.lock().unwrap();
|
||||
let text = self
|
||||
.expected_texts
|
||||
.get(*idx)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
|
||||
*idx += 1;
|
||||
Ok(OcrText {
|
||||
joined: text,
|
||||
regions: vec![],
|
||||
engine: "mock-ocr".to_string(),
|
||||
engine_version: "mock-v1".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -93,8 +93,7 @@ impl TestEnv {
|
||||
/// directly. Caller can invoke this multiple times to simulate
|
||||
/// re-opening the binary after a corpus revision bump.
|
||||
pub fn app(&self) -> kebab_app::App {
|
||||
kebab_app::App::open_with_config(self.config.clone())
|
||||
.expect("App::open_with_config")
|
||||
kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock_ocr;
|
||||
|
||||
82
crates/kebab-app/tests/config_migrate.rs
Normal file
82
crates/kebab-app/tests/config_migrate.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
use std::fs;
|
||||
|
||||
/// Walks one legacy config through dry-run, real migration, and a repeat
/// run, checking the report flags, the `.bak` file, and the file contents
/// at each step.
#[test]
fn migrate_writes_backup_and_atomic_with_dry_run_noop() {
    let tmp = tempfile::tempdir().unwrap();
    let config_path = tmp.path().join("config.toml");
    let backup_file = tmp.path().join("config.toml.bak");
    let legacy_body = "schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude = [\"*.md\"]\n";
    fs::write(&config_path, legacy_body).unwrap();

    // Dry run: neither the config file nor a backup may be touched.
    let dry = kebab_app::config_migrate_with_config_path(Some(&config_path), true).unwrap();
    assert!(dry.changed);
    assert!(dry.dry_run);
    assert!(dry.backup_path.is_none());
    assert!(!backup_file.exists());
    let after_dry_run = fs::read_to_string(&config_path).unwrap();
    assert!(after_dry_run.contains("include"), "dry-run modified file");

    // Real run: a backup is created and the config file is rewritten.
    let applied = kebab_app::config_migrate_with_config_path(Some(&config_path), false).unwrap();
    assert!(applied.changed);
    assert!(!applied.dry_run);
    assert!(applied.backup_path.is_some());
    assert!(backup_file.exists());
    let migrated = fs::read_to_string(&config_path).unwrap();
    assert!(!migrated.contains("include"));
    assert!(migrated.contains("[ingest.expansion]"));

    // Idempotent: running again reports changed = false.
    let repeated = kebab_app::config_migrate_with_config_path(Some(&config_path), false).unwrap();
    assert!(!repeated.changed);
}
|
||||
|
||||
/// Migrating a path that was never written must return an error.
#[test]
fn migrate_missing_file_errors() {
    let tmp = tempfile::tempdir().unwrap();
    // `nope.toml` is only named here, never created.
    let missing = tmp.path().join("nope.toml");
    let outcome = kebab_app::config_migrate_with_config_path(Some(&missing), false);
    assert!(outcome.is_err());
}
|
||||
|
||||
/// The annotated default document, rendered to text, must contain both the
/// expected section comment and the `[ingest.expansion]` table header.
#[test]
fn annotated_default_serialization_contains_section_comments() {
    let text = kebab_config::migrate::annotated_default_document().to_string();

    let has_section_comment = text.contains("doc-side 별칭");
    assert!(has_section_comment, "section comment missing:\n{text}");

    let has_expansion_table = text.contains("[ingest.expansion]");
    assert!(has_expansion_table);
}
|
||||
|
||||
/// `doctor` must fail its `config_migration` check for a legacy config and
/// pass it once the config has been migrated.
#[test]
fn doctor_flags_outdated_config() {
    let tmp = tempfile::tempdir().unwrap();
    let config_path = tmp.path().join("config.toml");
    let legacy_body = "schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude=[\"*.md\"]\n";
    fs::write(&config_path, legacy_body).unwrap();

    let before = kebab_app::doctor_with_config_path(Some(&config_path)).unwrap();
    let stale_check = before
        .checks
        .iter()
        .find(|entry| entry.name == "config_migration")
        .unwrap();
    assert!(!stale_check.ok, "outdated config should fail check");
    let hint = stale_check.hint.as_deref().unwrap();
    assert!(hint.contains("config migrate"));
    assert!(!before.ok, "overall doctor should be false");

    // After migrating, the same check passes.
    kebab_app::config_migrate_with_config_path(Some(&config_path), false).unwrap();
    let after = kebab_app::doctor_with_config_path(Some(&config_path)).unwrap();
    let fresh_check = after
        .checks
        .iter()
        .find(|entry| entry.name == "config_migration")
        .unwrap();
    assert!(fresh_check.ok, "after migrate should pass");
}
|
||||
@@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App {
|
||||
#[test]
|
||||
fn fetch_chunk_returns_target_only_when_no_context() {
|
||||
let env = common::TestEnv::new();
|
||||
common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
|
||||
common::ingest_md(
|
||||
&env,
|
||||
"a.md",
|
||||
"# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
|
||||
);
|
||||
let app = open(&env);
|
||||
|
||||
// Find a chunk via search to obtain its id.
|
||||
@@ -38,12 +42,17 @@ fn fetch_chunk_returns_target_only_when_no_context() {
|
||||
#[test]
|
||||
fn fetch_chunk_with_context_returns_neighbors() {
|
||||
let env = common::TestEnv::new();
|
||||
let body = "# H1\n\nA1\n\n# H2\n\nA2\n\n# H3\n\nA3\n\n# H4\n\nA4\n\n# H5\n\nA5\n";
|
||||
// v0.17.0 trigram tokenizer: terms must be ≥3 Unicode chars to
|
||||
// match. The earlier fixture used 2-char tokens like `A1`/`A3` for
|
||||
// section bodies — those zero-hit under trigram. Use 5-char unique
|
||||
// words per section so the query can pin one chunk deterministically.
|
||||
let body =
|
||||
"# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
|
||||
common::ingest_md(&env, "multi.md", body);
|
||||
let app = env.app();
|
||||
|
||||
let q = kebab_core::SearchQuery {
|
||||
text: "A3".to_string(),
|
||||
text: "cherry".to_string(),
|
||||
mode: kebab_core::SearchMode::Lexical,
|
||||
k: 1,
|
||||
filters: kebab_core::SearchFilters::default(),
|
||||
@@ -106,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() {
|
||||
.unwrap();
|
||||
assert_eq!(result.kind, FetchKind::Doc);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.contains("Heading One"), "doc text contains heading: {text:?}");
|
||||
assert!(
|
||||
text.contains("Heading One"),
|
||||
"doc text contains heading: {text:?}"
|
||||
);
|
||||
assert!(text.contains("First paragraph"), "doc text contains body");
|
||||
assert!(!result.truncated);
|
||||
}
|
||||
@@ -151,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() {
|
||||
.unwrap();
|
||||
assert!(result.truncated);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
|
||||
assert!(
|
||||
text.chars().count() <= 100,
|
||||
"trimmed text len {}",
|
||||
text.chars().count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -288,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() {
|
||||
fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
|
||||
let env = common::TestEnv::new();
|
||||
// Multi-chunk markdown so context ±N has neighbors.
|
||||
let body =
|
||||
"# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
common::ingest_md(&env, "boundary.md", body);
|
||||
let app = env.app();
|
||||
let q = kebab_core::SearchQuery {
|
||||
|
||||
171
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
171
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
//! Dogfood: auto-purge stored docs for filesystem-deleted files.
|
||||
//!
|
||||
//! Two tests:
|
||||
//!
|
||||
//! 1. `file_deletion_auto_purge` — ingest 2 files, delete one, re-ingest.
|
||||
//! The re-ingest must report `purged_deleted_files = 1`, the deleted
|
||||
//! file must no longer appear in `list_docs`, and lexical search for
|
||||
//! its unique content must return no hits.
|
||||
//!
|
||||
//! 2. `include_scope_narrowing_does_not_purge` — ingest 2 files under a
|
||||
//! wide glob, narrow the walker scope to only one file, re-ingest.
|
||||
//! The narrowed ingest must NOT purge the out-of-scope file because
|
||||
//! the file is still on disk (just excluded from this run). Protects
|
||||
//! users against accidental data loss via config edits.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
|
||||
|
||||
/// Helper: open the store via `TestEnv` and run `list_documents`.
|
||||
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
let store = SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
store
|
||||
.list_documents(&DocFilter::default())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|d| d.doc_path.0)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Ingests two files, deletes one from disk, re-ingests, and checks that the
/// deleted file is purged from the store and from lexical search.
#[test]
fn file_deletion_auto_purge() {
    let env = TestEnv::lexical_only();

    // Same config, scope, and options for both runs.
    let run_ingest = || {
        ingest_with_config_opts(
            env.config.clone(),
            env.scope(),
            false,
            IngestOpts::default(),
        )
    };

    // Two .rs files go into the workspace.
    let a_path = env.workspace_root.join("a.rs");
    let b_path = env.workspace_root.join("b.rs");
    std::fs::write(&a_path, "// file a\nfn alpha() {}\n").unwrap();
    std::fs::write(&b_path, "// file b\nfn bravo() {}\n").unwrap();

    // First ingest: both files must be New. The lower bound is ">= 2"
    // because fixture files may be counted as well.
    let first = run_ingest().expect("first ingest must succeed");
    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );
    assert_eq!(first.errors, 0, "no errors on first ingest: {first:?}");

    // Remove b.rs from the filesystem.
    std::fs::remove_file(&b_path).expect("remove b.rs");

    // Second ingest: b.rs is gone from disk and should be purged.
    let second = run_ingest().expect("second ingest must succeed");
    assert_eq!(
        second.purged_deleted_files, 1,
        "exactly 1 file should be purged: {second:?}"
    );
    assert_eq!(second.new, 0, "no new docs after deletion: {second:?}");
    assert_eq!(second.updated, 0, "no updated docs: {second:?}");
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The store no longer lists b.rs but still lists a.rs.
    let doc_paths = list_doc_paths(&env);
    let is_listed = |name: &str| doc_paths.iter().any(|p| p == name);
    assert!(
        !is_listed("b.rs"),
        "b.rs must be gone from list_docs; got: {doc_paths:?}"
    );
    assert!(
        is_listed("a.rs"),
        "a.rs must still be in list_docs; got: {doc_paths:?}"
    );

    // A lexical search for b.rs's unique content finds nothing.
    let app = env.app();
    let hits = app
        .search(SearchQuery {
            text: "bravo".to_string(),
            mode: SearchMode::Lexical,
            k: 10,
            filters: kebab_core::SearchFilters::default(),
        })
        .expect("search must not error");
    assert!(
        hits.is_empty(),
        "search for deleted file's content must return no hits; got: {hits:?}"
    );
}
|
||||
|
||||
/// Ingests two files under a wide include glob, then re-ingests with the
/// include list narrowed to one file, and checks that the out-of-scope file
/// (still on disk) is not purged from the store.
#[test]
fn include_scope_narrowing_does_not_purge() {
    let env = TestEnv::lexical_only();

    // Builds a scope over the workspace root with a single include pattern
    // and the configured excludes.
    let scope_including = |pattern: &str| SourceScope {
        root: env.workspace_root.clone(),
        include: vec![pattern.to_string()],
        exclude: env.config.workspace.exclude.clone(),
    };

    // Two .rs files go into the workspace.
    let a_path = env.workspace_root.join("a_narrow.rs");
    let b_path = env.workspace_root.join("b_narrow.rs");
    std::fs::write(&a_path, "// narrow a\nfn alpha_narrow() {}\n").unwrap();
    std::fs::write(&b_path, "// narrow b\nfn bravo_narrow() {}\n").unwrap();

    // Wide scope: the first ingest sees both files as New.
    let first = ingest_with_config_opts(
        env.config.clone(),
        scope_including("**/*.rs"),
        false,
        IngestOpts::default(),
    )
    .expect("first ingest (wide) must succeed");
    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );

    // Narrow scope: only a_narrow.rs is included. b_narrow.rs stays on
    // disk but is outside the walker scope for this run.
    let second = ingest_with_config_opts(
        env.config.clone(),
        scope_including("a_narrow.rs"),
        false,
        IngestOpts::default(),
    )
    .expect("second ingest (narrow) must succeed");

    // CRITICAL: b_narrow.rs is still on disk, so it must NOT be purged.
    assert_eq!(
        second.purged_deleted_files, 0,
        "scope-narrowing must NOT purge on-disk files; got: {second:?}"
    );
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The store still lists b_narrow.rs.
    let doc_paths = list_doc_paths(&env);
    let b_still_listed = doc_paths.iter().any(|p| p == "b_narrow.rs");
    assert!(
        b_still_listed,
        "b_narrow.rs must still be in list_docs after scope narrowing; got: {doc_paths:?}"
    );

    // The file itself is still on disk.
    assert!(
        b_path.exists(),
        "b_narrow.rs must still be on disk (we didn't delete it)"
    );
}
|
||||
@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
/// inspectable in stored DB rows.
|
||||
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
|
||||
use image::{ImageBuffer, Rgb};
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let path = root.join(name);
|
||||
img.save(&path).expect("write PNG fixture");
|
||||
path
|
||||
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Counters: scanned should include the PNG; new ≥ 1 (markdown
|
||||
// fixtures from the workspace tree may also count).
|
||||
assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
|
||||
assert!(
|
||||
report.scanned >= 1,
|
||||
"scanned={}, items={:?}",
|
||||
report.scanned,
|
||||
report.items
|
||||
);
|
||||
assert_eq!(report.errors, 0, "no errors on lenient OCR path");
|
||||
|
||||
// Locate the image doc in the report items.
|
||||
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
kebab_core::IngestItemKind::New,
|
||||
"image asset must be classified New on first ingest"
|
||||
);
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
|
||||
assert_eq!(
|
||||
img_item.chunk_count,
|
||||
Some(1),
|
||||
"image emits exactly one chunk"
|
||||
);
|
||||
|
||||
// Inspect the stored chunk text via kb-app's inspect_chunk facade.
|
||||
let doc_id = img_item.doc_id.clone().expect("image doc id");
|
||||
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Sanity: the doc was actually persisted into SQLite (kb-app's
|
||||
// list_docs facade reads the same store the chunker writes to).
|
||||
let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.expect("list_docs");
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
|
||||
assert!(
|
||||
summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
summaries
|
||||
.iter()
|
||||
.any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
"image doc must appear in list_docs"
|
||||
);
|
||||
|
||||
@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("diagram.png"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
let report = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false)
|
||||
.expect("ingest with no OCR/caption")
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
|
||||
})
|
||||
.await
|
||||
.expect("task");
|
||||
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
.find(|i| i.doc_path.0.ends_with("raw.png"))
|
||||
.unwrap();
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
|
||||
let scope1 = scope.clone();
|
||||
let scope2 = scope.clone();
|
||||
|
||||
let r1 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let id1 = r1
|
||||
.items
|
||||
|
||||
@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
// First ingest — populates the DB. Use the legacy entry so the
|
||||
// assertions cover the "previously ingested" set without needing
|
||||
// IngestOpts::default() to behave identically.
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
first.unchanged, 0,
|
||||
"first ingest cannot have unchanged: {first:?}"
|
||||
);
|
||||
|
||||
let scanned = first.scanned;
|
||||
|
||||
@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
|
||||
assert_eq!(
|
||||
second.scanned, scanned,
|
||||
"second scanned matches first: {second:?}"
|
||||
);
|
||||
assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
|
||||
assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"nothing should be marked updated: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.unchanged, scanned,
|
||||
"every doc must be Unchanged: {second:?}"
|
||||
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
fn force_reingest_bypasses_skip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
let scanned = first.scanned;
|
||||
|
||||
let second = ingest_with_config_opts(
|
||||
|
||||
@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
|
||||
// ingest_with_config_progress (no cancel) runs to completion.
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
|
||||
assert!(ext_dir.is_dir());
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1, "exactly one file in _external/");
|
||||
let name = entries[0].file_name().to_string_lossy().into_owned();
|
||||
@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {
|
||||
|
||||
let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
|
||||
assert!(err.to_string().contains("unsupported extension"), "{err}");
|
||||
assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
|
||||
assert!(
|
||||
err.to_string().contains(".docx") || err.to_string().contains("docx"),
|
||||
"{err}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8,8 +8,7 @@ use common::TestEnv;
|
||||
#[test]
|
||||
fn ingest_then_list_inspects_round_trip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
// The fixture has 3 markdown files; first ingest should label them
|
||||
// all as New.
|
||||
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
}
|
||||
|
||||
// list_docs returns the 3 docs.
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3, "docs: {docs:?}");
|
||||
|
||||
// inspect_doc round-trips one of them.
|
||||
let any_doc_id = docs[0].doc_id.clone();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
|
||||
.unwrap();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
|
||||
assert_eq!(canonical.doc_id, any_doc_id);
|
||||
assert!(!canonical.blocks.is_empty(), "blocks empty");
|
||||
}
|
||||
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
fn ingest_idempotent_on_second_run() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let r1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(r1.new, 3);
|
||||
|
||||
let r2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
// Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
|
||||
// path: when checksum + parser/chunker/embedding versions all match,
|
||||
// the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
|
||||
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
|
||||
assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");
|
||||
|
||||
// list_docs still has 3 docs (no duplicates).
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_summary_only_drops_items() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert!(report.items.is_none(), "summary-only should null items");
|
||||
}
|
||||
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
|
||||
// of every run. `summary_only=true` writes `items_json=NULL`; the
|
||||
// counts MUST still be present.
|
||||
let env = TestEnv::lexical_only();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("kebab.sqlite");
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
|
||||
let (scanned, new_c, updated, skipped, errors, items_json): (
|
||||
i64,
|
||||
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
|
||||
// tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
|
||||
// tables under it).
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "lexical-only run must not error");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("lancedb");
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
|
||||
if lance_dir.exists() {
|
||||
// If the dir was created (e.g., by an earlier consumer touching
|
||||
// the path), it MUST contain no `.lance` tables.
|
||||
let mut had_lance_table = false;
|
||||
for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
|
||||
let entry = entry.unwrap();
|
||||
if entry
|
||||
.path()
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
== Some("lance")
|
||||
{
|
||||
if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
|
||||
had_lance_table = true;
|
||||
break;
|
||||
}
|
||||
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
|
||||
tags_any: vec!["rust".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let rust_docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
// intro.md and notes/cargo.md both tag "rust".
|
||||
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
|
||||
}
|
||||
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
|
||||
#[test]
|
||||
fn inspect_doc_not_found_returns_actionable_error() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bogus =
|
||||
kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
|
||||
let bogus = kebab_core::DocumentId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(
|
||||
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
|
||||
let bogus = kebab_core::ChunkId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
|
||||
.unwrap_err();
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("not found"), "got: {msg}");
|
||||
}
|
||||
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
|
||||
#[test]
|
||||
fn ingest_stamps_chunker_version_on_document() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert!(!docs.is_empty(), "no docs after ingest");
|
||||
|
||||
for doc_entry in &docs {
|
||||
let canonical =
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
|
||||
.unwrap();
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
|
||||
assert!(
|
||||
canonical.last_chunker_version.is_some(),
|
||||
"last_chunker_version must be stamped for doc {}: got {:?}",
|
||||
|
||||
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
// crates/kebab-app/tests/ingest_log_smoke.rs
|
||||
//
|
||||
// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config_opts};
|
||||
use kebab_config::{Config, LoggingCfg};
|
||||
use kebab_core::SourceScope;
|
||||
use serde_json::Value;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
|
||||
let data_dir = workspace.parent().unwrap().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = workspace.parent().unwrap().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
cfg
|
||||
}
|
||||
|
||||
/// AC-9: after an ingest with logging enabled, exactly one
/// `ingest-*.ndjson` file exists, every line is valid JSON carrying a known
/// `kind`, and the final line is a `summary` record with `scanned > 0`.
#[test]
fn ingest_log_smoke() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    // 1. Minimal corpus: one markdown file, plus the scanned PDF fixture when
    //    it is present on disk (OCR disabled — no Ollama needed).
    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nThis is a smoke test.\n",
    )
    .unwrap();
    let pdf_src = PathBuf::from("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    if pdf_src.exists() {
        std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
    }

    // 2. Config with logging enabled, scoped to the temporary workspace.
    let cfg = minimal_config(&workspace, &log_dir);
    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    // 3. Run ingest.
    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // 4. Exactly one ingest-*.ndjson file must exist in log_dir.
    let is_ingest_log = |entry: &std::fs::DirEntry| {
        let name = entry.file_name();
        let name = name.to_string_lossy();
        name.starts_with("ingest-") && name.ends_with(".ndjson")
    };
    let log_files: Vec<_> = std::fs::read_dir(&log_dir)
        .unwrap()
        .filter_map(Result::ok)
        .filter(is_ingest_log)
        .collect();
    assert_eq!(
        log_files.len(),
        1,
        "expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
    );

    // 5. Every line parses as JSON and carries a recognised `kind`.
    let body = std::fs::read_to_string(log_files[0].path()).unwrap();
    let lines: Vec<&str> = body.lines().collect();
    assert!(!lines.is_empty(), "log file should not be empty");

    let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
    for line in &lines {
        let record: Value = match serde_json::from_str(line) {
            Ok(value) => value,
            Err(e) => panic!("line is not valid JSON: {e}\nline: {line}"),
        };
        let kind = match record.get("kind").and_then(Value::as_str) {
            Some(kind) => kind,
            None => panic!("line missing 'kind' field: {line}"),
        };
        assert!(
            valid_kinds.contains(&kind),
            "unexpected kind '{kind}' in line: {line}"
        );
    }

    // 6. The last line must be the summary record with scanned > 0.
    let last = lines.last().unwrap();
    let last_v: Value = serde_json::from_str(last).unwrap();
    let last_kind = last_v.get("kind").and_then(Value::as_str);
    assert_eq!(
        last_kind,
        Some("summary"),
        "last line must be kind=summary, got: {last}"
    );
    // A missing or non-integer `scanned` field is treated as 0 and fails below.
    let scanned = last_v.get("scanned").and_then(Value::as_u64).unwrap_or(0);
    assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
}
|
||||
|
||||
/// AC-6: with `ingest_log_enabled = false`, an ingest run must not create any
/// `ingest-*.ndjson` file (the log directory may be absent altogether).
#[test]
fn ingest_log_disabled_emits_no_file() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path();

    let workspace = root.join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = root.join("logs");

    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nDisabled log test.\n",
    )
    .unwrap();

    let data_dir = root.join("data");
    std::fs::create_dir_all(&data_dir).unwrap();
    let model_dir = root.join("models");
    std::fs::create_dir_all(&model_dir).unwrap();

    // Same shape as the enabled case, but with the ingest log turned off.
    let mut cfg = Config::defaults();
    cfg.workspace.root = workspace.to_string_lossy().into_owned();
    cfg.workspace.exclude.clear();
    cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
    cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
    cfg.models.embedding.provider = "none".to_string();
    cfg.models.embedding.dimensions = 0;
    cfg.logging = LoggingCfg {
        ingest_log_dir: log_dir.clone(),
        ingest_log_enabled: false,
        ..Default::default()
    };

    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // log_dir should either not exist or contain 0 ingest-*.ndjson files.
    let is_ingest_log = |entry: &std::fs::DirEntry| {
        let name = entry.file_name();
        let name = name.to_string_lossy();
        name.starts_with("ingest-") && name.ends_with(".ndjson")
    };
    let log_file_count = if !log_dir.exists() {
        0
    } else {
        std::fs::read_dir(&log_dir)
            .unwrap()
            .filter_map(Result::ok)
            .filter(is_ingest_log)
            .count()
    };
    assert_eq!(
        log_file_count, 0,
        "no ingest-*.ndjson file should be created when disabled"
    );
}
|
||||
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
|
||||
//!
|
||||
//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
|
||||
//! Manual invoke:
|
||||
//! KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
//! cargo test -p kebab-app --test ingest_pdf_ocr_smoke --ignored -j 4
|
||||
//!
|
||||
//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
|
||||
//! to verify the cancel wiring doesn't panic/deadlock.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Returns the OCR endpoint from `KEBAB_PDF_OCR_ENDPOINT`, falling back to
/// `http://localhost:11434` when the variable cannot be read.
fn ollama_endpoint() -> String {
    match std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        Ok(endpoint) => endpoint,
        Err(_) => "http://localhost:11434".to_string(),
    }
}
|
||||
|
||||
fn make_ocr_env_real() -> TestEnv {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.models.embedding.provider = "none".to_string();
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let dest = env.workspace_root.join("scanned_page1.pdf");
|
||||
std::fs::copy(&src, &dest).expect("copy scanned_page1.pdf to workspace");
|
||||
|
||||
env
|
||||
}
|
||||
|
||||
/// § Acceptance §9 #1 — ingest against a real Ollama endpoint must report the
/// scanned PDF with `pdf_ocr_pages = Some(1)`.
///
/// Despite the "mock" in its name, this test uses the environment built by
/// `make_ocr_env_real` and is therefore ignored by default.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
    let env = make_ocr_env_real();

    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
    assert!(report.new >= 1, "at least one PDF ingested: {report:?}");

    // Locate the PDF entry among the per-item results.
    let items = report.items.unwrap_or_default();
    let found = items.iter().find(|item| item.doc_path.0.ends_with(".pdf"));
    let pdf_item = match found {
        Some(item) => item,
        None => panic!("PDF item must appear in ingest report items: {items:?}"),
    };

    let pages = match pdf_item.pdf_ocr_pages {
        Some(pages) => pages,
        None => panic!("pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}"),
    };
    assert_eq!(pages, 1, "scanned_page1.pdf has exactly 1 page");
}
|
||||
|
||||
/// § Acceptance §9 #2 — text produced by OCR must be indexed and come back
/// from a lexical search. Requires a real Ollama endpoint, so ignored by
/// default.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ocr_text_indexed_and_searchable() {
    let env = make_ocr_env_real();
    kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");

    // "다음" is a Korean morpheme expected in the qwen2.5vl:3b OCR output for
    // the PoC ground-truth page; it is a high-frequency token in the
    // page1.txt truth file.
    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("다음"))
        .expect("search");

    assert!(
        !hits.is_empty(),
        "OCR-indexed text must surface in lexical search results"
    );
}
|
||||
|
||||
/// Smoke test for the production cancel wiring: the cancel flag is already set
/// when ingest starts.
///
/// The OCR endpoint is a dummy (port 1, expected to refuse connections), so
/// any OCR HTTP call would fail; the pre-set flag is meant to stop the loop
/// before OCR is reached. The test only checks that the call returns —
/// no panic, no deadlock — whatever the `Ok`/`Err` outcome.
#[test]
fn ingest_with_cancel_aborts_mid_pdf() {
    let mut env = TestEnv::lexical_only();
    env.config.pdf.ocr.enabled = true;
    env.config.pdf.ocr.endpoint = Some("http://127.0.0.1:1".to_string());

    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let fixture = manifest_dir
        .parent()
        .unwrap()
        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let target = env.workspace_root.join("scanned_page1.pdf");
    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");

    // Pre-set flag — abort immediately.
    let cancel = Arc::new(AtomicBool::new(true));

    // Both Ok (pre-cancel exit) and Err (eager OCR engine failure) are
    // acceptable, so the outcome is deliberately left unchecked.
    let _outcome = kebab_app::ingest_with_config_cancellable(
        env.config.clone(),
        env.scope(),
        false,
        None,
        Some(cancel),
    );
}
|
||||
@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
|
||||
fn run_with_progress() -> Vec<IngestEvent> {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -116,13 +112,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
|
||||
// `ingest_with_config_progress(..., None)` must produce identical
|
||||
// reports modulo wall-clock duration.
|
||||
let env = TestEnv::lexical_only();
|
||||
let r_none = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
let r_none =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
|
||||
.unwrap();
|
||||
assert_eq!(r_none.scanned, 3);
|
||||
assert_eq!(r_none.new, 3);
|
||||
}
|
||||
@@ -134,12 +126,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
drop(rx);
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
}
|
||||
|
||||
/// v0.20.0 sub-item 1: verifies that `PdfOcrStarted` and `PdfOcrFinished`
/// events are emitted while ingesting a PDF asset with OCR enabled. Depends on
/// a real Ollama endpoint — `#[ignore]` by default.
///
/// Manual invoke:
/// ```text
/// KEBAB_PDF_OCR_ENABLED=true \
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
/// cargo test -p kebab-app --test ingest_progress \
///   --ignored pdf_ocr_progress_emits_started_finished_events
/// ```
#[test]
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
fn pdf_ocr_progress_emits_started_finished_events() {
    // Copy the F1 fixture (DCTDecode JPEG passthrough) into a workspace
    // directory under a fresh tmpdir.
    let tmpdir = tempfile::tempdir().expect("create tmpdir");
    let workspace = tmpdir.path().join("workspace");
    std::fs::create_dir_all(&workspace).expect("create workspace dir");
    let f1_src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let f1 = std::fs::read(&f1_src).expect("F1 fixture present");
    std::fs::write(workspace.join("page1.pdf"), &f1).expect("copy F1");

    let data_dir = tmpdir.path().join("data");
    std::fs::create_dir_all(&data_dir).expect("create data dir");

    // OCR on, embeddings off; the endpoint is overridden only when the
    // environment variable is set.
    let mut config = kebab_config::Config::defaults();
    config.workspace.root = workspace.to_string_lossy().into_owned();
    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
    config.models.embedding.provider = "none".to_string();
    config.models.embedding.dimensions = 0;
    config.pdf.ocr.enabled = true;
    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        config.pdf.ocr.endpoint = Some(endpoint);
    }

    let scope = kebab_core::SourceScope {
        root: workspace.clone(),
        ..Default::default()
    };

    let (tx, rx) = mpsc::channel::<IngestEvent>();
    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
        .expect("ingest_with_config_progress");

    // Drain the channel and tally the two OCR event kinds in one pass.
    let mut started_count = 0usize;
    let mut finished_count = 0usize;
    for event in rx.iter() {
        match event {
            IngestEvent::PdfOcrStarted { .. } => started_count += 1,
            IngestEvent::PdfOcrFinished { .. } => finished_count += 1,
            _ => {}
        }
    }

    assert!(
        started_count >= 1,
        "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
    );
    assert!(
        finished_count >= 1,
        "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
    );
    assert_eq!(
        started_count, finished_count,
        "Started 와 Finished 의 count 일치"
    );
}
|
||||
|
||||
@@ -29,13 +29,15 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
|
||||
"## Body content\n\nMore.",
|
||||
"Article X",
|
||||
Some("https://example.com/x"),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(report.new, 1, "{report:?}");
|
||||
|
||||
// _external/ contains exactly one .md file with frontmatter.
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1);
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
@@ -50,17 +52,14 @@ fn ingest_stdin_without_source_uri() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = fresh_cfg(dir.path());
|
||||
|
||||
let report = kebab_app::ingest_stdin_with_config(
|
||||
cfg.clone(),
|
||||
"## Body",
|
||||
"Title",
|
||||
None,
|
||||
).unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
|
||||
assert_eq!(report.new, 1);
|
||||
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
assert!(content.contains("title: \"Title\""));
|
||||
|
||||
@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
|
||||
}
|
||||
kebab_app::init_workspace(true).expect("init_workspace");
|
||||
let cfg_path = kebab_config::Config::xdg_config_path();
|
||||
let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
|
||||
panic!("read config at {}: {e}", cfg_path.display())
|
||||
});
|
||||
let body = std::fs::read_to_string(&cfg_path)
|
||||
.unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
|
||||
assert!(
|
||||
body.contains("처리 가능한 형식"),
|
||||
"header lists supported types section: body=\n{body}"
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
|
||||
//! v0.20.0 sub-item 1 bugfix.
|
||||
//!
|
||||
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
|
||||
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
|
||||
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
|
||||
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
|
||||
|
||||
mod common;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_chunk::PdfPageV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
|
||||
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
|
||||
let fake_hash: String = hash_char.to_string().repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type: MediaType::Pdf,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_and_ocr(
|
||||
bytes: &[u8],
|
||||
path: &str,
|
||||
hash_char: char,
|
||||
engine: &dyn OcrEngine,
|
||||
) -> kebab_core::CanonicalDocument {
|
||||
let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
|
||||
canonical
|
||||
}
|
||||
|
||||
/// Bug #3 regression: chunk ids produced from two scanned PDFs must be unique
/// across both documents, including when one page's OCR text is long enough
/// to be split into several chunks.
#[test]
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
        .expect("F1 fixture missing");
    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
        .expect("F2 fixture missing");

    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
    // overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
    // collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
    let head = "가".repeat(10);
    let tail = "나".repeat(500);
    let trigger_text = format!("{}. {}", head, tail);

    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
    let f2_engine = MockOcrEngine::single(&trigger_text, false);

    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);

    let chunk_policy = ChunkPolicy {
        target_tokens: 500,
        overlap_tokens: 80,
        respect_markdown_headings: false,
        chunker_version: PdfPageV1Chunker.chunker_version(),
    };

    let f1_chunks = PdfPageV1Chunker
        .chunk(&f1_canonical, &chunk_policy)
        .unwrap();
    let f2_chunks = PdfPageV1Chunker
        .chunk(&f2_canonical, &chunk_policy)
        .unwrap();

    // Precondition: a single chunk could never collide with itself.
    assert!(
        f2_chunks.len() >= 2,
        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
        f2_chunks.len()
    );

    // Count every id while collecting the distinct ones.
    let mut total = 0usize;
    let mut unique: HashSet<&str> = HashSet::new();
    for chunk in f1_chunks.iter().chain(f2_chunks.iter()) {
        total += 1;
        unique.insert(chunk.chunk_id.0.as_str());
    }
    assert_eq!(
        unique.len(),
        total,
        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
        unique.len(),
        total,
    );
}
|
||||
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
//! Integration smoke tests for `kebab inspect ocr-stats / ocr-failures`.
|
||||
//! AC-4, AC-5, AC-6, AC-11 (ocr_inspect_smoke binary), AC-13.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::App;
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// Insert synthetic pdf_ocr_events rows directly so the test runs without
|
||||
/// a live Ollama endpoint.
|
||||
fn seed_ocr_events(env: &TestEnv, store: &SqliteStore) {
|
||||
// Success rows
|
||||
for i in 0..3u32 {
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-aaa",
|
||||
&format!("2026-05-28T0{i}:00:00Z"),
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
i + 1,
|
||||
Some(50_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
100 + u64::from(i) * 20,
|
||||
42,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed success row");
|
||||
}
|
||||
// Failure row
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-bbb",
|
||||
"2026-05-28T10:00:00Z",
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
4,
|
||||
Some(30_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
9999,
|
||||
0,
|
||||
false,
|
||||
Some("ocr_error"),
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed failure row");
|
||||
// Row for different doc
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-ccc",
|
||||
"2026-05-28T11:00:00Z",
|
||||
Some("doc-xyz"),
|
||||
"path/other.pdf",
|
||||
1,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
200,
|
||||
10,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed doc-xyz row");
|
||||
// Trigger migration (no-op if already done via App::open_with_config)
|
||||
let _ = env;
|
||||
}
|
||||
|
||||
fn open_app_with_seeded_events(env: &TestEnv) -> App {
|
||||
let app = env.app();
|
||||
let store = SqliteStore::open(&env.config).expect("open store for seed");
|
||||
store.run_migrations().expect("run migrations for seed");
|
||||
seed_ocr_events(env, &store);
|
||||
app
|
||||
}
|
||||
|
||||
/// AC-4: `inspect_ocr_stats` reports `schema_version = "ocr_stats.v1"`, at
/// least one event and one run, a `success_rate` within `[0, 1]`, and a
/// non-empty per-engine breakdown.
#[test]
fn ocr_stats_after_seeded_events() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let stats = app.inspect_ocr_stats().expect("inspect_ocr_stats");

    assert_eq!(stats.schema_version, "ocr_stats.v1");
    assert!(stats.total_events >= 1, "total_events should be >= 1");

    let rate_in_unit_interval = stats.success_rate >= 0.0 && stats.success_rate <= 1.0;
    assert!(
        rate_in_unit_interval,
        "success_rate must be in [0, 1]: {}",
        stats.success_rate
    );

    assert!(stats.total_runs >= 1, "total_runs should be >= 1");
    // The breakdown needs at least one entry.
    assert!(!stats.by_engine.is_empty(), "by_engine must be non-empty");
}
|
||||
|
||||
/// AC-6: `inspect_ocr_failures` without a doc_id filter (corpus-wide) returns
/// the `ocr_failures.v1` schema with at least one failure listed.
#[test]
fn ocr_failures_corpus_wide() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let report = app
        .inspect_ocr_failures(None, 10)
        .expect("inspect_ocr_failures");

    assert_eq!(report.schema_version, "ocr_failures.v1");
    assert!(report.failure_count >= 1, "expected at least 1 failure");
    let has_rows = !report.failures.is_empty();
    assert!(has_rows, "failures list must be non-empty");
}
|
||||
|
||||
/// AC-5: `inspect_ocr_failures` with a doc_id filter echoes the filter back
/// and returns that document's failure rows.
#[test]
fn ocr_failures_filter_by_doc_id() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let result = app
        .inspect_ocr_failures(Some("doc-abc"), 10)
        .expect("inspect_ocr_failures by doc_id");

    assert_eq!(result.schema_version, "ocr_failures.v1");
    assert_eq!(
        result.doc_id.as_deref(),
        Some("doc-abc"),
        "doc_id must be echoed back"
    );

    // The seed data contains one failed event for doc-abc, so an empty list
    // means the filter dropped it. Without this check the loop below would
    // pass vacuously.
    assert!(
        !result.failures.is_empty(),
        "failures list must be non-empty for doc-abc"
    );

    // Every returned row must carry the seeded failure reason. This does not
    // by itself prove the rows belong to doc-abc: the only other seeded
    // document (doc-xyz) has no failure rows, so a cross-doc leak would not
    // show up through `reason`.
    for row in &result.failures {
        assert_eq!(row.reason, "ocr_error");
    }
}
|
||||
|
||||
/// AC-13: the Claude Code integration's SKILL.md must mention both new wire
/// schemas, `ocr_stats.v1` and `ocr_failures.v1`.
#[test]
fn skill_md_lists_new_schemas() {
    let skill_md = std::fs::read_to_string("../../integrations/claude-code/kebab/SKILL.md")
        .expect("read SKILL.md");

    // (schema name, failure message) pairs, checked in order.
    let expectations = [
        ("ocr_stats.v1", "SKILL.md must mention ocr_stats.v1"),
        ("ocr_failures.v1", "SKILL.md must mention ocr_failures.v1"),
    ];
    for (schema, message) in expectations {
        assert!(skill_md.contains(schema), "{}", message);
    }
}
|
||||
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
//! Tests for `App::open_with_config`'s NLI verifier construction path.
|
||||
//!
|
||||
//! Coverage:
|
||||
//! 1. `open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive` —
|
||||
//! when `rag.nli_threshold > 0` and `storage.model_dir` is unwritable,
|
||||
//! `open_with_config` returns `Err` with "OnnxNliVerifier" in the
|
||||
//! error chain.
|
||||
//! 2. `open_with_config_nli_skipped_when_threshold_zero` —
|
||||
//! same bad `model_dir`, but `rag.nli_threshold = 0.0` (gate disabled),
|
||||
//! so `OnnxNliVerifier::new` is never called and `open_with_config`
|
||||
//! succeeds.
|
||||
//!
|
||||
//! `/proc/1/root` is the init process's filesystem root; on Linux it is
|
||||
//! owned by root and not traversable by unprivileged users, making
|
||||
//! `create_dir_all` fail with `EACCES` — a reliable "unwritable path"
|
||||
//! that requires no test setup beyond the path literal.
|
||||
|
||||
use kebab_config::Config;
|
||||
|
||||
/// Return a `Config` whose `data_dir` lives in a fresh `TempDir`
|
||||
/// (so `SqliteStore::open` succeeds) and whose `model_dir` is set to
|
||||
/// `/proc/1/root` (unwritable by non-root processes on Linux).
|
||||
///
|
||||
/// The `TempDir` is returned alongside the config so the caller keeps
|
||||
/// it alive until the test completes — dropping it early would delete
|
||||
/// the data directory before any assertions run.
|
||||
fn config_with_unwritable_model_dir() -> (tempfile::TempDir, Config) {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let mut cfg = Config::defaults();
|
||||
// Valid data_dir → SqliteStore::open + run_migrations succeed.
|
||||
cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();
|
||||
// /proc/1/root is only accessible to root; create_dir_all will
|
||||
// return EACCES for any unprivileged user, which is exactly the
|
||||
// failure mode we want to exercise.
|
||||
cfg.storage.model_dir = "/proc/1/root".to_string();
|
||||
(tmp, cfg)
|
||||
}
|
||||
|
||||
// ── 1. Failure path: threshold > 0 + unwritable model_dir ─────────────────
|
||||
|
||||
/// With `rag.nli_threshold > 0` and an unwritable `model_dir`,
/// `App::open_with_config` must fail and its error chain must name
/// `OnnxNliVerifier`.
#[test]
fn open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive() {
    let (_tmp, mut cfg) = config_with_unwritable_model_dir();
    // Gate enabled → the NLI verifier is constructed.
    cfg.rag.nli_threshold = 0.5;

    let err = match kebab_app::App::open_with_config(cfg) {
        Err(err) => err,
        Ok(_) => panic!(
            "App::open_with_config must fail when model_dir is unwritable and nli_threshold > 0"
        ),
    };

    // The Debug rendering of the error must name the verifier, so an operator
    // reading logs can trace the failure back to the NLI configuration.
    let err_chain = format!("{err:?}");
    let names_verifier = err_chain.contains("OnnxNliVerifier");
    assert!(
        names_verifier,
        "error chain must mention OnnxNliVerifier; full chain: {err_chain}"
    );
}
|
||||
|
||||
// ── 2. Success path: threshold = 0.0 → NLI verifier never constructed ──────
|
||||
|
||||
/// With the default `rag.nli_threshold` of 0.0 (gate disabled), the same
/// unwritable `model_dir` must not prevent `App::open_with_config` from
/// succeeding.
#[test]
fn open_with_config_nli_skipped_when_threshold_zero() {
    let (_tmp, cfg) = config_with_unwritable_model_dir();

    // Precondition: the default threshold is 0.0, i.e. the gate is off.
    let threshold_is_zero = cfg.rag.nli_threshold.abs() < f32::EPSILON;
    assert!(
        threshold_is_zero,
        "precondition: default nli_threshold must be 0.0 (gate disabled)"
    );

    // A bad model_dir must NOT cause a failure when the NLI gate is off.
    let outcome = kebab_app::App::open_with_config(cfg);
    assert!(
        outcome.is_ok(),
        "App::open_with_config must succeed when nli_threshold = 0.0 \
         (OnnxNliVerifier is never constructed); err: {:?}",
        outcome.err()
    );
}
|
||||
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,358 @@
|
||||
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_core::{
|
||||
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
|
||||
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── Fixture helpers ───────────────────────────────────────────────────────
|
||||
|
||||
/// Reads the F1 fixture (`scanned_page1.pdf`) from the sibling
/// `kebab-parse-pdf` crate; panics if it cannot be read.
fn f1_pdf_bytes() -> Vec<u8> {
    let fixture = Path::new("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    std::fs::read(fixture).expect("F1 fixture missing")
}
|
||||
|
||||
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
|
||||
let fake_hash = "0".repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash.clone()),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
|
||||
/// F1 (scanned) returns an empty-text Block::Paragraph per page.
|
||||
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
|
||||
let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
|
||||
}
|
||||
|
||||
/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
|
||||
fn canonical_with_empty_block() -> CanonicalDocument {
|
||||
extract_canonical_from_bytes(&f1_pdf_bytes())
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
|
||||
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let char_count = text.chars().count() as u32;
|
||||
tb.text = text.to_string();
|
||||
tb.inlines = vec![Inline::Text {
|
||||
text: text.to_string(),
|
||||
}];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
|
||||
fn canonical_with_mojibake_block() -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
|
||||
let char_count = pua.chars().count() as u32;
|
||||
tb.text = pua.clone();
|
||||
tb.inlines = vec![Inline::Text { text: pua }];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
fn default_opts(enabled: bool) -> PdfOcrOpts {
|
||||
PdfOcrOpts {
|
||||
enabled,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Test 1: F1 + enabled=true → the empty page block is mutated in place
#[test]
fn f1_input_with_ocr_enabled_replaces_empty_block() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    let ocr = MockOcrEngine::single("MOCK_OCR_TEXT", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: Some(Lang("kor".into())),
        cancel: None,
    };

    let summary = apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &opts, |_| {}).unwrap();
    assert_eq!(summary.pages_ocrd, 1);

    // Take the first paragraph block, if any, and check it carries the mock text.
    let mut paragraphs = doc.blocks.iter().filter_map(|block| match block {
        Block::Paragraph(tb) => Some(tb),
        _ => None,
    });
    let first_para = paragraphs.next();
    assert!(first_para.is_some());
    assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
}
|
||||
|
||||
// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skipped (needs_ocr=false)
//
// The page already carries enough valid text, so the engine must not be
// invoked and the original paragraph text must survive.
#[test]
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
    // F1 bytes are reused on purpose: the decision is based on canonical text.
    let bytes = f1_pdf_bytes();
    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
    let mut canonical = canonical_with_filled_block(text);
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");

    // Require the paragraph to exist: a missing paragraph previously made the
    // text assertion silently unreachable and let the test pass vacuously.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must still contain a Block::Paragraph");
    assert!(first_para.text.starts_with("충분한"), "원본 text 보존");
}
|
||||
|
||||
// Test 3: F1 + enabled=false → no-op
//
// With OCR disabled the summary must report no work, and the document's
// blocks must be left exactly as extraction produced them.
#[test]
fn f1_input_with_ocr_disabled_keeps_empty_block() {
    let bytes = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("IGNORED", false);
    let opts = default_opts(false);

    // Snapshot helper: the text of every paragraph block, in document order.
    let paragraph_texts = |doc: &CanonicalDocument| -> Vec<String> {
        doc.blocks
            .iter()
            .filter_map(|b| match b {
                Block::Paragraph(tb) => Some(tb.text.clone()),
                _ => None,
            })
            .collect()
    };
    let blocks_before = canonical.blocks.len();
    let texts_before = paragraph_texts(&canonical);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0);
    assert_eq!(summary.ms_total, 0);

    // The test name promises the block is kept, so verify it instead of only
    // checking the summary counters.
    assert_eq!(
        canonical.blocks.len(),
        blocks_before,
        "disabled OCR must not add or remove blocks"
    );
    assert_eq!(
        paragraph_texts(&canonical),
        texts_before,
        "disabled OCR must leave paragraph text untouched"
    );
}
|
||||
|
||||
// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate
//
// A page whose text is all private-use codepoints falls below the valid-ratio
// threshold, so OCR must run and overwrite the paragraph text.
#[test]
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
    let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
    let mut canonical = canonical_with_mojibake_block();
    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");

    // Require the paragraph to exist: a missing paragraph previously skipped
    // the text assertion and let the test pass vacuously.
    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("canonical must still contain a Block::Paragraph");
    assert_eq!(first_para.text, "OCR_MOJIBAKE_REPLACEMENT");
}
|
||||
|
||||
// Test 5: filled canonical + always_on=true → dual-block (+1 OCR block)
//
// Even though the page text is good enough to skip OCR, always_on forces an
// OCR pass whose result is appended next to the original paragraph.
#[test]
fn f3_input_with_always_on_pushes_dual_blocks() {
    let pdf = f1_pdf_bytes();
    let vector_text =
        "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
    let mut doc = canonical_with_filled_block(vector_text);
    let blocks_before = doc.blocks.len();
    let ocr = MockOcrEngine::single("OCR_DUAL", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: true,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    let summary = apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    assert_eq!(
        doc.blocks.len(),
        blocks_before + 1,
        "always_on 시 새 Block::Paragraph push"
    );

    // Gather the text of every paragraph block in document order.
    let mut paragraph_texts: Vec<&str> = Vec::new();
    for block in &doc.blocks {
        if let Block::Paragraph(tb) = block {
            paragraph_texts.push(tb.text.as_str());
        }
    }

    assert!(paragraph_texts.contains(&"OCR_DUAL"), "OCR block 포함");
    assert!(
        paragraph_texts.iter().any(|t| t.starts_with("vector")),
        "원본 text-detect block 보존"
    );
}
|
||||
|
||||
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → skip + warning
//
// The canonical document comes from F1 (so page 1 needs OCR) while the bytes
// come from the FlateDecode fixture, whose page image cannot be extracted.
#[test]
fn f6_flatedecode_skipped_with_warning() {
    let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
        .expect("F6 fixture missing");
    let mut canonical = canonical_with_empty_block(); // page-1 block from F1
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let count_warnings = |doc: &CanonicalDocument| -> usize {
        doc.provenance
            .events
            .iter()
            .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
            .count()
    };
    // Baseline: the extracted document may already contain Warning events, so
    // an absolute ">= 1" check could pass without the skip emitting anything.
    let warnings_before = count_warnings(&canonical);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(
        summary.pages_ocrd, 0,
        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
    );
    let warnings_after = count_warnings(&canonical);
    assert!(
        warnings_after > warnings_before,
        "FlateDecode skip 시 Warning event 발행"
    );
}
|
||||
|
||||
// Test 7: F7 CCITTFax → skip + warning (verifier M-4 split)
//
// The canonical document comes from F1 (so page 1 needs OCR) while the bytes
// come from the CCITTFax fixture.
#[test]
fn f7_ccittfax_skipped_with_warning() {
    let bytes =
        std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
    let mut canonical = canonical_with_empty_block(); // page-1 block from F1
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let count_warnings = |doc: &CanonicalDocument| -> usize {
        doc.provenance
            .events
            .iter()
            .filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
            .count()
    };
    // Baseline: the extracted document may already contain Warning events, so
    // an absolute ">= 1" check could pass without the skip emitting anything.
    let warnings_before = count_warnings(&canonical);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
    let warnings_after = count_warnings(&canonical);
    assert!(
        warnings_after > warnings_before,
        "CCITTFax skip 시 Warning event 발행"
    );
}
|
||||
|
||||
// Test 8: OCR engine failure → warning event + skip
//
// The mock engine is built with its failure flag set; the call must still
// return Ok, count zero OCR'd pages, and record the engine's error text in a
// Warning event note.
#[test]
fn ocr_engine_failure_surfaces_as_warning() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    let failing_engine = MockOcrEngine::single("", true);
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut doc, &failing_engine, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");

    let mut warning_with_failure = false;
    for event in doc.provenance.events.iter() {
        if event.kind != kebab_core::ProvenanceKind::Warning {
            continue;
        }
        let note = event.note.as_deref().unwrap_or("");
        if note.contains("mock failure") {
            warning_with_failure = true;
            break;
        }
    }
    assert!(
        warning_with_failure,
        "OCR failure 의 error message 가 warning event 의 note 안"
    );
}
|
||||
|
||||
// Test 9: dual-block ordinals are deterministic and unique
//
// NOTE(review): the assertions below check the paragraph count and that both
// paragraphs are on page 1; the ordinal values themselves are not inspected.
#[test]
fn dual_block_ordinals_are_deterministic_and_unique() {
    let pdf = f1_pdf_bytes(); // 1-page PDF → page_count=1
    let vector_text = "vector 충분한 텍스트. This text has more than twenty characters total.";
    let mut doc = canonical_with_filled_block(vector_text);
    let ocr = MockOcrEngine::single("DUAL", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: true,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: None,
    };

    apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &opts, |_| {}).unwrap();

    // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
    let mut para_count = 0usize;
    let mut all_page_1 = true;
    for block in &doc.blocks {
        if let Block::Paragraph(tb) = block {
            para_count += 1;
            if !matches!(tb.common.source_span, SourceSpan::Page { page: 1, .. }) {
                all_page_1 = false;
            }
        }
    }

    assert_eq!(para_count, 2, "dual-block: text-detect + OCR");
    assert!(all_page_1, "두 block 모두 page=1");
}
|
||||
|
||||
// Test 10: cancel handle aborts mid-PDF
//
// The cancel flag is already set before the call, so the helper must return
// an error whose message mentions "cancelled mid-PDF".
#[test]
fn cancel_handle_aborts_mid_pdf() {
    let pdf = f1_pdf_bytes();
    let mut doc = canonical_with_empty_block();
    let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
    let ocr = MockOcrEngine::single("IGNORED", false);
    let opts = PdfOcrOpts {
        enabled: true,
        always_on: false,
        valid_ratio_threshold: 0.5,
        min_char_count: 20,
        lang_hint: None,
        cancel: Some(Arc::clone(&cancel)),
    };

    let outcome = apply_ocr_to_pdf_pages(&mut doc, &ocr, &pdf, &opts, |_| {});

    let err = outcome.expect_err("cancel=true 시 error 반환");
    let rendered = format!("{err}");
    assert!(
        rendered.contains("cancelled mid-PDF"),
        "error message 가 'cancelled mid-PDF' 포함: {err}"
    );
}
|
||||
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
//! Integration smoke test: dual-write (ndjson + SQLite) for PDF OCR events.
|
||||
//! AC-3: SQLite row count and doc_id matches ndjson LogEvent::Ocr.
|
||||
//!
|
||||
//! Uses wiremock to stub the Ollama `/api/generate` endpoint so the test
|
||||
//! runs without a live Ollama instance.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_config::LoggingCfg;
|
||||
use serde_json::Value;
|
||||
use tokio::task::spawn_blocking;
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
fn scanned_pdf_src() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
||||
}
|
||||
|
||||
/// AC-3: ndjson OCR line count == pdf_ocr_events row count, and doc_id matches.
|
||||
#[tokio::test]
|
||||
async fn ingest_dual_write_doc_id_matches_ndjson() {
|
||||
let src = scanned_pdf_src();
|
||||
if !src.exists() {
|
||||
eprintln!("skipping test: scanned_page1.pdf fixture not found");
|
||||
return;
|
||||
}
|
||||
|
||||
let server = MockServer::start().await;
|
||||
// Stub Ollama /api/generate to return a minimal OCR response.
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
|
||||
"model": "qwen2.5vl:3b",
|
||||
"response": "test ocr output",
|
||||
"done": true,
|
||||
"done_reason": "stop"
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let mock_url = server.uri();
|
||||
|
||||
let result = spawn_blocking(move || {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
// Enable PDF OCR + set up mock endpoint
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(mock_url.clone());
|
||||
env.config.pdf.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
// Enable ingest log
|
||||
let log_dir = env.temp.path().join("logs");
|
||||
std::fs::create_dir_all(&log_dir).unwrap();
|
||||
env.config.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Copy scanned PDF into workspace
|
||||
let dest = env.workspace_root.join("scanned.pdf");
|
||||
std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF");
|
||||
|
||||
// Run ingest
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
|
||||
|
||||
// Read ndjson log
|
||||
let log_files: Vec<_> = std::fs::read_dir(&log_dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
let name = e.file_name().to_string_lossy().to_string();
|
||||
name.starts_with("ingest-") && name.ends_with(".ndjson")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(log_files.len(), 1, "expected 1 ndjson log file");
|
||||
|
||||
let body = std::fs::read_to_string(log_files[0].path()).unwrap();
|
||||
let ocr_lines: Vec<Value> = body
|
||||
.lines()
|
||||
.filter_map(|l| serde_json::from_str(l).ok())
|
||||
.filter(|v: &Value| v.get("kind").and_then(Value::as_str) == Some("ocr"))
|
||||
.collect();
|
||||
|
||||
// Read pdf_ocr_events from SQLite
|
||||
let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open db");
|
||||
let rows: Vec<(Option<String>, String)> = {
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT doc_id, doc_path FROM pdf_ocr_events ORDER BY id")
|
||||
.expect("prepare");
|
||||
stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?)))
|
||||
.expect("query")
|
||||
.map(|r| r.expect("row"))
|
||||
.collect()
|
||||
};
|
||||
|
||||
(ocr_lines, rows)
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
|
||||
let (ocr_lines, rows) = result;
|
||||
|
||||
// At least one OCR event must be produced
|
||||
assert!(!ocr_lines.is_empty(), "expected ≥1 ndjson ocr line");
|
||||
assert!(!rows.is_empty(), "expected ≥1 pdf_ocr_events row");
|
||||
|
||||
// Row counts must match
|
||||
assert_eq!(
|
||||
ocr_lines.len(),
|
||||
rows.len(),
|
||||
"ndjson ocr lines ({}) must equal pdf_ocr_events rows ({})",
|
||||
ocr_lines.len(),
|
||||
rows.len()
|
||||
);
|
||||
|
||||
// doc_id in both sources must be non-null and consistent
|
||||
for (line, (sql_doc_id, _sql_doc_path)) in ocr_lines.iter().zip(rows.iter()) {
|
||||
let json_doc_id = line.get("doc_id").and_then(Value::as_str);
|
||||
assert!(
|
||||
json_doc_id.is_some(),
|
||||
"ndjson ocr line should have doc_id: {line}"
|
||||
);
|
||||
assert!(
|
||||
sql_doc_id.is_some(),
|
||||
"pdf_ocr_events row should have doc_id"
|
||||
);
|
||||
assert_eq!(
|
||||
json_doc_id,
|
||||
sql_doc_id.as_deref(),
|
||||
"ndjson doc_id must equal SQLite doc_id"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
operations: vec![
|
||||
Operation::new("BT", vec![]),
|
||||
Operation::new("Tf", vec!["F1".into(), 24.into()]),
|
||||
Operation::new(
|
||||
"Td",
|
||||
vec![Object::Integer(100), Object::Integer(700)],
|
||||
),
|
||||
Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
|
||||
Operation::new("Tj", vec![Object::string_literal(*text)]),
|
||||
Operation::new("ET", vec![]),
|
||||
],
|
||||
};
|
||||
let stream_data = content.encode().expect("content encode");
|
||||
let content_id =
|
||||
doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
page_dict.set("Contents", content_id);
|
||||
}
|
||||
let page_id = doc.add_object(page_dict);
|
||||
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
Object::Integer(842),
|
||||
],
|
||||
};
|
||||
doc.objects
|
||||
.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
write_pdf(&env.workspace_root, "three.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
|
||||
assert_eq!(report.errors, 0);
|
||||
let items = report.items.as_ref().expect("items present");
|
||||
@@ -157,23 +151,28 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
.find(|i| i.doc_path.0.ends_with("three.pdf"))
|
||||
.expect("PDF item present");
|
||||
assert_eq!(pdf_item.kind, IngestItemKind::New);
|
||||
assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
|
||||
assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
|
||||
assert_eq!(
|
||||
pdf_item.block_count,
|
||||
Some(3),
|
||||
"one Block::Paragraph per page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(3),
|
||||
"one chunk per non-empty page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
Some("pdf-text-v1")
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
Some("pdf-page-v1")
|
||||
Some("pdf-page-v1.1")
|
||||
);
|
||||
|
||||
// Inspect the stored doc to confirm SourceSpan::Page round-trip.
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for (i, block) in doc.blocks.iter().enumerate() {
|
||||
let want_page = (i as u32) + 1;
|
||||
@@ -202,8 +201,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
write_pdf(&env.workspace_root, "stable.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -214,8 +212,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
.unwrap();
|
||||
assert_eq!(item1.kind, IngestItemKind::New);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item2 = report2
|
||||
.items
|
||||
.unwrap()
|
||||
@@ -239,8 +236,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
std::fs::write(&path, &bytes_v1).unwrap();
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report_v1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let id_v1 = report_v1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -252,12 +248,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
.clone()
|
||||
.unwrap();
|
||||
|
||||
let bytes_v2 =
|
||||
build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
std::fs::write(&path, &bytes_v2).unwrap();
|
||||
|
||||
let report_v2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item_v2 = report_v2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -282,9 +276,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
|
||||
write_pdf(&env.workspace_root, "secret.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"encrypted PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -310,9 +306,11 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"corrupt PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -322,11 +320,8 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
|
||||
// Confirm the doc was NOT stored — list_docs returns nothing for
|
||||
// this path.
|
||||
let summaries = kebab_app::list_docs_with_config(
|
||||
cfg,
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(
|
||||
!summaries
|
||||
.iter()
|
||||
@@ -341,14 +336,15 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
#[test]
|
||||
fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 0,
|
||||
"scanned candidate is a Warning, not Error"
|
||||
);
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -365,14 +361,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(2),
|
||||
"pdf-page-v1 emits 0 chunks for the empty page; total = 2"
|
||||
"pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
|
||||
);
|
||||
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let warnings: Vec<_> = doc
|
||||
.provenance
|
||||
.events
|
||||
@@ -419,8 +411,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
|
||||
write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let total = report.new + report.updated + report.skipped + report.errors;
|
||||
assert_eq!(
|
||||
report.scanned, total,
|
||||
@@ -441,14 +432,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
let pages: Vec<String> = (1..=50)
|
||||
.map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
|
||||
.collect();
|
||||
let page_refs: Vec<Option<&str>> =
|
||||
pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let bytes = build_text_pdf(&page_refs);
|
||||
write_pdf(&env.workspace_root, "long.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0);
|
||||
let pdf_item = report
|
||||
.items
|
||||
@@ -466,8 +455,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
|
||||
// Round-trip: list_docs sees the long PDF.
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
|
||||
}
|
||||
|
||||
@@ -476,13 +464,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
#[test]
|
||||
fn inspect_doc_surfaces_page_spans() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -490,19 +476,12 @@ fn inspect_doc_surfaces_page_spans() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for block in &doc.blocks {
|
||||
match block {
|
||||
Block::Paragraph(p) => assert!(matches!(
|
||||
p.common.source_span,
|
||||
SourceSpan::Page { .. }
|
||||
)),
|
||||
Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
|
||||
other => panic!("expected Paragraph, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
137
crates/kebab-app/tests/reset_orphans.rs
Normal file
137
crates/kebab-app/tests/reset_orphans.rs
Normal file
@@ -0,0 +1,137 @@
|
||||
//! Integration test for `kebab reset --orphans-only`.
|
||||
//!
|
||||
//! Verifies that stored docs outside the current walker scope are purged
|
||||
//! from the store without removing any files from the filesystem.
|
||||
//!
|
||||
//! Test outline:
|
||||
//! 1. Ingest 3 .rs files (a.rs, b.rs, c.rs) — all New.
|
||||
//! 2. Narrow the walker scope to a.rs by adding b.rs and c.rs to the config's
//! `exclude` list; both files stay on disk but fall outside the scope.
|
||||
//! 3. Run `execute(ResetScope::OrphansOnly, &cfg)` — report must show
|
||||
//! `orphans_purged == 2` and `purged_paths` contains b.rs + c.rs.
|
||||
//! 4. `list docs` must show only a.rs.
|
||||
//! 5. b.rs and c.rs must still exist on disk (no filesystem removal).
|
||||
//! 6. Second reset → `orphans_purged == 0` (idempotent).
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::reset::{ResetScope, execute};
|
||||
use kebab_core::{DocFilter, DocumentStore, SourceScope};
|
||||
|
||||
/// Open the SqliteStore and list all `workspace_path` values.
|
||||
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
let store = SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
store
|
||||
.list_documents(&DocFilter::default())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|d| d.doc_path.0)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reset_orphans_only_purges_out_of_scope_docs() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write three .rs files into the workspace.
|
||||
let a_path = env.workspace_root.join("a.rs");
|
||||
let b_path = env.workspace_root.join("b.rs");
|
||||
let c_path = env.workspace_root.join("c.rs");
|
||||
std::fs::write(&a_path, "// file a\nfn alpha() {}\n").unwrap();
|
||||
std::fs::write(&b_path, "// file b\nfn bravo() {}\n").unwrap();
|
||||
std::fs::write(&c_path, "// file c\nfn charlie() {}\n").unwrap();
|
||||
|
||||
// Ingest all three with a wide scope.
|
||||
let wide_scope = SourceScope {
|
||||
root: env.workspace_root.clone(),
|
||||
include: vec!["**/*.rs".to_string()],
|
||||
exclude: env.config.workspace.exclude.clone(),
|
||||
};
|
||||
let first = kebab_app::ingest_with_config_opts(
|
||||
env.config.clone(),
|
||||
wide_scope,
|
||||
false,
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.expect("first ingest must succeed");
|
||||
// The fixture workspace may contain other .rs files — just assert we
|
||||
// got at least 3 new docs (our a.rs, b.rs, c.rs).
|
||||
assert!(first.new >= 3, "expected at least 3 new docs: {first:?}");
|
||||
assert_eq!(first.errors, 0, "no errors on first ingest");
|
||||
|
||||
// Narrow config to include only a.rs; b.rs + c.rs are still on disk.
|
||||
let mut narrow_cfg = env.config.clone();
|
||||
narrow_cfg.workspace.exclude.clear();
|
||||
// Re-point workspace root (already correct) and restrict include via
|
||||
// the SourceScope in the connector. The config's `workspace.root` is
|
||||
// used by `enumerate_orphans` to build its scope — we keep that
|
||||
// pointing at the workspace root. We simulate narrowing by setting a
|
||||
// glob that only matches a.rs.
|
||||
//
|
||||
// NOTE: `kebab_config::WorkspaceCfg` does not have an `include` field
|
||||
// (it was removed in p9-fb-25). We narrow the scope via the walker
|
||||
// exclude list: exclude b.rs and c.rs explicitly.
|
||||
narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];
|
||||
|
||||
// Run orphans-only reset.
|
||||
let report =
|
||||
execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");
|
||||
|
||||
assert_eq!(
|
||||
report.orphans_purged, 2,
|
||||
"expected 2 orphans purged (b.rs + c.rs): {report:?}"
|
||||
);
|
||||
|
||||
let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
|
||||
purged.sort();
|
||||
assert_eq!(
|
||||
purged,
|
||||
vec!["b.rs".to_string(), "c.rs".to_string()],
|
||||
"purged_paths must list b.rs and c.rs in sorted order: {purged:?}"
|
||||
);
|
||||
|
||||
// list docs must show only a.rs (and any pre-existing fixture files
|
||||
// that are not excluded by the narrow config).
|
||||
let doc_paths = list_doc_paths(&env);
|
||||
// The narrow_cfg excludes b.rs + c.rs — they must no longer be in store.
|
||||
assert!(
|
||||
!doc_paths.iter().any(|p| p == "b.rs"),
|
||||
"b.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
|
||||
);
|
||||
assert!(
|
||||
!doc_paths.iter().any(|p| p == "c.rs"),
|
||||
"c.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
|
||||
);
|
||||
assert!(
|
||||
doc_paths.iter().any(|p| p == "a.rs"),
|
||||
"a.rs must still be in store; got: {doc_paths:?}"
|
||||
);
|
||||
|
||||
// Both b.rs and c.rs must still exist on the filesystem — no file
|
||||
// removal is performed by orphans-only.
|
||||
assert!(
|
||||
b_path.exists(),
|
||||
"b.rs must still be on disk after orphans-only reset"
|
||||
);
|
||||
assert!(
|
||||
c_path.exists(),
|
||||
"c.rs must still be on disk after orphans-only reset"
|
||||
);
|
||||
|
||||
// Second reset must be idempotent: nothing left to purge.
|
||||
let second = execute(ResetScope::OrphansOnly, &narrow_cfg)
|
||||
.expect("second orphans-only reset must succeed");
|
||||
assert_eq!(
|
||||
second.orphans_purged, 0,
|
||||
"second reset must be idempotent (orphans_purged == 0): {second:?}"
|
||||
);
|
||||
assert!(
|
||||
second.purged_paths.is_empty(),
|
||||
"second reset purged_paths must be empty: {:?}",
|
||||
second.purged_paths
|
||||
);
|
||||
}
|
||||
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
|
||||
|
||||
use kebab_app::schema_with_config;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
|
||||
SourceScope {
|
||||
root: workspace_root.to_path_buf(),
|
||||
include: vec![],
|
||||
exclude: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_models_active_arrays_empty_on_empty_corpus() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
s.models.active_parsers.is_empty(),
|
||||
"empty corpus → no parsers"
|
||||
);
|
||||
assert!(
|
||||
s.models.active_chunkers.is_empty(),
|
||||
"empty corpus → no chunkers"
|
||||
);
|
||||
// backward compat: 기존 단일 field 는 markdown default 보존.
|
||||
assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
let scope = minimal_scope(&workspace);
|
||||
|
||||
kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
!s.models.active_parsers.is_empty(),
|
||||
"active_parsers populated after ingest"
|
||||
);
|
||||
assert!(
|
||||
!s.models.active_chunkers.is_empty(),
|
||||
"active_chunkers populated after ingest"
|
||||
);
|
||||
// active arrays must be sorted (ORDER BY in SQL).
|
||||
let mut sorted = s.models.active_parsers.clone();
|
||||
sorted.sort();
|
||||
assert_eq!(
|
||||
s.models.active_parsers, sorted,
|
||||
"active_parsers must be sorted"
|
||||
);
|
||||
}
|
||||
@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
|
||||
schema.wire.schemas
|
||||
);
|
||||
assert!(schema.capabilities.json_mode);
|
||||
assert!(!schema.capabilities.streaming_ask);
|
||||
assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
|
||||
assert!(
|
||||
schema.capabilities.mcp_server,
|
||||
"mcp_server should be true after fb-30",
|
||||
|
||||
@@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() {
|
||||
|
||||
assert_eq!(resp.hits.len(), baseline.len());
|
||||
assert!(!resp.truncated);
|
||||
assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page");
|
||||
assert!(
|
||||
resp.next_cursor.is_none(),
|
||||
"k=5 against 1 doc → no next page"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() {
|
||||
fn cursor_paginates_to_next_page() {
|
||||
let env = common::TestEnv::new();
|
||||
for i in 0..6 {
|
||||
common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n"));
|
||||
common::ingest_md(
|
||||
&env,
|
||||
&format!("d{i}.md"),
|
||||
&format!("# T{i}\n\nrust topic {i}\n"),
|
||||
);
|
||||
}
|
||||
let app = env.app();
|
||||
|
||||
@@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() {
|
||||
page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
let p2_ids: std::collections::HashSet<_> =
|
||||
page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits");
|
||||
assert!(
|
||||
p1_ids.is_disjoint(&p2_ids),
|
||||
"page 2 must not repeat page 1 hits"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -46,3 +46,152 @@ fn korean_lexical_query_returns_korean_document() {
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — multi-token Korean query (`해시 충돌`) must hit when
|
||||
/// the lexical builder routes it through a whole-phrase MATCH candidate.
|
||||
///
|
||||
/// Expected: FAIL until A5 (`build_match_string` redesign) lands — the
|
||||
/// current builder emits `"해시" "충돌"` AND, but FTS5 trigram tokenizer
|
||||
/// has no 2-char terms so each side is 0-hit. A5 introduces a whole-
|
||||
/// phrase candidate (`"해시 충돌"`) OR'd with the token AND, restoring
|
||||
/// hits for the dominant Korean usage pattern.
|
||||
#[test]
|
||||
fn lexical_multi_token_korean_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Copy the synthetic Korean fixture (introduced in A4 Step 0) into
|
||||
// the test workspace. The fixture contains the exact phrase
|
||||
// "해시 충돌" multiple times.
|
||||
let dest = env.workspace_root.join("hash-table.md");
|
||||
let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("search")
|
||||
.join("korean")
|
||||
.join("hash-table.md");
|
||||
std::fs::copy(&src, &dest).expect("copy korean fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"multi-token Korean query '해시 충돌' must hit the hash-table fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_hash_table = hits.iter().any(|h| h.doc_path.0.contains("hash-table"));
|
||||
assert!(
|
||||
any_hash_table,
|
||||
"expected at least one hit on the hash-table fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — mixed Korean+English multi-token query (`Rust 충돌은`).
|
||||
/// Both tokens are ≥3 chars, so the redesigned builder (A5) emits
|
||||
/// `("Rust 충돌은") OR ("Rust" AND "충돌은")`. With trigram tokenizer
|
||||
/// each side has substring coverage in the document, so the AND branch
|
||||
/// alone is enough. Expected: FAIL pre-A5, PASS post-A5.
|
||||
#[test]
|
||||
fn lexical_mixed_korean_english_multi_token_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-hash.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 해시 테이블\n\nRust 의 std::collections::HashMap 에서 \
|
||||
해시 충돌은 SipHash 로 완화한다.\n",
|
||||
)
|
||||
.expect("write rust-hash fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"mixed Korean+English multi-token query 'Rust 충돌은' must hit the rust-hash fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_rust_hash = hits.iter().any(|h| h.doc_path.0.contains("rust-hash"));
|
||||
assert!(
|
||||
any_rust_hash,
|
||||
"expected at least one hit on the rust-hash fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한국어 2자 query 가 end-to-end
|
||||
/// lexical 경로에서 hit. lindera ko-dic 이 '한국어를' → '한국어' 형태소로
|
||||
/// 분해, '서울은' → '서울' 로 분해하여 tokenized_korean_text column 에
|
||||
/// 기록 → FTS5 매칭.
|
||||
#[test]
|
||||
fn korean_morphological_2char_query_lexical_mode() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("korean-wiki.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n",
|
||||
)
|
||||
.expect("write korean-wiki fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
|
||||
.expect("search 한국");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
|
||||
.expect("search 서울");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'서울' 2-char Korean query must return at least one hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한-영 혼합 query lexical hit.
|
||||
/// 'Rust' (English whole-token) + '최적화' (Korean morpheme) 각각 hit.
|
||||
#[test]
|
||||
fn korean_morphological_mixed_english_korean_query() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-optimization.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n",
|
||||
)
|
||||
.expect("write rust-optimization fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
|
||||
.expect("search Rust");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'Rust' English whole-token must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
.expect("search 최적화");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'최적화' Korean morpheme must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
|
||||
fn lexical_search_empty_query_returns_empty() {
|
||||
let env = TestEnv::lexical_only();
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" "))
|
||||
.unwrap();
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap();
|
||||
assert!(hits.is_empty(), "blank query must short-circuit empty");
|
||||
}
|
||||
|
||||
@@ -107,20 +107,24 @@ fn search_uncached_returns_same_hits_as_cached() {
|
||||
#[test]
|
||||
fn first_ingest_bumps_corpus_revision() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let store_before =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store_before.run_migrations().unwrap();
|
||||
assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");
|
||||
// V004 seeds 0; V009 + V010 + V011 migrations each bump by 1 to
|
||||
// invalidate stale LRU caches (spec §5.2). Baseline before ingest = 3.
|
||||
// (V012 derivation_cache is purely additive — does NOT bump.)
|
||||
let baseline = store_before.corpus_revision();
|
||||
assert_eq!(baseline, 3, "fresh store post-V011 baseline = 3");
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
|
||||
|
||||
let store_after =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() >= 1,
|
||||
"ingest commit must bump corpus_revision (got {})",
|
||||
report.new + report.updated > 0,
|
||||
"first ingest must commit ≥1 doc"
|
||||
);
|
||||
|
||||
let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() > baseline,
|
||||
"ingest commit must bump corpus_revision past baseline {baseline} (got {})",
|
||||
store_after.corpus_revision(),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"freshly-ingested doc must not be stale at default 30d threshold: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"threshold=0 disables staleness even for year-old docs: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,12 +14,11 @@ use common::TestEnv;
|
||||
fn require_avx_or_panic() {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if !std::is_x86_feature_detected!("avx") {
|
||||
panic!(
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
std::is_x86_feature_detected!("avx"),
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -57,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
|
||||
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
|
||||
|
||||
let report = kebab_app::ingest_with_config(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
let items = report.items.as_ref().expect("items array populated");
|
||||
let docx_item = items
|
||||
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
vec!["unsupported media type: <no-ext>".to_string()],
|
||||
);
|
||||
assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
|
||||
assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
|
||||
assert_eq!(
|
||||
report.skipped_by_extension.get("<no-ext>").copied(),
|
||||
Some(1)
|
||||
);
|
||||
}
|
||||
|
||||
178
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
178
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
//! Regression test for the twin-file fetch_span media-type lookup bug.
|
||||
//!
|
||||
//! Twin files (identical content at different workspace paths) share one
|
||||
//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
|
||||
//! `fetch_span` implementation called
|
||||
//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
|
||||
//! media type was PDF/audio (and therefore reject span fetch). For a twin
|
||||
//! file that lookup could silently return the *other* twin's asset row if
|
||||
//! `assets.workspace_path` had been overwritten on the most recent ingest of
|
||||
//! the sibling — making the media-type branch decision incorrect.
|
||||
//!
|
||||
//! Fix: `fetch_span` now uses the 2-step lookup
|
||||
//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
|
||||
//! so the result is always anchored to the requesting document, not
|
||||
//! whichever twin last updated `assets.workspace_path`.
|
||||
//!
|
||||
//! This test builds a twin-file scenario (two .md files at different paths
|
||||
//! with identical content), ingests both, then calls `fetch_span` on each
|
||||
//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
|
||||
//! row's workspace_path happened to point at the wrong twin the span could
|
||||
//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
|
||||
//! conversely allow span on a PDF twin by accident. After the fix, the
|
||||
//! lookup is always doc-specific.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
|
||||
|
||||
#[test]
|
||||
fn twin_files_fetch_span_uses_correct_asset() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write two markdown files with identical content at different paths.
|
||||
let dir_a = env.workspace_root.join("src_a");
|
||||
let dir_b = env.workspace_root.join("src_b");
|
||||
std::fs::create_dir_all(&dir_a).unwrap();
|
||||
std::fs::create_dir_all(&dir_b).unwrap();
|
||||
|
||||
// The content must produce at least 1 line so span fetch is non-trivial.
|
||||
let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";
|
||||
std::fs::write(dir_a.join("note.md"), content).unwrap();
|
||||
std::fs::write(dir_b.join("note.md"), content).unwrap();
|
||||
|
||||
// Ingest all files (fixture workspace + our two new twins).
|
||||
let report =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
|
||||
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
|
||||
|
||||
// Both twin paths must appear as New in the report.
|
||||
let items = report.items.as_ref().expect("items must be present");
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
2,
|
||||
"exactly 2 twin items expected; items={items:?}"
|
||||
);
|
||||
for item in &twin_items {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::New,
|
||||
"each twin must be New; item={item:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Resolve doc_ids for both workspace paths.
|
||||
// The ingest layer normalises workspace_path to the path relative to
|
||||
// workspace_root (e.g. "src_a/note.md"), so we look up by that form.
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
|
||||
// Find the twin items by matching on suffix so the test is robust to
|
||||
// however the workspace root is represented.
|
||||
let items = report.items.as_ref().expect("items must be present");
|
||||
let path_a_str = items
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("src_a/note.md"))
|
||||
.map(|i| i.doc_path.0.clone())
|
||||
.expect("src_a/note.md must appear in ingest report");
|
||||
let path_b_str = items
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("src_b/note.md"))
|
||||
.map(|i| i.doc_path.0.clone())
|
||||
.expect("src_b/note.md must appear in ingest report");
|
||||
|
||||
let path_a = kebab_core::WorkspacePath(path_a_str);
|
||||
let path_b = kebab_core::WorkspacePath(path_b_str);
|
||||
|
||||
let doc_a = store
|
||||
.get_document_by_workspace_path(&path_a)
|
||||
.expect("get_document_by_workspace_path path_a")
|
||||
.expect("doc_a must exist after ingest");
|
||||
let doc_b = store
|
||||
.get_document_by_workspace_path(&path_b)
|
||||
.expect("get_document_by_workspace_path path_b")
|
||||
.expect("doc_b must exist after ingest");
|
||||
|
||||
// Both twins share one asset_id (same content hash).
|
||||
assert_eq!(
|
||||
doc_a.source_asset_id, doc_b.source_asset_id,
|
||||
"twin files must share one asset_id"
|
||||
);
|
||||
|
||||
// Open App and issue span fetch on each twin's doc_id.
|
||||
let app = env.app();
|
||||
|
||||
let result_a = app
|
||||
.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_a.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 2,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin A must succeed for a markdown file");
|
||||
assert_eq!(result_a.kind, FetchKind::Span);
|
||||
assert!(
|
||||
result_a.text.as_deref().is_some_and(|t| !t.is_empty()),
|
||||
"span text for twin A must not be empty"
|
||||
);
|
||||
|
||||
let result_b = app
|
||||
.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_b.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 2,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin B must succeed for a markdown file");
|
||||
assert_eq!(result_b.kind, FetchKind::Span);
|
||||
assert!(
|
||||
result_b.text.as_deref().is_some_and(|t| !t.is_empty()),
|
||||
"span text for twin B must not be empty"
|
||||
);
|
||||
|
||||
// Ingest again to force the asset.workspace_path flip-flop, then
|
||||
// re-check. Pre-fix this was the scenario that triggered the bug:
|
||||
// after the second ingest the asset row's workspace_path could point
|
||||
// at either twin, making one twin's span fetch behave incorrectly.
|
||||
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(
|
||||
report2.errors, 0,
|
||||
"no ingest errors on second run; report={report2:?}"
|
||||
);
|
||||
|
||||
// Re-open app after second ingest and verify span still works on both.
|
||||
let app2 = env.app();
|
||||
|
||||
app2.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_a.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 3,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin A after flip-flop must still succeed");
|
||||
|
||||
app2.fetch(
|
||||
FetchQuery::Span {
|
||||
doc_id: doc_b.doc_id.clone(),
|
||||
line_start: 1,
|
||||
line_end: 3,
|
||||
},
|
||||
FetchOpts::default(),
|
||||
)
|
||||
.expect("fetch_span on twin B after flip-flop must still succeed");
|
||||
}
|
||||
94
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
94
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
@@ -0,0 +1,94 @@
|
||||
//! Regression test for the twin-file idempotency bug.
|
||||
//!
|
||||
//! Identical-content files at different workspace paths share one
|
||||
//! `assets` row (`asset_id` = blake3 content hash, PRIMARY KEY). The
|
||||
//! old UPSERT `ON CONFLICT(asset_id) DO UPDATE SET workspace_path =
|
||||
//! excluded.workspace_path` made each twin overwrite the other's path
|
||||
//! on every ingest, so `get_asset_by_workspace_path(path1)` returned
|
||||
//! None (or the wrong twin) → re-process every time.
|
||||
//!
|
||||
//! Fix: `try_skip_unchanged` now uses `get_document_by_workspace_path`
|
||||
//! instead. `documents.workspace_path` is UNIQUE (V001) so each twin
|
||||
//! has its own stable document row.
|
||||
//!
|
||||
//! Assertion contract:
|
||||
//! 1st ingest → 2 New (one per twin)
|
||||
//! 2nd ingest → 0 New, 0 Updated, 2 Unchanged
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
#[test]
|
||||
fn twin_files_second_ingest_is_unchanged() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write two files with identical content at different paths.
|
||||
let pkg_a = env.workspace_root.join("pkg_a");
|
||||
let pkg_b = env.workspace_root.join("pkg_b");
|
||||
std::fs::create_dir_all(&pkg_a).unwrap();
|
||||
std::fs::create_dir_all(&pkg_b).unwrap();
|
||||
|
||||
let content = b"# shared\nThis content is identical in both files.\n";
|
||||
std::fs::write(pkg_a.join("__init__.py"), content).unwrap();
|
||||
std::fs::write(pkg_b.join("__init__.py"), content).unwrap();
|
||||
|
||||
// First ingest — both files must be New.
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("first ingest must succeed");
|
||||
assert_eq!(first.errors, 0, "first ingest: no errors; report={first:?}");
|
||||
|
||||
let items = first.items.as_ref().expect("items must be present");
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
2,
|
||||
"first ingest: expected exactly 2 __init__.py items; items={items:?}"
|
||||
);
|
||||
for item in &twin_items {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::New,
|
||||
"first ingest: each twin must be New; item={item:?}"
|
||||
);
|
||||
}
|
||||
|
||||
// Second ingest — same files, same content → both must be Unchanged.
|
||||
let second = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(
|
||||
second.errors, 0,
|
||||
"second ingest: no errors; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.new, 0,
|
||||
"second ingest: no new docs; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
|
||||
);
|
||||
|
||||
let second_items = second.items.as_ref().expect("items must be present");
|
||||
let twin_items2: Vec<_> = second_items
|
||||
.iter()
|
||||
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items2.len(),
|
||||
2,
|
||||
"second ingest: expected exactly 2 __init__.py items; items={second_items:?}"
|
||||
);
|
||||
for item in &twin_items2 {
|
||||
assert_eq!(
|
||||
item.kind,
|
||||
IngestItemKind::Unchanged,
|
||||
"second ingest: each twin must be Unchanged; item={item:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -13,14 +13,21 @@ serde_json_canonicalizer = "0.3"
|
||||
blake3 = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
serde_yaml = { workspace = true }
|
||||
lindera = { workspace = true, features = ["embed-ko-dic"] }
|
||||
lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] }
|
||||
|
||||
[dev-dependencies]
|
||||
# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration
|
||||
# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as
|
||||
# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core
|
||||
# only); `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# kb-parse-md / kb-parse-code are dev-only — used by the snapshot integration
|
||||
# tests to build a CanonicalDocument from fixture files. kb-parse-md absorbed
|
||||
# kb-normalize in v0.19.0 (HOTFIXES.md 2026-05-26). Forbidden as regular deps
|
||||
# per design §8 (chunker consumes CanonicalDocument from kb-core only);
|
||||
# `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# confirms this.
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-code = { path = "../kebab-parse-code" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
376
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
376
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
@@ -0,0 +1,376 @@
|
||||
//! `code-c-ast-v1` — maps a tree-sitter-derived C AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label returned by `chunker_version()` and stamped on every emitted chunk.
const VERSION_LABEL: &str = "code-c-ast-v1";

/// Divisor used for the byte-length based `token_estimate` (rounded up).
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Units spanning more lines than this are split into `[part i/N]` sub-chunks.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Stateless chunker for C documents: one chunk per `Block::Code` unit,
/// with oversize units split at blank-line boundaries.
#[derive(Clone, Copy, Debug, Default)]
pub struct CodeCAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-c-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a C `CanonicalDocument` holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple, in order.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.c".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-c-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);
        let blocks = units
            .iter()
            .enumerate()
            .map(|(i, (sym, ls, le, code))| {
                let span = SourceSpan::Code {
                    line_start: *ls,
                    line_end: *le,
                    symbol: Some((*sym).to_string()),
                    lang: Some("c".into()),
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
                    common: CommonBlock {
                        block_id: bid,
                        heading_path: vec![],
                        source_span: span,
                    },
                    lang: Some("c".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("c".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by all tests in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_c_ast_v1() {
        assert_eq!(
            CodeCAstV1Chunker.chunker_version(),
            ChunkerVersion("code-c-ast-v1".into())
        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        // Declared spans match the line counts of the code text
        // (3 lines -> 1..=3, 4 lines -> 5..=8).
        let doc = code_doc(&[
            ("parse", 1, 3, "int parse() {\n\t// x\n}"),
            ("print", 5, 8, "void print() {\n\t//\n\treturn;\n}"),
        ]);
        let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
        for c in &chunks {
            assert_eq!(c.source_spans.len(), 1);
            assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(c.heading_path, Vec::<String>::new());
            assert_eq!(c.chunker_version.0, "code-c-ast-v1");
        }
        match &chunks[0].source_spans[0] {
            SourceSpan::Code {
                symbol,
                line_start,
                line_end,
                ..
            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
            _ => unreachable!(),
        }
        match &chunks[1].source_spans[0] {
            SourceSpan::Code {
                symbol,
                line_start,
                line_end,
                ..
            } => {
                assert_eq!(symbol.as_deref(), Some("print"));
                assert_eq!((*line_start, *line_end), (5, 8));
            }
            _ => unreachable!(),
        }
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 1 opening line + 500 statements + 1 closing line = 502 lines,
        // matching the declared span 1..=502.
        let body = (0..500)
            .map(|i| format!("\tx{i} = {i};"))
            .collect::<Vec<_>>()
            .join("\n");
        let code = format!("int big() {{\n{body}\n}}");
        assert_eq!(code.split('\n').count(), 502, "fixture line count");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        // Parts must carry part-numbered symbols, respect the line cap and
        // tile the unit's span without gaps or overlap.
        let mut next_line = 1;
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code {
                    symbol,
                    line_start,
                    line_end,
                    ..
                } => {
                    assert!(
                        symbol.as_deref().unwrap().starts_with("big [part "),
                        "part-numbered symbol, got {symbol:?}"
                    );
                    assert_eq!(*line_start, next_line, "parts must be contiguous");
                    assert!(line_end >= line_start, "part span must not be inverted");
                    assert!(
                        line_end - line_start + 1 <= AST_CHUNK_MAX_LINES,
                        "part exceeds line cap"
                    );
                    next_line = line_end + 1;
                }
                _ => unreachable!(),
            }
        }
        assert_eq!(next_line, 503, "parts must cover the whole unit");

        // Joining the part texts must give back the original unit text.
        let rejoined = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join("\n");
        assert_eq!(rejoined, code, "split must be lossless");

        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;
        let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        doc.blocks = vec![Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        })];
        let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        // Single-line unit with a matching single-line span.
        let doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        let base: Vec<String> = CodeCAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        for _ in 0..1000 {
            let again: Vec<String> = CodeCAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        assert_eq!(
            CodeCAstV1Chunker.policy_hash(&p),
            crate::MdHeadingV1Chunker.policy_hash(&p)
        );
    }
}
|
||||
378
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
378
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! `code-cpp-ast-v1` — maps a tree-sitter-derived C++ AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label returned by `chunker_version()` and stamped on every emitted chunk.
const VERSION_LABEL: &str = "code-cpp-ast-v1";

/// Divisor used for the byte-length based `token_estimate` (rounded up).
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Units spanning more lines than this are split into `[part i/N]` sub-chunks.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Stateless chunker for C++ documents: one chunk per `Block::Code` unit,
/// with oversize units split at blank-line boundaries.
#[derive(Clone, Copy, Debug, Default)]
pub struct CodeCppAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCppAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-cpp-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a C++ `CanonicalDocument` holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple, in order.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.cpp".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-cpp-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);
        let blocks = units
            .iter()
            .enumerate()
            .map(|(i, (sym, ls, le, code))| {
                let span = SourceSpan::Code {
                    line_start: *ls,
                    line_end: *le,
                    symbol: Some((*sym).to_string()),
                    lang: Some("cpp".into()),
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
                    common: CommonBlock {
                        block_id: bid,
                        heading_path: vec![],
                        source_span: span,
                    },
                    lang: Some("cpp".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("cpp".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by all tests in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_cpp_ast_v1() {
        assert_eq!(
            CodeCppAstV1Chunker.chunker_version(),
            ChunkerVersion("code-cpp-ast-v1".into())
        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        // Declared spans match the line counts of the code text
        // (3 lines -> 1..=3, 4 lines -> 5..=8).
        let doc = code_doc(&[
            ("parse", 1, 3, "int parse() {\n\t// x\n}"),
            ("print", 5, 8, "void print() {\n\t//\n\treturn;\n}"),
        ]);
        let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
        for c in &chunks {
            assert_eq!(c.source_spans.len(), 1);
            assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(c.heading_path, Vec::<String>::new());
            assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
        }
        match &chunks[0].source_spans[0] {
            SourceSpan::Code {
                symbol,
                line_start,
                line_end,
                ..
            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
            _ => unreachable!(),
        }
        match &chunks[1].source_spans[0] {
            SourceSpan::Code {
                symbol,
                line_start,
                line_end,
                ..
            } => {
                assert_eq!(symbol.as_deref(), Some("print"));
                assert_eq!((*line_start, *line_end), (5, 8));
            }
            _ => unreachable!(),
        }
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 1 opening line + 500 statements + 1 closing line = 502 lines,
        // matching the declared span 1..=502.
        let body = (0..500)
            .map(|i| format!("\tx{i} = {i};"))
            .collect::<Vec<_>>()
            .join("\n");
        let code = format!("int big() {{\n{body}\n}}");
        assert_eq!(code.split('\n').count(), 502, "fixture line count");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        // Parts must carry part-numbered symbols, respect the line cap and
        // tile the unit's span without gaps or overlap.
        let mut next_line = 1;
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code {
                    symbol,
                    line_start,
                    line_end,
                    ..
                } => {
                    assert!(
                        symbol.as_deref().unwrap().starts_with("big [part "),
                        "part-numbered symbol, got {symbol:?}"
                    );
                    assert_eq!(*line_start, next_line, "parts must be contiguous");
                    assert!(line_end >= line_start, "part span must not be inverted");
                    assert!(
                        line_end - line_start + 1 <= AST_CHUNK_MAX_LINES,
                        "part exceeds line cap"
                    );
                    next_line = line_end + 1;
                }
                _ => unreachable!(),
            }
        }
        assert_eq!(next_line, 503, "parts must cover the whole unit");

        // Joining the part texts must give back the original unit text.
        let rejoined = chunks
            .iter()
            .map(|c| c.text.as_str())
            .collect::<Vec<_>>()
            .join("\n");
        assert_eq!(rejoined, code, "split must be lossless");

        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;
        let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        doc.blocks = vec![Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        })];
        let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCppAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        // Single-line unit with a matching single-line span.
        let doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        let base: Vec<String> = CodeCppAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        for _ in 0..1000 {
            let again: Vec<String> = CodeCppAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        assert_eq!(
            CodeCppAstV1Chunker.policy_hash(&p),
            crate::MdHeadingV1Chunker.policy_hash(&p)
        );
    }
}
|
||||
384
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
384
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! `code-go-ast-v1` — maps a tree-sitter-derived Go AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label returned by `chunker_version()` and stamped on every emitted chunk.
const VERSION_LABEL: &str = "code-go-ast-v1";

/// Divisor used for the byte-length based `token_estimate` (rounded up).
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Units spanning more lines than this are split into `[part i/N]` sub-chunks.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Stateless chunker for Go documents: one chunk per `Block::Code` unit,
/// with oversize units split at blank-line boundaries.
#[derive(Clone, Copy, Debug, Default)]
pub struct CodeGoAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeGoAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeGoAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-go-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a Go code document holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.go".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-go-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("go".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("go".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by both metadata fields.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("go".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.into());
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version,
        }
    }

    #[test]
    fn chunker_version_is_code_go_ast_v1() {
        let expected = ChunkerVersion("code-go-ast-v1".into());
        assert_eq!(CodeGoAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "func parse() {\n\t// x\n}"),
            (
                "Foo.double",
                5,
                7,
                "func double() int {\n\t//\n\treturn 0\n}",
            ),
        ]);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-go-ast-v1");
        }

        // The first unit keeps its symbol and line range untouched.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 500 statements plus the signature and closing brace: 502 lines.
        let statements: Vec<String> = (0..500).map(|i| format!("\tx{i} := {i}")).collect();
        let body = statements.join("\n");
        let code = format!("func big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let mut ids: Vec<&str> = Vec::with_capacity(chunks.len());
        for chunk in &chunks {
            ids.push(chunk.chunk_id.0.as_str());
        }
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "func parse() {}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let message = CodeGoAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap_err()
            .to_string();
        assert!(message.contains("CodeGoAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
        let ids_of = |d: &CanonicalDocument| -> Vec<String> {
            CodeGoAstV1Chunker
                .chunk(d, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = ids_of(&doc);
        for _ in 0..1000 {
            assert_eq!(ids_of(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeGoAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
379
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
379
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
@@ -0,0 +1,379 @@
|
||||
//! `code-java-ast-v1` — maps a tree-sitter-derived Java AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label returned by `chunker_version()` for this chunker.
const VERSION_LABEL: &str = "code-java-ast-v1";
|
||||
/// Divisor `make_chunk` applies (rounding up) to a chunk's byte length to
/// obtain its `token_estimate`.
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the blake3 digest kept by
/// `policy_hash()`.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// Units whose span covers more lines than this are split by
/// `split_oversize`, which also uses it as the maximum lines per part.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Field-less chunker type for Java code documents; its behaviour is defined
/// by the `Chunker` impl.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeJavaAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeJavaAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-java-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a Java code document holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/Main.java".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-java-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("java".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("java".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by both metadata fields.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("java".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.into());
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version,
        }
    }

    #[test]
    fn chunker_version_is_code_java_ast_v1() {
        let expected = ChunkerVersion("code-java-ast-v1".into());
        assert_eq!(CodeJavaAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "void parse() {\n\t// x\n}"),
            ("Foo.double", 5, 7, "int double() {\n\t//\n\treturn 0;\n}"),
        ]);
        let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-java-ast-v1");
        }

        // The first unit keeps its symbol and line range untouched.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 500 statements plus the signature and closing brace: 502 lines.
        let statements: Vec<String> = (0..500).map(|i| format!("\tint x{i} = {i};")).collect();
        let body = statements.join("\n");
        let code = format!("void big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let mut ids: Vec<&str> = Vec::with_capacity(chunks.len());
        for chunk in &chunks {
            ids.push(chunk.chunk_id.0.as_str());
        }
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "void parse() {}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let message = CodeJavaAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap_err()
            .to_string();
        assert!(message.contains("CodeJavaAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
        let ids_of = |d: &CanonicalDocument| -> Vec<String> {
            CodeJavaAstV1Chunker
                .chunk(d, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = ids_of(&doc);
        for _ in 0..1000 {
            assert_eq!(ids_of(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeJavaAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
384
crates/kebab-chunk/src/code_js_ast_v1.rs
Normal file
384
crates/kebab-chunk/src/code_js_ast_v1.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! `code-js-ast-v1` — maps a tree-sitter-derived JavaScript AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label returned by `chunker_version()` for this chunker.
const VERSION_LABEL: &str = "code-js-ast-v1";
|
||||
/// Divisor `make_chunk` applies (rounding up) to a chunk's byte length to
/// obtain its `token_estimate`.
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the blake3 digest kept by
/// `policy_hash()`.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// Units whose span covers more lines than this are split by
/// `split_oversize`, which also uses it as the maximum lines per part.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Field-less chunker type for JavaScript code documents; its behaviour is
/// defined by the `Chunker` impl.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeJsAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeJsAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeJsAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-js-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a JavaScript code document holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.js".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-js-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("javascript".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("javascript".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by both metadata fields.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("javascript".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.into());
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version,
        }
    }

    #[test]
    fn chunker_version_is_code_js_ast_v1() {
        let expected = ChunkerVersion("code-js-ast-v1".into());
        assert_eq!(CodeJsAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "function parse() {\n // x\n}"),
            (
                "Foo.double",
                5,
                7,
                "function double() {\n //\n return 0;\n}",
            ),
        ]);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-js-ast-v1");
        }

        // The first unit keeps its symbol and line range untouched.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 500 statements plus the signature and closing brace: 502 lines.
        let statements: Vec<String> = (0..500).map(|i| format!(" const x{i} = {i};")).collect();
        let body = statements.join("\n");
        let code = format!("function big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let mut ids: Vec<&str> = Vec::with_capacity(chunks.len());
        for chunk in &chunks {
            ids.push(chunk.chunk_id.0.as_str());
        }
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "function parse() {}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let message = CodeJsAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap_err()
            .to_string();
        assert!(message.contains("CodeJsAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
        let ids_of = |d: &CanonicalDocument| -> Vec<String> {
            CodeJsAstV1Chunker
                .chunk(d, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = ids_of(&doc);
        for _ in 0..1000 {
            assert_eq!(ids_of(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeJsAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
384
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
384
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! `code-kotlin-ast-v1` — maps a tree-sitter-derived Kotlin AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label this chunker reports from `chunker_version()`.
const VERSION_LABEL: &str = "code-kotlin-ast-v1";
|
||||
/// Divisor of the byte-length based token estimate (`text.len().div_ceil(..)`).
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// Units whose line span exceeds this are split into parts; it is also the
/// target window size `split_oversize` uses for each part.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Field-less chunker for Kotlin code documents; its `Chunker` impl reports
/// the version label `code-kotlin-ast-v1`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeKotlinAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeKotlinAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-kotlin-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a Kotlin code document with one `Block::Code` per entry of
    /// `units`, each given as `(symbol, line_start, line_end, code)`.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/Main.kt".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-kotlin-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(symbol, line_start, line_end, code)) in units.iter().enumerate() {
            let source_span = SourceSpan::Code {
                line_start,
                line_end,
                symbol: Some(symbol.to_string()),
                lang: Some("kotlin".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &source_span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span,
                },
                lang: Some("kotlin".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by both date fields.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("kotlin".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.into());
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version,
        }
    }

    #[test]
    fn chunker_version_is_code_kotlin_ast_v1() {
        let expected = ChunkerVersion("code-kotlin-ast-v1".into());
        let actual = CodeKotlinAstV1Chunker.chunker_version();
        assert_eq!(actual, expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units = [
            ("parse", 1, 3, "fun parse() {\n\t// x\n}"),
            (
                "Foo.double",
                5,
                7,
                "fun double(): Int {\n\t//\n\treturn 0\n}",
            ),
        ];
        let doc = code_doc(&units);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-kotlin-ast-v1");
        }

        // The first chunk must carry the first unit's symbol and line range.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 502 lines: the opening line, 500 statements, the closing brace.
        let mut code = String::from("fun big() {\n");
        for i in 0..500 {
            code.push_str(&format!("\tval x{i} = {i}\n"));
        }
        code.push('}');

        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        let mut doc = code_doc(&[("parse", 1, 1, "fun parse() {}")]);
        doc.blocks = vec![paragraph];

        let failure = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(failure.to_string().contains("CodeKotlinAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
        // Re-runs the chunker from scratch and returns the ids in output order.
        let chunk_ids = || -> Vec<String> {
            let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
            let mut ids = Vec::new();
            for chunk in chunks {
                ids.push(chunk.chunk_id.0);
            }
            ids
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            let again = chunk_ids();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let from_code_chunker = CodeKotlinAstV1Chunker.policy_hash(&p);
        let from_md_chunker = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(from_code_chunker, from_md_chunker);
    }
}
|
||||
379
crates/kebab-chunk/src/code_python_ast_v1.rs
Normal file
379
crates/kebab-chunk/src/code_python_ast_v1.rs
Normal file
@@ -0,0 +1,379 @@
|
||||
//! `code-python-ast-v1` — maps a tree-sitter-derived Python AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label this chunker reports from `chunker_version()`.
const VERSION_LABEL: &str = "code-python-ast-v1";
|
||||
/// Divisor of the byte-length based token estimate (`text.len().div_ceil(..)`).
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// Units whose line span exceeds this are split into parts; it is also the
/// target window size `split_oversize` uses for each part.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Field-less chunker for Python code documents; its `Chunker` impl reports
/// the version label `code-python-ast-v1`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodePythonAstV1Chunker;
|
||||
|
||||
impl Chunker for CodePythonAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodePythonAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodePythonAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-python-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a Python code document with one `Block::Code` per entry of
    /// `units`, each given as `(symbol, line_start, line_end, code)`.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.py".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-python-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(symbol, line_start, line_end, code)) in units.iter().enumerate() {
            let source_span = SourceSpan::Code {
                line_start,
                line_end,
                symbol: Some(symbol.to_string()),
                lang: Some("python".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &source_span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span,
                },
                lang: Some("python".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by both date fields.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("python".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.into());
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version,
        }
    }

    #[test]
    fn chunker_version_is_code_python_ast_v1() {
        let expected = ChunkerVersion("code-python-ast-v1".into());
        let actual = CodePythonAstV1Chunker.chunker_version();
        assert_eq!(actual, expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units = [
            ("parse", 1, 3, "def parse():\n pass\n # x"),
            ("Foo.double", 5, 7, "def double():\n #\n pass"),
        ];
        let doc = code_doc(&units);
        let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-python-ast-v1");
        }

        // The first chunk must carry the first unit's symbol and line range.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // The `def` line followed by 500 statements, each newline-terminated.
        let mut code = String::from("def big():\n");
        for i in 0..500 {
            code.push_str(&format!(" x{i} = {i}\n"));
        }

        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        let mut doc = code_doc(&[("parse", 1, 1, "def parse(): pass")]);
        doc.blocks = vec![paragraph];

        let failure = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(failure.to_string().contains("CodePythonAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
        // Re-runs the chunker from scratch and returns the ids in output order.
        let chunk_ids = || -> Vec<String> {
            let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
            let mut ids = Vec::new();
            for chunk in chunks {
                ids.push(chunk.chunk_id.0);
            }
            ids
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            let again = chunk_ids();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let from_code_chunker = CodePythonAstV1Chunker.policy_hash(&p);
        let from_md_chunker = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(from_code_chunker, from_md_chunker);
    }
}
|
||||
379
crates/kebab-chunk/src/code_rust_ast_v1.rs
Normal file
379
crates/kebab-chunk/src/code_rust_ast_v1.rs
Normal file
@@ -0,0 +1,379 @@
|
||||
//! `code-rust-ast-v1` — maps a tree-sitter-derived Rust AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label this chunker reports from `chunker_version()`.
const VERSION_LABEL: &str = "code-rust-ast-v1";
|
||||
/// Divisor of the byte-length based token estimate (`text.len().div_ceil(..)`).
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// Units whose line span exceeds this are split into parts; it is also the
/// target window size `split_oversize` uses for each part.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Field-less chunker for Rust code documents; its `Chunker` impl reports
/// the version label `code-rust-ast-v1`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeRustAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeRustAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeRustAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeRustAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-rust-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a Rust code document holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple, in the given order.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.rs".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-rust-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();

        let blocks = units
            .iter()
            .enumerate()
            .map(|(ordinal, &(symbol, line_start, line_end, code))| {
                let source_span = SourceSpan::Code {
                    line_start,
                    line_end,
                    symbol: Some(symbol.to_string()),
                    lang: Some("rust".into()),
                };
                let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &source_span);
                Block::Code(CodeBlock {
                    common: CommonBlock {
                        block_id,
                        heading_path: vec![],
                        source_span,
                    },
                    lang: Some("rust".into()),
                    code: code.to_string(),
                })
            })
            .collect();

        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("rust".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    /// Runs the chunker over `doc` and returns only the chunk ids.
    fn chunk_ids(doc: &CanonicalDocument) -> Vec<String> {
        CodeRustAstV1Chunker
            .chunk(doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect()
    }

    #[test]
    fn chunker_version_is_code_rust_ast_v1() {
        let expected = ChunkerVersion("code-rust-ast-v1".into());
        assert_eq!(CodeRustAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "pub fn parse() {}\n// x\n}"),
            ("Foo::double", 5, 7, "fn double() {}\n//\n}"),
        ]);
        let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-rust-ast-v1");
        }

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let body = (0..500)
            .map(|i| format!(" let x{i} = {i};"))
            .collect::<Vec<_>>()
            .join("\n");
        let code = format!("pub fn big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        // A set drops duplicates, so equal sizes mean every id is distinct.
        let distinct: std::collections::BTreeSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "fn parse(){}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeRustAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
        let base = chunk_ids(&doc);
        for _ in 0..1000 {
            assert_eq!(chunk_ids(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeRustAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,170 @@
|
||||
//! p10-3: Tier 3 paragraph + line-window fallback chunker.
|
||||
//!
|
||||
//! Splits code/text files on blank-line paragraph boundaries. Paragraphs
|
||||
//! with more than 80 lines are further split into 80-line windows with a
|
||||
//! 20-line overlap (stride 60) — the same oversize pattern used by Tier 1/2
|
||||
//! chunkers but without AST structure, hence no symbol.
|
||||
//!
|
||||
//! Per spec §9.3: all emitted chunks carry `symbol: None`.
|
||||
|
||||
use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "code-text-paragraph-v1";
|
||||
|
||||
/// Lines-per-window for the oversize fallback (Tier 3).
|
||||
const FALLBACK_LINES_PER_CHUNK: usize = 80;
|
||||
/// Overlap between consecutive windows.
|
||||
const FALLBACK_LINES_OVERLAP: usize = 20;
|
||||
// stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP = 60.
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeTextParagraphV1Chunker;
|
||||
|
||||
impl Chunker for CodeTextParagraphV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full source text.
|
||||
let (text, lang_str) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let mut chunks = Vec::new();
|
||||
for para in split_paragraphs(text) {
|
||||
push_paragraph(&mut chunks, doc, policy, ¶, lang_str)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"code-text-paragraph-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
/// A maximal run of consecutive non-blank lines taken from the source text.
struct Paragraph {
    /// The run's lines joined with `\n`; there is no trailing newline.
    text: String,
    /// 1-based line number of the run's first line.
    line_start: u32,
    /// 1-based line number of the run's last line.
    line_end: u32,
}

/// Breaks `text` into [`Paragraph`]s at blank lines.
///
/// A line is blank when it is empty after trimming whitespace. Blank lines
/// only act as separators: they never appear in a paragraph's text or line
/// range, and a stretch of blank lines yields no paragraph at all.
fn split_paragraphs(text: &str) -> Vec<Paragraph> {
    /// Turns the pending run, if any, into a `Paragraph` and resets it.
    fn finish(run: &mut Vec<&str>, first_line: u32, into: &mut Vec<Paragraph>) {
        if run.is_empty() {
            return;
        }
        into.push(Paragraph {
            text: run.join("\n"),
            line_start: first_line,
            line_end: first_line + run.len() as u32 - 1,
        });
        run.clear();
    }

    let mut found: Vec<Paragraph> = Vec::new();
    let mut run: Vec<&str> = Vec::new();
    // Only meaningful while `run` is non-empty.
    let mut first_line: u32 = 0;

    for (idx, line) in text.lines().enumerate() {
        if line.trim().is_empty() {
            finish(&mut run, first_line, &mut found);
            continue;
        }
        if run.is_empty() {
            first_line = (idx + 1) as u32;
        }
        run.push(line);
    }

    // The text may end without a blank line after the last paragraph.
    finish(&mut run, first_line, &mut found);
    found
}
|
||||
|
||||
/// Emit one or more chunks for a single paragraph.
|
||||
///
|
||||
/// Paragraphs with ≤ `FALLBACK_LINES_PER_CHUNK` lines become a single chunk.
|
||||
/// Larger paragraphs are split into overlapping windows of
|
||||
/// `FALLBACK_LINES_PER_CHUNK` lines with stride `FALLBACK_LINES_PER_CHUNK -
|
||||
/// FALLBACK_LINES_OVERLAP`. The last window may be shorter. Window starts
|
||||
/// are passed as `split_key` so `id_for_chunk` can produce distinct ids
|
||||
/// across windows.
|
||||
fn push_paragraph(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
para: &Paragraph,
|
||||
lang: &str,
|
||||
) -> Result<()> {
|
||||
let n_lines = (para.line_end - para.line_start + 1) as usize;
|
||||
|
||||
if n_lines <= FALLBACK_LINES_PER_CHUNK {
|
||||
// Use line_start as split_key so each paragraph gets a distinct
|
||||
// chunk_id even when block_ids is empty (no symbol, no AST structure).
|
||||
// Without this, all short paragraphs from the same doc share the same
|
||||
// base_policy_hash and therefore the same id_for_chunk result.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
¶.text,
|
||||
para.line_start,
|
||||
para.line_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(para.line_start),
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Oversize: line-window split with overlap.
|
||||
let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP;
|
||||
let lines: Vec<&str> = para.text.lines().collect();
|
||||
let mut i = 0usize;
|
||||
loop {
|
||||
let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len());
|
||||
let window_text = lines[i..end].join("\n");
|
||||
let window_start = para.line_start + i as u32;
|
||||
let window_end = para.line_start + (end as u32) - 1;
|
||||
// Use window_start as split_key so chunk_ids are unique across windows.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(window_start),
|
||||
));
|
||||
if end == lines.len() {
|
||||
break;
|
||||
}
|
||||
i += stride;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
384
crates/kebab-chunk/src/code_ts_ast_v1.rs
Normal file
384
crates/kebab-chunk/src/code_ts_ast_v1.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! `code-ts-ast-v1` — maps a tree-sitter-derived TypeScript AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-ts-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeTsAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeTsAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeTsAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-ts-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a TypeScript code document holding one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple, in the given order.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.ts".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-ts-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();

        let blocks = units
            .iter()
            .enumerate()
            .map(|(ordinal, &(symbol, line_start, line_end, code))| {
                let source_span = SourceSpan::Code {
                    line_start,
                    line_end,
                    symbol: Some(symbol.to_string()),
                    lang: Some("typescript".into()),
                };
                let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &source_span);
                Block::Code(CodeBlock {
                    common: CommonBlock {
                        block_id,
                        heading_path: vec![],
                        source_span,
                    },
                    lang: Some("typescript".into()),
                    code: code.to_string(),
                })
            })
            .collect();

        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("typescript".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    /// Runs the chunker over `doc` and returns only the chunk ids.
    fn chunk_ids(doc: &CanonicalDocument) -> Vec<String> {
        CodeTsAstV1Chunker
            .chunk(doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect()
    }

    #[test]
    fn chunker_version_is_code_ts_ast_v1() {
        let expected = ChunkerVersion("code-ts-ast-v1".into());
        assert_eq!(CodeTsAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "function parse(): void {\n // x\n}"),
            (
                "Foo.double",
                5,
                7,
                "function double(): number {\n //\n return 0;\n}",
            ),
        ]);
        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-ts-ast-v1");
        }

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let body = (0..500)
            .map(|i| format!(" const x{i} = {i};"))
            .collect::<Vec<_>>()
            .join("\n");
        let code = format!("function big(): void {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        // A set drops duplicates, so equal sizes mean every id is distinct.
        let distinct: std::collections::BTreeSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "function parse(): void {}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeTsAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
        let base = chunk_ids(&doc);
        for _ in 0..1000 {
            assert_eq!(chunk_ids(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeTsAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
//! p10-2: dockerfile whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire Dockerfile content and emits a single Chunk with symbol
|
||||
//! "<dockerfile>", code_lang "dockerfile", line range 1..EOF.
|
||||
//! Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "dockerfile-file-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct DockerfileFileV1Chunker;
|
||||
|
||||
impl Chunker for DockerfileFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full Dockerfile text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<dockerfile>",
|
||||
"dockerfile",
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"dockerfile-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
@@ -0,0 +1,162 @@
|
||||
//! p10-2: k8s manifest resource-aware chunker.
|
||||
//!
|
||||
//! Splits a multi-document YAML file on `^---\s*$` boundaries, recognises
|
||||
//! documents that have both `apiVersion` and `kind` string fields as k8s
|
||||
//! resources, and emits one `Chunk` per resource (with oversize >200-line
|
||||
//! fallback). Non-k8s documents are skipped; invalid YAML yields 0 chunks
|
||||
//! for the entire file.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct K8sManifestResourceV1Chunker;
|
||||
|
||||
impl Chunker for K8sManifestResourceV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full YAML text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let slices = split_yaml_documents(text);
|
||||
let mut chunks: Vec<Chunk> = Vec::new();
|
||||
|
||||
for slice in slices {
|
||||
// Invalid YAML in any document → return 0 chunks for the file.
|
||||
let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) {
|
||||
Ok(v) => v,
|
||||
Err(_) => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let Some(mapping) = value.as_mapping() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let api = mapping
|
||||
.get("apiVersion")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||
|
||||
// Skip non-k8s documents.
|
||||
if api.is_empty() || kind.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
|
||||
let name = metadata
|
||||
.and_then(|m| m.get("name"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("<unnamed>");
|
||||
let namespace = metadata
|
||||
.and_then(|m| m.get("namespace"))
|
||||
.and_then(|v| v.as_str());
|
||||
|
||||
let symbol = match namespace {
|
||||
Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"),
|
||||
_ => format!("{kind}/{name}"),
|
||||
};
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
slice.text,
|
||||
slice.line_start,
|
||||
slice.line_end,
|
||||
&symbol,
|
||||
"yaml",
|
||||
VERSION_LABEL,
|
||||
Some(slice.line_start),
|
||||
)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"k8s-manifest-resource-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
struct YamlSlice<'a> {
|
||||
text: &'a str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
}
|
||||
|
||||
/// Split raw YAML text into per-document slices on `---` separator lines.
|
||||
/// Line numbers are 1-indexed.
|
||||
fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
|
||||
// Collect indices of separator lines (0-based), then append a sentinel at
|
||||
// the end so the last slice is always terminated.
|
||||
let mut separators: Vec<usize> = lines
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, l)| {
|
||||
let trimmed = l.trim_end();
|
||||
if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") {
|
||||
Some(i)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
separators.push(lines.len());
|
||||
|
||||
let mut slices: Vec<YamlSlice<'_>> = Vec::new();
|
||||
let mut doc_start_line: usize = 0; // 0-based index of current doc start
|
||||
|
||||
for sep_line in separators {
|
||||
if sep_line > doc_start_line {
|
||||
let start_byte = byte_offset_of_line(text, doc_start_line);
|
||||
let end_byte = byte_offset_of_line(text, sep_line);
|
||||
let slice_text = &text[start_byte..end_byte];
|
||||
if !slice_text.trim().is_empty() {
|
||||
slices.push(YamlSlice {
|
||||
text: slice_text,
|
||||
line_start: (doc_start_line + 1) as u32,
|
||||
line_end: sep_line as u32,
|
||||
});
|
||||
}
|
||||
}
|
||||
doc_start_line = sep_line + 1;
|
||||
}
|
||||
|
||||
slices
|
||||
}
|
||||
|
||||
/// Byte offset at which line `line_idx` (0-based) begins in `text`.
///
/// Line 0 begins at offset 0; line `k` begins right after the `k`-th `\n`.
/// If `text` has fewer than `line_idx` newlines, `text.len()` is returned.
fn byte_offset_of_line(text: &str, line_idx: usize) -> usize {
    match line_idx {
        0 => 0,
        nth => text
            .match_indices('\n')
            .nth(nth - 1)
            .map_or(text.len(), |(newline_at, _)| newline_at + 1),
    }
}
|
||||
@@ -15,8 +15,107 @@
|
||||
//! embedder, the retriever, the LLM, the RAG layer, or the UI layers.
|
||||
//! It consumes `CanonicalDocument` purely through `kb-core` types.
|
||||
|
||||
mod code_c_ast_v1;
|
||||
mod code_cpp_ast_v1;
|
||||
mod code_go_ast_v1;
|
||||
mod code_java_ast_v1;
|
||||
mod code_js_ast_v1;
|
||||
mod code_kotlin_ast_v1;
|
||||
mod code_python_ast_v1;
|
||||
mod code_rust_ast_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
mod code_ts_ast_v1;
|
||||
pub mod dockerfile_file_v1;
|
||||
pub mod k8s_manifest_resource_v1;
|
||||
pub mod manifest_file_v1;
|
||||
mod md_heading_v1;
|
||||
mod pdf_page_v1;
|
||||
mod tier2_shared;
|
||||
|
||||
pub use code_c_ast_v1::CodeCAstV1Chunker;
|
||||
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
|
||||
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
||||
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
||||
pub use code_js_ast_v1::CodeJsAstV1Chunker;
|
||||
pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
|
||||
pub use code_python_ast_v1::CodePythonAstV1Chunker;
|
||||
pub use code_rust_ast_v1::CodeRustAstV1Chunker;
|
||||
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
|
||||
pub use code_ts_ast_v1::CodeTsAstV1Chunker;
|
||||
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||
pub use manifest_file_v1::ManifestFileV1Chunker;
|
||||
pub use md_heading_v1::MdHeadingV1Chunker;
|
||||
pub use pdf_page_v1::PdfPageV1Chunker;
|
||||
|
||||
// ── Korean morphological tokenizer ───────────────────────────────────────────
|
||||
|
||||
use lindera::dictionary::{DictionaryKind, load_embedded_dictionary};
|
||||
use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
use lindera::tokenizer::Tokenizer;
|
||||
|
||||
static KOREAN_TOKENIZER: std::sync::OnceLock<Option<Tokenizer>> = std::sync::OnceLock::new();
|
||||
|
||||
/// Reports whether a single code point is a Hangul syllable or jamo.
/// Used to filter which tokens receive the supplementary n-grams.
fn is_hangul(c: char) -> bool {
    // Precomposed Hangul syllables.
    let syllable = ('\u{AC00}'..='\u{D7A3}').contains(&c);
    // Hangul jamo.
    let jamo = ('\u{1100}'..='\u{11FF}').contains(&c);
    // Hangul compatibility jamo.
    let compat_jamo = ('\u{3130}'..='\u{318F}').contains(&c);

    syllable || jamo || compat_jamo
}
|
||||
|
||||
/// 한국어 chunk text 를 lindera ko-dic 으로 형태소 분해해 공백 join 한 결과를 반환.
|
||||
/// chunker 들이 `Chunk.tokenized_korean_text` pre-fill 에 사용.
|
||||
/// 분석 실패 시 None — 호출자는 NULL fallback 처리.
|
||||
/// Tokenizer 는 OnceLock 으로 1회 초기화; dict load 실패 시 영구 None.
|
||||
///
|
||||
/// v0.21.0 — N-gram supplement (Option β, post-v0.20.1 enhancement).
|
||||
/// ko-dic 가 compound noun (`한국정부`, `서울특별시` 등) 을 단일 token 으로
|
||||
/// 저장하는 정책 의 한계 해소 — morpheme 길이 ≥ 3 인 한글 token 에 대해
|
||||
/// 2-char sliding window n-gram 도 추가 emit. `'한국정부'` morpheme →
|
||||
/// `[한국정부, 한국, 국정, 정부]` 의 4 token 으로 expand. 사용자 의 2-char
|
||||
/// query (`'한국'`) 가 compound chunk 에서도 hit. 영어/숫자 token 은 영향
|
||||
/// 없음 (is_hangul filter). DB size + ingest latency 의 trade-off 는
|
||||
/// HOTFIXES 2026-05-28 의 "N-gram supplement (Option β)" 보강 entry.
|
||||
pub fn tokenize_korean_morphological(text: &str) -> Option<String> {
|
||||
if text.trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
let tokenizer = KOREAN_TOKENIZER.get_or_init(|| {
|
||||
let dict = match load_embedded_dictionary(DictionaryKind::KoDic) {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: dict load failed: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let segmenter = Segmenter::new(Mode::Normal, dict, None);
|
||||
Some(Tokenizer::new(segmenter))
|
||||
});
|
||||
let tokenizer = tokenizer.as_ref()?;
|
||||
let tokens = tokenizer.tokenize(text).ok()?;
|
||||
|
||||
let mut out_tokens: Vec<String> = Vec::with_capacity(tokens.len() * 2);
|
||||
for tok in tokens.iter() {
|
||||
let surface = tok.surface.as_ref();
|
||||
out_tokens.push(surface.to_string());
|
||||
|
||||
// N-gram supplement: 한글 morpheme 의 2-char sliding window.
|
||||
let chars: Vec<char> = surface.chars().collect();
|
||||
if chars.len() >= 3 && chars.iter().all(|c| is_hangul(*c)) {
|
||||
for window in chars.windows(2) {
|
||||
out_tokens.push(window.iter().collect());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let joined = out_tokens.join(" ");
|
||||
if joined.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(joined)
|
||||
}
|
||||
}
|
||||
|
||||
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
//! p10-2: manifest whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire manifest file (Cargo.toml / package.json / pom.xml / go.mod /
|
||||
//! build.gradle / pyproject.toml / tsconfig.json) and emits a single Chunk
|
||||
//! with symbol "<manifest>", code_lang read from Block::Code.lang, line range
|
||||
//! 1..EOF. Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "manifest-file-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct ManifestFileV1Chunker;
|
||||
|
||||
impl Chunker for ManifestFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full manifest text.
|
||||
let (text, lang) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<manifest>",
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"manifest-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
//! `md-heading-v1` — heading-aware Markdown chunker.
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
|
||||
ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
|
||||
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
let policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
// `collect_overlap_seed` keeps seed ≤ target/2, so
|
||||
// a flush here never produces a chunk smaller than
|
||||
// the seed budget.
|
||||
let would_exceed = acc.text_tokens + next_tokens
|
||||
> policy.target_tokens
|
||||
let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
|
||||
&& acc.has_non_heading_content();
|
||||
if would_exceed {
|
||||
let overlap_seed = collect_overlap_seed(
|
||||
&acc,
|
||||
policy.overlap_tokens,
|
||||
policy.target_tokens,
|
||||
);
|
||||
flush(
|
||||
&mut acc,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&policy_hash,
|
||||
&mut out,
|
||||
);
|
||||
let overlap_seed =
|
||||
collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
|
||||
flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
|
||||
// Seed next accumulator with the prior chunk's
|
||||
// tail blocks (paragraph-level overlap). The
|
||||
// heading is *not* re-included here — it lives
|
||||
@@ -292,10 +278,11 @@ fn build_chunk(
|
||||
) -> Chunk {
|
||||
debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
|
||||
|
||||
let block_ids: Vec<BlockId> =
|
||||
blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> =
|
||||
blocks.iter().map(|b| common(b).source_span.clone()).collect();
|
||||
let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> = blocks
|
||||
.iter()
|
||||
.map(|b| common(b).source_span.clone())
|
||||
.collect();
|
||||
|
||||
// heading_path: pick the first non-Heading block's heading_path
|
||||
// (which already includes every parent heading per kb-normalize).
|
||||
@@ -339,23 +326,20 @@ fn build_chunk(
|
||||
text.len().div_ceil(BYTES_PER_TOKEN)
|
||||
};
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&doc.doc_id,
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
policy_hash,
|
||||
);
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path,
|
||||
source_spans,
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -387,9 +371,7 @@ fn render_block_text(b: &Block) -> String {
|
||||
// alt keeps lexical search hits on filenames working even when
|
||||
// P6-1's filename auto-fill is bypassed.
|
||||
Block::ImageRef(i) => {
|
||||
let alt = if !i.alt.is_empty() {
|
||||
i.alt.clone()
|
||||
} else {
|
||||
let alt = if i.alt.is_empty() {
|
||||
// P6-1 falls back to filename so this branch is
|
||||
// defensive — keep it lest a future test fixture or
|
||||
// synthetic block path skip the auto-fill.
|
||||
@@ -399,17 +381,11 @@ fn render_block_text(b: &Block) -> String {
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or("[image]")
|
||||
.to_string()
|
||||
} else {
|
||||
i.alt.clone()
|
||||
};
|
||||
let ocr = i
|
||||
.ocr
|
||||
.as_ref()
|
||||
.map(|o| o.joined.as_str())
|
||||
.unwrap_or("");
|
||||
let cap = i
|
||||
.caption
|
||||
.as_ref()
|
||||
.map(|c| c.text.as_str())
|
||||
.unwrap_or("");
|
||||
let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
|
||||
let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
|
||||
[alt.as_str(), ocr, cap]
|
||||
.iter()
|
||||
.filter(|s| !s.is_empty())
|
||||
@@ -449,9 +425,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
|
||||
Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
|
||||
WorkspacePath, id_for_block,
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
|
||||
SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -472,6 +447,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: kebab_core::ParserVersion("test-parser-0".into()),
|
||||
@@ -490,12 +469,7 @@ mod tests {
|
||||
SourceSpan::Line { start, end }
|
||||
}
|
||||
|
||||
fn common_for(
|
||||
kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> CommonBlock {
|
||||
fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
|
||||
CommonBlock {
|
||||
block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
|
||||
heading_path: heading_path.to_vec(),
|
||||
@@ -530,12 +504,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Paragraph(TextBlock {
|
||||
common: common_for("paragraph", &hp, ordinal, span(line, line)),
|
||||
@@ -544,12 +513,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn code_block(
|
||||
code: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> Block {
|
||||
fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Code(CodeBlock {
|
||||
common: common_for("code", &hp, ordinal, s),
|
||||
@@ -576,12 +540,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn image_ref(
|
||||
alt: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::ImageRef(ImageRefBlock {
|
||||
common: common_for("imageref", &hp, ordinal, span(line, line)),
|
||||
|
||||
@@ -53,18 +53,21 @@
|
||||
//! one chunk per atomic block. PdfPageV1 cannot.
|
||||
//!
|
||||
//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
|
||||
//! variant `format!("{base_policy_hash}#c{char_start}")` into the
|
||||
//! recipe's `policy_hash` slot (so distinct chunks distinguish via
|
||||
//! different policy_hash inputs), while storing the unmodified
|
||||
//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
|
||||
//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
|
||||
//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
|
||||
//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
|
||||
//! segment boundary, strictly increasing across the returned chunks
|
||||
//! even when the overlap walk collapses `actual_start` to a previous
|
||||
//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in
|
||||
//! `Chunk.policy_hash` so the field still answers "what policy was
|
||||
//! active". v1.1 second-iteration patch — logged in
|
||||
//! `tasks/HOTFIXES.md` (2026-05-27).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "pdf-page-v1";
|
||||
const VERSION_LABEL: &str = "pdf-page-v1.1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
// Validate up front — every block must be a Paragraph carrying
|
||||
// SourceSpan::Page. A mixed document signals a routing bug in
|
||||
// the caller (e.g. running this chunker on Markdown) and is
|
||||
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
|
||||
),
|
||||
};
|
||||
if !matches!(common.source_span, SourceSpan::Page { .. }) {
|
||||
anyhow::bail!(
|
||||
"PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
|
||||
);
|
||||
anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let target_bytes = policy
|
||||
.target_tokens
|
||||
.saturating_mul(BYTES_PER_TOKEN)
|
||||
.max(1);
|
||||
let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
|
||||
// Clamp the overlap to half the target. Without this, a policy
|
||||
// with `overlap_tokens >= target_tokens` would make every chunk
|
||||
// fully re-emit the previous chunk's text — mirrors
|
||||
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (char_start, char_end, slice) in
|
||||
for (segment_start, char_start, char_end, slice) in
|
||||
chunk_page(&p.text, target_bytes, overlap_bytes)
|
||||
{
|
||||
// PDF chars-per-page comfortably fits in u32 (a single
|
||||
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
|
||||
// typography); silent `as u32` truncation would only
|
||||
// surface on corrupted input, where an explicit panic
|
||||
// is preferable to an off-by-2^32 span.
|
||||
let char_start_u32 = u32::try_from(char_start)
|
||||
.expect("page chars fit in u32");
|
||||
let char_end_u32 =
|
||||
u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
|
||||
let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let span = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(char_start_u32),
|
||||
char_end: Some(char_end_u32),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
|
||||
// Per-chunk policy_hash variant prevents chunk_id
|
||||
// collision when a page produces multiple chunks. See
|
||||
// module docs for rationale.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
|
||||
// v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
|
||||
// variant uses `segment_start` (pre-overlap boundary,
|
||||
// strictly increasing) instead of `char_start` (post-
|
||||
// overlap, may collapse to prev_min). See module docs +
|
||||
// spec §4.1 root cause + HOTFIXES.md 2026-05-27.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
|
||||
let chunk_id =
|
||||
id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
|
||||
let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
|
||||
@@ -176,12 +170,14 @@ impl Chunker for PdfPageV1Chunker {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&slice),
|
||||
text: slice,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.clone(),
|
||||
aliases: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -198,18 +194,28 @@ impl Chunker for PdfPageV1Chunker {
|
||||
}
|
||||
|
||||
/// Split a single page's text into ordered chunks, each represented as
|
||||
/// `(char_start, char_end, text_slice)`. Char positions are within the
|
||||
/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
|
||||
/// `(segment_start, actual_start, chunk_end, text_slice)`.
|
||||
///
|
||||
/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
|
||||
/// across the returned vec. Use this for chunk_id uniqueness suffixes.
|
||||
/// - `actual_start` = post-overlap start char index. May collapse to a
|
||||
/// previous chunk's `actual_start` under aggressive overlap policy.
|
||||
/// Use this for `SourceSpan::Page::char_start`.
|
||||
/// - `chunk_end` = chunk's end char index (exclusive).
|
||||
///
|
||||
/// Returns an empty vector when `text` is empty or whitespace-only.
|
||||
fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
|
||||
fn chunk_page(
|
||||
text: &str,
|
||||
target_bytes: usize,
|
||||
overlap_bytes: usize,
|
||||
) -> Vec<(usize, usize, usize, String)> {
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
let n = chars.len();
|
||||
if n == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
if text.len() <= target_bytes {
|
||||
return vec![(0, n, text.to_string())];
|
||||
return vec![(0, 0, n, text.to_string())];
|
||||
}
|
||||
|
||||
// Build candidate boundary positions (char indices where a chunk
|
||||
@@ -222,8 +228,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
let c = chars[k];
|
||||
let nx = chars[k + 1];
|
||||
let is_paragraph_break = c == '\n' && nx == '\n';
|
||||
let is_sentence_end =
|
||||
matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
|
||||
bounds.push(k + 2);
|
||||
}
|
||||
@@ -235,11 +240,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
bounds.dedup();
|
||||
|
||||
// UTF-8 byte length of the slice between two char indices.
|
||||
let byte_len = |a: usize, b: usize| -> usize {
|
||||
chars[a..b].iter().map(|c| c.len_utf8()).sum()
|
||||
};
|
||||
let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };
|
||||
|
||||
let mut chunks: Vec<(usize, usize, String)> = Vec::new();
|
||||
let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
|
||||
let mut seg_idx: usize = 0;
|
||||
while seg_idx + 1 < bounds.len() {
|
||||
let start = bounds[seg_idx];
|
||||
@@ -264,7 +267,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
// have absorbed up to `overlap_bytes` of bytes, but never past
|
||||
// the previous chunk's start (no full re-emission).
|
||||
let actual_start = if let Some(prev) = chunks.last() {
|
||||
let prev_min = prev.0;
|
||||
// prev tuple shape = (segment_start, actual_start, chunk_end, slice).
|
||||
// overlap walk floor = previous chunk's actual_start (prev.1).
|
||||
let prev_min = prev.1;
|
||||
let mut a = start;
|
||||
let mut acc_o: usize = 0;
|
||||
while a > prev_min {
|
||||
@@ -281,7 +286,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
};
|
||||
|
||||
let slice: String = chars[actual_start..chunk_end].iter().collect();
|
||||
chunks.push((actual_start, chunk_end, slice));
|
||||
chunks.push((start, actual_start, chunk_end, slice));
|
||||
seg_idx = end_idx;
|
||||
}
|
||||
|
||||
@@ -347,6 +352,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
@@ -386,7 +395,11 @@ mod tests {
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
match c.source_spans[0] {
|
||||
SourceSpan::Page { page, char_start, char_end } => {
|
||||
SourceSpan::Page {
|
||||
page,
|
||||
char_start,
|
||||
char_end,
|
||||
} => {
|
||||
assert_eq!(page, (i as u32) + 1);
|
||||
assert_eq!(char_start, Some(0));
|
||||
assert!(char_end.unwrap() > 0);
|
||||
@@ -431,11 +444,16 @@ mod tests {
|
||||
// N-1's char_end).
|
||||
for w in chunks.windows(2) {
|
||||
let prev_end = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_end: Some(e), .. } => e,
|
||||
SourceSpan::Page {
|
||||
char_end: Some(e), ..
|
||||
} => e,
|
||||
_ => panic!("missing char_end"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -446,7 +464,7 @@ mod tests {
|
||||
// chunk_ids stay distinct despite identical block_ids — the
|
||||
// per-chunk policy_hash variant is doing its job.
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "all chunk_ids must be unique");
|
||||
@@ -512,6 +530,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
@@ -645,11 +667,17 @@ mod tests {
|
||||
// overlap) is the failure mode.
|
||||
for w in chunks.windows(2) {
|
||||
let prev_start = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -660,12 +688,49 @@ mod tests {
|
||||
// chunk_ids stay distinct (the per-chunk hash variant keys off
|
||||
// char_start which is now strictly increasing).
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "chunk_ids must remain unique");
|
||||
}
|
||||
|
||||
#[test]
fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
    // Trigger shape taken from Korean OCR text: 10 x "가" + ". " + 500 x "나".
    //   -> first segment [0, 12), second segment [12, n).
    // Page byte length = 10*3 + 2 + 500*3 = 1532 > target_bytes = 1500,
    // so the page is split into several chunks.
    //
    // default_policy(500, 80): target_tokens = 500 -> target_bytes = 500*3 = 1500
    // (Korean is 3 bytes per char); overlap_tokens = 80 ->
    // overlap_bytes = min(240, 750) = 240, i.e. 80 chars. The overlap walk
    // pulls the second chunk's actual_start back to prev_min = 0, which would
    // give both chunks the same `#c0` suffix if ids were keyed on it.
    // Added for verifier round 1, item L-3.
    let head = "가".repeat(10);
    let body = "나".repeat(500);
    let page_text = format!("{head}. {body}");

    let doc = make_pdf_doc(&[&page_text]);
    // target = 1500 bytes, overlap = 240 bytes
    let policy = default_policy(500, 80);
    let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();

    assert!(
        chunks.len() >= 2,
        "expected ≥2 chunks for {} byte page; got {}",
        page_text.len(),
        chunks.len()
    );

    // Collapsing the ids into a set drops duplicates; sizes must match.
    let distinct: std::collections::BTreeSet<&str> =
        chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
    assert_eq!(
        distinct.len(),
        chunks.len(),
        "all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
    );
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1_for_identical_policy() {
|
||||
// Cross-chunker policy fingerprint identity — important so a
|
||||
|
||||
201
crates/kebab-chunk/src/tier2_shared.rs
Normal file
201
crates/kebab-chunk/src/tier2_shared.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build).
|
||||
//!
|
||||
//! Mirrors `code_rust_ast_v1`'s Chunk-construction pattern exactly so that
|
||||
//! id / hashes / token-count / ChunkPolicy semantics stay identical across
|
||||
//! Tier 1 (AST) and Tier 2 (resource-aware) chunkers.
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{
|
||||
BlockId, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, DocumentId, SourceSpan,
|
||||
id_for_chunk,
|
||||
};
|
||||
|
||||
pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
/// Compute the policy hash the same way `code_rust_ast_v1` does.
|
||||
pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting
|
||||
/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize.
|
||||
/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl.
|
||||
///
|
||||
/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk
|
||||
/// case. Callers that emit multiple chunks from the same document (e.g.
|
||||
/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass
|
||||
/// `Some(line_start)` so that each call produces a distinct `chunk_id`.
|
||||
/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to
|
||||
/// keep chunk_ids stable (no sibling can collide when there's only one chunk).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn push_chunks_with_oversize(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
base_split_key: Option<u32>,
|
||||
) -> Result<()> {
|
||||
let n_lines = (line_end - line_start + 1).max(1);
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
|
||||
if n_lines <= AST_CHUNK_MAX_LINES {
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
text,
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
base_split_key,
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
let total = lines.len();
|
||||
let mut window_start = line_start;
|
||||
let mut i = 0usize;
|
||||
while i < total {
|
||||
let take = (AST_CHUNK_MAX_LINES as usize).min(total - i);
|
||||
let window_text = lines[i..i + take].join("\n");
|
||||
let window_end = window_start + take as u32 - 1;
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
symbol,
|
||||
lang,
|
||||
Some(window_start),
|
||||
));
|
||||
i += take;
|
||||
window_start = window_end + 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a single `Chunk`, mirroring `make_chunk` in `code_rust_ast_v1.rs`
|
||||
/// exactly (same id recipe, same token estimate, same field set).
|
||||
///
|
||||
/// `split_key` is `Some(line_start_of_window)` for oversize splits, `None`
|
||||
/// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None`
|
||||
/// split_key pattern in 1A-2.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: Some(symbol.to_string()),
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(
|
||||
doc,
|
||||
chunker_version,
|
||||
base_policy_hash,
|
||||
text,
|
||||
span,
|
||||
split_key,
|
||||
)
|
||||
}
|
||||
|
||||
/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).
|
||||
///
|
||||
/// Accepts `policy: &ChunkPolicy` and `chunker_version: &str` (string slice)
|
||||
/// so callers don't need to pre-compute the hash and version wrapper.
|
||||
/// `split_key` is `Some(window_start)` for oversize line-window splits.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk_no_symbol(
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: None,
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(doc, &cv, &base_policy_hash, text, span, split_key)
|
||||
}
|
||||
|
||||
/// Core chunk-building logic shared by `build_chunk` and `build_chunk_no_symbol`.
|
||||
///
|
||||
/// Takes a pre-built `SourceSpan` so the only difference between the two
|
||||
/// public helpers is whether `symbol` is `Some` or `None`. All id/hash/
|
||||
/// token mechanics are identical.
|
||||
fn build_chunk_from_span(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
span: SourceSpan,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
// id_hash mirrors code_rust_ast_v1's make_chunk logic:
|
||||
// split_key Some(k) => "{base_policy_hash}#L{k}"
|
||||
// split_key None => base_policy_hash
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
|
||||
// block_ids: Tier 2/3 chunkers have no per-block structure (the whole file
|
||||
// is one Block::Code), so we pass an empty slice — same as using the doc-
|
||||
// level slice without explicit block granularity.
|
||||
let block_ids: Vec<BlockId> = vec![];
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&DocumentId(doc.doc_id.0.clone()),
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
&id_hash,
|
||||
);
|
||||
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(text),
|
||||
text: text.to_string(),
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
aliases: None,
|
||||
}
|
||||
}
|
||||
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_go_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.c".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-c-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units:
|
||||
// 0. imports + defines (lines 1–4, ≤200)
|
||||
// 1. status_t enum typedef (lines 6–9, ≤200)
|
||||
// 2. record_t struct typedef (lines 11–16, ≤200)
|
||||
// 3. static counter decl glue (line 18, ≤200)
|
||||
// 4. parse_record fn (lines 20–23, ≤200)
|
||||
// 5. print_record fn (lines 25–27, ≤200)
|
||||
// 6. main fn (lines 29–33, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
18,
|
||||
"#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;".to_string(),
|
||||
),
|
||||
(
|
||||
"parse_record",
|
||||
20,
|
||||
23,
|
||||
"int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"print_record",
|
||||
25,
|
||||
27,
|
||||
"void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
29,
|
||||
33,
|
||||
"int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("c".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("c".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.c".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-c-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Pins the JSON form of the chunks `CodeCAstV1Chunker` produces for
/// `fixed_doc()` against `tests/fixtures/code-sample.c.chunks.snapshot.json`.
///
/// With `UPDATE_SNAPSHOTS` set, the baseline is (re)written whenever it is
/// missing, unreadable, not valid JSON, or different from the current
/// output, and the test passes. Without it, a missing or unparsable
/// baseline, or any drift, fails the test.
#[test]
fn code_c_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodeCAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.c.chunks.snapshot.json");
    let baseline = std::fs::read_to_string(&baseline_path);

    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
        // Re-bake mode: a corrupt baseline (e.g. one containing merge-conflict
        // markers) has to be replaceable too, so a read or parse failure
        // counts as "stale" here instead of aborting the run.
        let up_to_date = baseline
            .as_deref()
            .ok()
            .and_then(|text| serde_json::from_str::<Value>(text).ok())
            .is_some_and(|expected| expected == actual);
        if !up_to_date {
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            eprintln!("updated baseline {}", baseline_path.display());
        }
        return;
    }

    let baseline_text = baseline.unwrap_or_else(|e| {
        panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        )
    });
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual != expected {
        panic!(
            "code-c-ast-v1 chunks snapshot drift\n\
             --- expected ({}) ---\n{baseline_text}\n\
             --- actual ---\n{pretty}\n\
             If intentional, re-run with UPDATE_SNAPSHOTS=1.",
            baseline_path.display()
        );
    }
}
|
||||
|
||||
/// Determinism cross-check: chunking a freshly built `fixed_doc()` again and
/// again must reproduce the very same chunk_ids.
#[test]
fn code_c_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Rebuilds the document and chunks it from scratch on every call.
    let chunk_ids = || -> Vec<String> {
        CodeCAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        assert_eq!(chunk_ids(), baseline);
    }
}
|
||||
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
@@ -0,0 +1,354 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C++ code `CanonicalDocument`.
|
||||
//!
|
||||
//! Two complementary tests:
|
||||
//! 1. `code_cpp_ast_chunks_snapshot` — hand-built `fixed_doc()` validates the
|
||||
//! chunker's 1:1 mapping (design §6.3 / §8 boundary: no parse-code dep needed).
|
||||
//! 2. `code_cpp_ast_extractor_snapshot` — invokes `CppAstExtractor` against the
|
||||
//! real `tests/fixtures/sample.cpp` fixture, validating the extractor → chunker
|
||||
//! end-to-end pipeline. `kebab-parse-code` is a dev-dep (same pattern as
|
||||
//! `kebab-parse-md` in Markdown snapshot tests).
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCppAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use kebab_parse_code::CppAstExtractor;
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.cpp".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units (C++ specific):
|
||||
// 0. includes + namespace opening (lines 1–4, ≤200)
|
||||
// 1. class definition (lines 6–20, ≤200)
|
||||
// 2. template function (lines 22–25, ≤200)
|
||||
// 3. namespace closing + free fn (lines 27–29, ≤200)
|
||||
// 4. main fn (lines 31–34, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
4,
|
||||
"#include <string>\n#include <vector>\n\nnamespace kebab {".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::chunk::MdHeadingV1Chunker",
|
||||
6,
|
||||
20,
|
||||
"class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::identity",
|
||||
22,
|
||||
25,
|
||||
"template <typename T>\nT identity(T value) {\n return value;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::global_helper",
|
||||
27,
|
||||
29,
|
||||
"void global_helper() {\n // free function in kebab namespace\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
31,
|
||||
34,
|
||||
"int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.cpp".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-cpp-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper: run the real CppAstExtractor against tests/fixtures/sample.cpp
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn extract_cpp_fixture() -> CanonicalDocument {
|
||||
use kebab_core::{
|
||||
AssetId, AssetStorage, Checksum, ExtractConfig, ExtractContext, Extractor, RawAsset,
|
||||
SourceUri, WorkspacePath,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
|
||||
let bytes = std::fs::read(fixtures_dir().join("sample.cpp")).expect("read sample.cpp fixture");
|
||||
let src = String::from_utf8(bytes).expect("fixture is valid UTF-8");
|
||||
let wp = WorkspacePath("tests/fixtures/sample.cpp".to_string());
|
||||
let asset = RawAsset {
|
||||
asset_id: AssetId("e".repeat(64)),
|
||||
source_uri: SourceUri::File(PathBuf::from("tests/fixtures/sample.cpp")),
|
||||
workspace_path: wp,
|
||||
media_type: kebab_core::MediaType::Code("cpp".to_string()),
|
||||
byte_len: src.len() as u64,
|
||||
checksum: Checksum("f".repeat(64)),
|
||||
discovered_at: time::OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("tests/fixtures/sample.cpp"),
|
||||
sha: Checksum("f".repeat(64)),
|
||||
},
|
||||
};
|
||||
let cfg = ExtractConfig::default();
|
||||
let root = PathBuf::from("/tmp");
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root: &root,
|
||||
config: &cfg,
|
||||
};
|
||||
CppAstExtractor::new()
|
||||
.extract(&ctx, src.as_bytes())
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1 (hand-built): chunker-only 1:1 mapping validation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Pins the JSON form of the chunks `CodeCppAstV1Chunker` produces for
/// `fixed_doc()` against `tests/fixtures/code-sample.cpp.chunks.snapshot.json`.
///
/// With `UPDATE_SNAPSHOTS` set, the baseline is (re)written whenever it is
/// missing, unreadable, not valid JSON, or different from the current
/// output, and the test passes. Without it, a missing or unparsable
/// baseline, or any drift, fails the test.
#[test]
fn code_cpp_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json");
    let baseline = std::fs::read_to_string(&baseline_path);

    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
        // Re-bake mode: a corrupt baseline (e.g. one containing merge-conflict
        // markers) has to be replaceable too, so a read or parse failure
        // counts as "stale" here instead of aborting the run.
        let up_to_date = baseline
            .as_deref()
            .ok()
            .and_then(|text| serde_json::from_str::<Value>(text).ok())
            .is_some_and(|expected| expected == actual);
        if !up_to_date {
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            eprintln!("updated baseline {}", baseline_path.display());
        }
        return;
    }

    let baseline_text = baseline.unwrap_or_else(|e| {
        panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        )
    });
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual != expected {
        panic!(
            "code-cpp-ast-v1 chunks snapshot drift\n\
             --- expected ({}) ---\n{baseline_text}\n\
             --- actual ---\n{pretty}\n\
             If intentional, re-run with UPDATE_SNAPSHOTS=1.",
            baseline_path.display()
        );
    }
}
|
||||
|
||||
/// Determinism cross-check: chunking a freshly built `fixed_doc()` again and
/// again must reproduce the very same chunk_ids.
#[test]
fn code_cpp_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Rebuilds the document and chunks it from scratch on every call.
    let chunk_ids = || -> Vec<String> {
        CodeCppAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        assert_eq!(chunk_ids(), baseline);
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2 (real extractor): end-to-end extractor → chunker pipeline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Validates that the real `CppAstExtractor` processes `sample.cpp` and
/// emits a `Block::Code` unit for every expected symbol.
///
/// Only the extractor's blocks are inspected here; this test does not run a
/// chunker. Expected symbols:
/// - `kebab::chunk::MdHeadingV1Chunker` plus its ctor, dtor, `chunk_doc`
///   and `operator()` members
/// - `kebab::chunk::identity` (template fn)
/// - `kebab::global_helper` (free fn in the outer namespace)
/// - `main` (global free fn)
#[test]
fn code_cpp_ast_extractor_snapshot() {
    // (fully qualified symbol, label used in the failure message)
    const EXPECTED: [(&str, &str); 8] = [
        ("kebab::chunk::MdHeadingV1Chunker", "class"),
        ("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker", "ctor"),
        ("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker", "dtor"),
        ("kebab::chunk::MdHeadingV1Chunker::chunk_doc", "chunk_doc"),
        ("kebab::chunk::MdHeadingV1Chunker::operator()", "operator()"),
        ("kebab::chunk::identity", "identity template fn"),
        ("kebab::global_helper", "global_helper"),
        ("main", "main"),
    ];

    let doc = extract_cpp_fixture();

    // Symbol of every code block that carries a code span (None entries are
    // kept so the failure message shows the full picture).
    let block_syms: Vec<Option<String>> = doc
        .blocks
        .iter()
        .filter_map(|block| match block {
            Block::Code(CodeBlock {
                common:
                    CommonBlock {
                        source_span: SourceSpan::Code { symbol, .. },
                        ..
                    },
                ..
            }) => Some(symbol.clone()),
            _ => None,
        })
        .collect();

    for (symbol, label) in EXPECTED {
        assert!(
            block_syms.iter().any(|s| s.as_deref() == Some(symbol)),
            "{label} unit missing: {block_syms:?}"
        );
    }
}
|
||||
|
||||
/// Two independent extractor runs over the fixture must yield equal blocks,
/// and chunking each result must yield the same chunk_id sequence.
#[test]
fn code_cpp_ast_extractor_chunks_deterministic() {
    let first = extract_cpp_fixture();
    let second = extract_cpp_fixture();
    assert_eq!(
        first.blocks, second.blocks,
        "extractor output non-deterministic"
    );

    let policy = fixed_policy();
    let chunk_ids = |doc: &CanonicalDocument| -> Vec<String> {
        CodeCppAstV1Chunker
            .chunk(doc, &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };
    let first_ids = chunk_ids(&first);
    let second_ids = chunk_ids(&second);
    assert_eq!(first_ids, second_ids, "chunker output non-deterministic");
}
|
||||
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Go code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeGoAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("kebab_eval/metrics.go".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-go-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "func BigCompute(data []int) int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!("\tv{i} := 0\n\tif {i} < len(data) {{\n\t\tv{i} = data[{i}]\n\t}}\n"))
|
||||
.collect();
|
||||
let footer = "\treturn len(data)\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `ComputeMRR` (lines 7–12, ≤200)
|
||||
// 2. struct `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. struct `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `Run` (lines 32–38, ≤200)
|
||||
// 5. method `Report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"ComputeMRR",
|
||||
7,
|
||||
12,
|
||||
"func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Run",
|
||||
32,
|
||||
38,
|
||||
"func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Report",
|
||||
40,
|
||||
46,
|
||||
"func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("go".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("go".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "metrics.go".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-go-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Pins the JSON form of the chunks `CodeGoAstV1Chunker` produces for
/// `fixed_doc()` against `tests/fixtures/code-sample.go.chunks.snapshot.json`.
///
/// With `UPDATE_SNAPSHOTS` set, the baseline is (re)written whenever it is
/// missing, unreadable, not valid JSON, or different from the current
/// output, and the test passes. Without it, a missing or unparsable
/// baseline, or any drift, fails the test.
#[test]
fn code_go_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.go.chunks.snapshot.json");
    let baseline = std::fs::read_to_string(&baseline_path);

    if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
        // Re-bake mode: a corrupt baseline (e.g. one containing merge-conflict
        // markers) has to be replaceable too, so a read or parse failure
        // counts as "stale" here instead of aborting the run.
        let up_to_date = baseline
            .as_deref()
            .ok()
            .and_then(|text| serde_json::from_str::<Value>(text).ok())
            .is_some_and(|expected| expected == actual);
        if !up_to_date {
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            eprintln!("updated baseline {}", baseline_path.display());
        }
        return;
    }

    let baseline_text = baseline.unwrap_or_else(|e| {
        panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        )
    });
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual != expected {
        panic!(
            "code-go-ast-v1 chunks snapshot drift\n\
             --- expected ({}) ---\n{baseline_text}\n\
             --- actual ---\n{pretty}\n\
             If intentional, re-run with UPDATE_SNAPSHOTS=1.",
            baseline_path.display()
        );
    }
}
|
||||
|
||||
/// Determinism cross-check: chunking a freshly built `fixed_doc()` again and
/// again must reproduce the very same chunk_ids.
#[test]
fn code_go_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Rebuilds the document and chunks it from scratch on every call.
    let chunk_ids = || -> Vec<String> {
        CodeGoAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        assert_eq!(chunk_ids(), baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Java code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJavaAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/java/com/example/Metrics.java".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-java-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line method body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "public class BigCompute {\n public int compute(int[] data) {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" int v{i} = {i} < data.length ? data[{i}] : 0;\n"))
|
||||
.collect();
|
||||
let footer = " return data.length;\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free method `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import java.util.List;\nimport java.util.Map;\nimport java.util.ArrayList;\nimport java.util.HashMap;\nimport java.util.stream.Collectors;".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"public static double computeMRR(List<Double> scores) {\n if (scores.isEmpty()) {\n return 0.0;\n }\n return 1.0 / scores.size();\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"public class MetricsCollector {\n private List<Double> scores;\n private List<String> labels;\n private Map<String, Integer> counts;\n private Map<String, Double> totals;\n private List<String> tags;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"public class BaseEvaluator {\n private String name;\n\n public BaseEvaluator(String name) {\n this.name = name;\n }\n\n public void evaluate(List<String> data) throws Exception {\n String joined = String.join(\",\", data);\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"public void run(List<Double> inputs) {\n for (Double inp : inputs) {\n scores.add(\n inp\n );\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"public Map<String, Object> report() {\n Map<String, Object> result = new HashMap<>();\n result.put(\"mean\", 0.0);\n result.put(\"count\", scores.size());\n result.put(\"tags\", tags);\n return result;\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("java".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.java".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-java-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` with the
/// checked-in baseline `code-sample.java.chunks.snapshot.json`.
///
/// When `UPDATE_SNAPSHOTS` is set, a missing or differing baseline is (re)written
/// and the test returns; otherwise either condition panics.
#[test]
fn code_java_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();
    let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Only the presence of the variable matters, not its value.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-printed form of `actual`, rendered only on the paths that need it.
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.java.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = render_actual();
    panic!(
        "code-java-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: one reference run plus five repeats, each on a
/// freshly built `fixed_doc()`, must produce identical `chunk_id` sequences.
#[test]
fn code_java_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Runs the full pipeline once and keeps only the chunk ids, in order.
    let chunk_ids = || -> Vec<String> {
        CodeJavaAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_js_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_js_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative JavaScript code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/bar.js".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-js-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "function bigTransform(items) {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" const v{i} = items[{i}] !== undefined ? items[{i}] : null;\n"))
|
||||
.collect();
|
||||
let footer = " return items;\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. require/import block (lines 1–5, ≤200)
|
||||
// 1. free fn `add` (lines 7–12, ≤200)
|
||||
// 2. class `EventBus` (lines 14–20, ≤200)
|
||||
// 3. class `BaseHandler` (lines 22–30, ≤200)
|
||||
// 4. method `EventBus.emit` (lines 32–38, ≤200)
|
||||
// 5. method `EventBus.on` (lines 40–46, ≤200)
|
||||
// 6. bigTransform (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"requires",
|
||||
1,
|
||||
5,
|
||||
"const fs = require('fs');\nconst path = require('path');\nconst { EventEmitter } = require('events');\nconst assert = require('assert');\nconst crypto = require('crypto');".to_string(),
|
||||
),
|
||||
(
|
||||
"add",
|
||||
7,
|
||||
12,
|
||||
"export function add(a, b) {\n if (typeof a !== 'number') throw new TypeError('a');\n if (typeof b !== 'number') throw new TypeError('b');\n const result = a + b;\n assert(isFinite(result));\n return result;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus",
|
||||
14,
|
||||
20,
|
||||
"class EventBus {\n constructor() {\n this._handlers = new Map();\n this._history = [];\n this._maxHistory = 100;\n this._seq = 0;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseHandler",
|
||||
22,
|
||||
30,
|
||||
"class BaseHandler {\n handle(event) {\n throw new Error('not implemented');\n }\n batchHandle(events) {\n const results = [];\n for (const ev of events) {\n results.push(this.handle(ev));\n }\n return results;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus.emit",
|
||||
32,
|
||||
38,
|
||||
"class EventBus {\n emit(name, payload) {\n const handlers = this._handlers.get(name) ?? [];\n for (const h of handlers) {\n h(payload);\n }\n return this;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus.on",
|
||||
40,
|
||||
46,
|
||||
"class EventBus {\n on(name, handler) {\n if (!this._handlers.has(name)) {\n this._handlers.set(name, []);\n }\n this._handlers.get(name).push(handler);\n return this;\n }\n}".to_string(),
|
||||
),
|
||||
("bigTransform", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("javascript".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("javascript".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "bar.js".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-js-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` with the
/// checked-in baseline `code-sample.js.chunks.snapshot.json`.
///
/// When `UPDATE_SNAPSHOTS` is set, a missing or differing baseline is (re)written
/// and the test returns; otherwise either condition panics.
#[test]
fn code_js_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();
    let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Only the presence of the variable matters, not its value.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-printed form of `actual`, rendered only on the paths that need it.
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.js.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = render_actual();
    panic!(
        "code-js-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: one reference run plus five repeats, each on a
/// freshly built `fixed_doc()`, must produce identical `chunk_id` sequences.
#[test]
fn code_js_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Runs the full pipeline once and keeps only the chunk ids, in order.
    let chunk_ids = || -> Vec<String> {
        CodeJsAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Kotlin code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeKotlinAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/kotlin/com/example/Metrics.kt".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-kotlin-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "class BigCompute {\n fun compute(data: IntArray): Int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" val v{i} = if ({i} < data.size) data[{i}] else 0\n"))
|
||||
.collect();
|
||||
let footer = " return data.size\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. top-level fn `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. data class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import kotlin.collections.List\nimport kotlin.collections.Map\nimport kotlin.collections.MutableList\nimport kotlin.collections.MutableMap\nimport kotlin.collections.mutableListOf".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"fun computeMRR(scores: List<Double>): Double {\n if (scores.isEmpty()) {\n return 0.0\n }\n return 1.0 / scores.size\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"data class MetricsCollector(\n val scores: MutableList<Double> = mutableListOf(),\n val labels: MutableList<String> = mutableListOf(),\n val counts: MutableMap<String, Int> = mutableMapOf(),\n val totals: MutableMap<String, Double> = mutableMapOf(),\n val tags: MutableList<String> = mutableListOf(),\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"open class BaseEvaluator(val name: String) {\n\n fun evaluate(data: List<String>) {\n val joined = data.joinToString(\",\")\n println(joined)\n }\n\n open fun describe(): String = name\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"fun MetricsCollector.run(inputs: List<Double>) {\n for (inp in inputs) {\n scores.add(\n inp\n )\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"fun MetricsCollector.report(): Map<String, Any> {\n return mapOf(\n \"mean\" to 0.0,\n \"count\" to scores.size,\n \"tags\" to tags,\n )\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("kotlin".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("kotlin".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.kt".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-kotlin-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` with the
/// checked-in baseline `code-sample.kt.chunks.snapshot.json`.
///
/// When `UPDATE_SNAPSHOTS` is set, a missing or differing baseline is (re)written
/// and the test returns; otherwise either condition panics.
#[test]
fn code_kotlin_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();
    let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Only the presence of the variable matters, not its value.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-printed form of `actual`, rendered only on the paths that need it.
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.kt.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = render_actual();
    panic!(
        "code-kotlin-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: one reference run plus five repeats, each on a
/// freshly built `fixed_doc()`, must produce identical `chunk_id` sequences.
#[test]
fn code_kotlin_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Runs the full pipeline once and keeps only the chunk ids, in order.
    let chunk_ids = || -> Vec<String> {
        CodeKotlinAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_python_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_python_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Python code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodePythonAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("kebab_eval/metrics.py".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-python-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "def big_compute(data):\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" v{i} = data[{i}] if {i} < len(data) else 0\n"))
|
||||
.collect();
|
||||
let footer = " return sum(data)";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `compute_mrr` (lines 7–12, ≤200)
|
||||
// 2. class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `run` (lines 32–38, ≤200)
|
||||
// 5. method `report` (lines 40–46, ≤200)
|
||||
// 6. big_compute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import os\nimport sys\nfrom typing import List\nfrom pathlib import Path\nfrom collections import defaultdict".to_string(),
|
||||
),
|
||||
(
|
||||
"compute_mrr",
|
||||
7,
|
||||
12,
|
||||
"def compute_mrr(scores):\n if not scores:\n return 0.0\n return sum(\n 1.0 / r for r in scores\n ) / len(scores)".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"class MetricsCollector:\n def __init__(self):\n self.scores = []\n self.labels = []\n self.counts = defaultdict(int)\n self.totals = defaultdict(float)\n self.tags = []".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"class BaseEvaluator:\n def evaluate(self, data):\n raise NotImplementedError\n def batch_evaluate(self, items):\n results = []\n for item in items:\n results.append(self.evaluate(item))\n return results\n def name(self):\n return type(self).__name__".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"class MetricsCollector:\n def run(self, inputs):\n for inp in inputs:\n score = self._score(inp)\n self.scores.append(\n score\n )".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"class MetricsCollector:\n def report(self):\n return {\n 'mean': sum(self.scores) / max(len(self.scores), 1),\n 'count': len(self.scores),\n 'tags': self.tags,\n }".to_string(),
|
||||
),
|
||||
("big_compute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("python".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("python".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "metrics.py".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-python-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` with the
/// checked-in baseline `code-sample.py.chunks.snapshot.json`.
///
/// When `UPDATE_SNAPSHOTS` is set, a missing or differing baseline is (re)written
/// and the test returns; otherwise either condition panics.
#[test]
fn code_python_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();
    let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Only the presence of the variable matters, not its value.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-printed form of `actual`, rendered only on the paths that need it.
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.py.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        std::fs::write(&baseline_path, format!("{}\n", render_actual())).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = render_actual();
    panic!(
        "code-python-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: one reference run plus five repeats, each on a
/// freshly built `fixed_doc()`, must produce identical `chunk_id` sequences.
#[test]
fn code_python_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Runs the full pipeline once and keeps only the chunk ids, in order.
    let chunk_ids = || -> Vec<String> {
        CodePythonAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Rust code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeRustAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/kebab-chunk/src/code_rust_ast_v1.rs".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-rust-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "pub fn big_fn(input: &[u8]) -> Vec<u8> {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" let v{i} = input.get({i} as usize).copied().unwrap_or(0);\n"))
|
||||
.collect();
|
||||
let footer = " vec![0u8]\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. top-level use+const block (lines 1–5, ≤200)
|
||||
// 1. free fn `parse` (lines 7–12, ≤200)
|
||||
// 2. struct `Foo` (lines 14–20, ≤200)
|
||||
// 3. trait `Frobable` (lines 22–30, ≤200)
|
||||
// 4. impl Foo::double (lines 32–38, ≤200)
|
||||
// 5. impl Foo::triple (lines 40–46, ≤200)
|
||||
// 6. big_fn (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"use+const",
|
||||
1,
|
||||
5,
|
||||
"use std::collections::HashMap;\nuse std::fmt;\n\nconst MAX: usize = 1024;\nconst MIN: usize = 0;".to_string(),
|
||||
),
|
||||
(
|
||||
"parse",
|
||||
7,
|
||||
12,
|
||||
"pub fn parse(input: &str) -> Option<u32> {\n input\n .trim()\n .parse()\n .ok()\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo",
|
||||
14,
|
||||
20,
|
||||
"pub struct Foo {\n pub name: String,\n pub value: u32,\n pub tags: Vec<String>,\n pub meta: Option<String>,\n pub count: usize,\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Frobable",
|
||||
22,
|
||||
30,
|
||||
"pub trait Frobable {\n fn frob(&self) -> String;\n fn frob_twice(&self) -> String {\n let a = self.frob();\n let b = self.frob();\n format!(\"{a}{b}\")\n }\n fn name(&self) -> &str;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo::double",
|
||||
32,
|
||||
38,
|
||||
"impl Foo {\n pub fn double(&self) -> u32 {\n self.value\n .checked_mul(2)\n .unwrap_or(u32::MAX)\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo::triple",
|
||||
40,
|
||||
46,
|
||||
"impl Foo {\n pub fn triple(&self) -> u32 {\n self.value\n .checked_mul(3)\n .unwrap_or(u32::MAX)\n }\n}".to_string(),
|
||||
),
|
||||
("big_fn", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("rust".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("rust".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "code_rust_ast_v1.rs".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-rust-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` against
/// the baseline file `code-sample.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, a missing or
/// drifted baseline is (re)written instead of failing the test.
#[test]
fn code_rust_ast_chunks_snapshot() {
    let chunks = CodeRustAstV1Chunker
        .chunk(&fixed_doc(), &fixed_policy())
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.chunks.snapshot.json");

    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-prints the current chunks and stores them, newline-terminated,
    // as the new baseline.
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            write_baseline();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "code-rust-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism guard: chunking the same fixed document over and over must
/// produce the same chunk ids, in the same order, every time.
#[test]
fn code_rust_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One complete chunking pass over a freshly built document, reduced to
    // the raw chunk-id strings.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeRustAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap();
        chunks.into_iter().map(|chunk| chunk.chunk_id.0).collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
//! Behavioural tests for `CodeTextParagraphV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing raw text into a single `Block::Code`, mirroring the pattern used
|
||||
//! in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTextParagraphV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `text`
|
||||
/// and the supplied `lang` label.
|
||||
fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("scripts/sample.sh".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-text-paragraph-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some(lang.into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some(lang.into()),
|
||||
code: text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "sample.sh".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some(lang.into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-text-paragraph-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// The `sample_shell.sh` fixture holds four paragraphs divided by three
/// blank lines:
/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail)
/// - paragraph 2: lines 4-7 (env setup block)
/// - paragraph 3: lines 9-11 (ingest block)
/// - paragraph 4: lines 13-15 (report block)
///
/// Checked here:
/// - one chunk per paragraph, four in total
/// - every chunk's symbol is None (Tier 3 spec §9.3)
/// - every chunk's lang is "shell"
/// - the chunk line ranges are exactly the paragraph ranges above, so the
///   blank lines 3, 8 and 12 belong to no chunk
#[test]
fn shell_multi_paragraph_splits_on_blank_lines() {
    let fixture_path = fixtures_dir().join("sample_shell.sh");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    let doc = text_doc("shell", &text);
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &policy())
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        4,
        "expected 4 chunks (one per paragraph), got {}: {chunks:#?}",
        chunks.len()
    );

    // Tier 3 requirement: no chunk carries a symbol.
    chunks.iter().enumerate().for_each(|(i, chunk)| {
        let first_span = &chunk.source_spans[0];
        match first_span {
            SourceSpan::Code { symbol, .. } => assert!(
                symbol.is_none(),
                "chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}"
            ),
            other => panic!("chunk[{i}]: expected Code span, got {other:?}"),
        }
    });

    // Every chunk reports the language the document was built with.
    chunks.iter().enumerate().for_each(|(i, chunk)| {
        let first_span = &chunk.source_spans[0];
        match first_span {
            SourceSpan::Code { lang, .. } => assert_eq!(
                lang.as_deref(),
                Some("shell"),
                "chunk[{i}] lang must be 'shell', got {lang:?}"
            ),
            other => panic!("chunk[{i}]: expected Code span, got {other:?}"),
        }
    });

    // The ranges must equal the paragraph ranges, which leaves the blank
    // lines (3, 8, 12) outside every chunk.
    let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        let range = match &chunk.source_spans[0] {
            SourceSpan::Code {
                line_start,
                line_end,
                ..
            } => (*line_start, *line_end),
            other => panic!("expected Code span, got {other:?}"),
        };
        actual_ranges.push(range);
    }

    assert_eq!(
        actual_ranges, expected_ranges,
        "line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );
}
|
||||
|
||||
/// The `sample_long_paragraph.txt` fixture is 200 non-blank lines with no
/// blank line between them, i.e. a single paragraph. Since 200 exceeds 80
/// (FALLBACK_LINES_PER_CHUNK), the oversize window split applies, with a
/// stride of 60:
/// - window 1: lines 1-80
/// - window 2: lines 61-140
/// - window 3: lines 121-200
///
/// The chunk ids of the windows must all differ (the #L{window_start}
/// split_key suffix).
#[test]
fn single_long_paragraph_line_window_split() {
    let fixture_path = fixtures_dir().join("sample_long_paragraph.txt");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    // Guard against the fixture being edited out from under the test.
    assert_eq!(
        text.lines().count(),
        200,
        "fixture must have exactly 200 lines"
    );

    let doc = text_doc("shell", &text);
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &policy())
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        3,
        "expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}",
        chunks.len()
    );

    let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        let range = match &chunk.source_spans[0] {
            SourceSpan::Code {
                line_start,
                line_end,
                ..
            } => (*line_start, *line_end),
            other => panic!("expected Code span, got {other:?}"),
        };
        actual_ranges.push(range);
    }

    assert_eq!(
        actual_ranges, expected_ranges,
        "window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );

    // Collecting the ids into a set drops duplicates, so equal sizes mean
    // every window got its own chunk_id.
    let mut ids = std::collections::HashSet::new();
    for chunk in chunks.iter() {
        ids.insert(chunk.chunk_id.clone());
    }
    assert_eq!(
        ids.len(),
        chunks.len(),
        "oversize window chunks must have distinct chunk_ids"
    );
}
|
||||
|
||||
/// Chunking a document built from an empty string must produce no chunks.
#[test]
fn empty_file_emits_zero_chunks() {
    let doc = text_doc("shell", "");
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        0,
        "empty file must yield 0 chunks, got {}: {chunks:#?}",
        chunks.len()
    );
}
|
||||
|
||||
/// The first emitted chunk must carry the `lang` that was handed to
/// `text_doc` ("yaml" here), and its `symbol` must be `None` (Tier 3 spec).
#[test]
fn lang_field_preserved_from_input_doc() {
    let doc = text_doc("yaml", "key1: value1\nkey2: value2\n");
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    assert!(!chunks.is_empty(), "expected at least one chunk");

    // Pull both fields out of the first span, then check them in turn.
    let (lang, symbol) = match &chunks[0].source_spans[0] {
        SourceSpan::Code { lang, symbol, .. } => (lang, symbol),
        other => panic!("expected Code span, got {other:?}"),
    };
    assert_eq!(
        lang.as_deref(),
        Some("yaml"),
        "lang must be 'yaml', got {lang:?}"
    );
    assert!(
        symbol.is_none(),
        "symbol must be None for Tier 3 chunker, got {symbol:?}"
    );
}
|
||||
221
crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative TypeScript code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/Foo.ts".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-ts-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line method body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "export class BigProcessor {\n process(items: string[]): string[] {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" const v{i} = items[{i}] ?? '';\n"))
|
||||
.collect();
|
||||
let footer = " return items;\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `parseInput` (lines 7–12, ≤200)
|
||||
// 2. interface `Frobable` (lines 14–20, ≤200)
|
||||
// 3. class `Foo` (lines 22–30, ≤200)
|
||||
// 4. method `Foo.double` (lines 32–38, ≤200)
|
||||
// 5. method `Foo.triple` (lines 40–46, ≤200)
|
||||
// 6. BigProcessor (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import { readFileSync } from 'fs';\nimport { join } from 'path';\nimport type { Config } from './config';\nimport { Logger } from './logger';\nimport { EventEmitter } from 'events';".to_string(),
|
||||
),
|
||||
(
|
||||
"parseInput",
|
||||
7,
|
||||
12,
|
||||
"export function parseInput(raw: string): number | null {\n const trimmed = raw.trim();\n const n = Number(trimmed);\n if (isNaN(n)) return null;\n return n;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Frobable",
|
||||
14,
|
||||
20,
|
||||
"export interface Frobable {\n frob(): string;\n frobTwice(): string;\n readonly name: string;\n readonly tags: string[];\n count: number;\n reset(): void;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo",
|
||||
22,
|
||||
30,
|
||||
"export class Foo implements Frobable {\n constructor(\n public readonly name: string,\n public value: number,\n public tags: string[] = [],\n ) {}\n frob(): string { return this.name; }\n frobTwice(): string { return this.name.repeat(2); }\n reset(): void { this.value = 0; }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo.double",
|
||||
32,
|
||||
38,
|
||||
"export class Foo {\n double(): number {\n const result = this.value * 2;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo.triple",
|
||||
40,
|
||||
46,
|
||||
"export class Foo {\n triple(): number {\n const result = this.value * 3;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}".to_string(),
|
||||
),
|
||||
("BigProcessor", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("typescript".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("typescript".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Foo.ts".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-ts-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` against
/// the baseline file `code-sample.ts.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, a missing or
/// drifted baseline is (re)written instead of failing the test.
#[test]
fn code_ts_ast_chunks_snapshot() {
    let chunks = CodeTsAstV1Chunker
        .chunk(&fixed_doc(), &fixed_policy())
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.ts.chunks.snapshot.json");

    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Pretty-prints the current chunks and stores them, newline-terminated,
    // as the new baseline.
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: the fixtures directory may not exist yet.
            std::fs::create_dir_all(&dir).unwrap();
            write_baseline();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "code-ts-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism guard: chunking the same fixed document over and over must
/// produce the same chunk ids, in the same order, every time.
#[test]
fn code_ts_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One complete chunking pass over a freshly built document, reduced to
    // the raw chunk-id strings.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeTsAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap();
        chunks.into_iter().map(|chunk| chunk.chunk_id.0).collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,138 @@
|
||||
//! Behavioural tests for `DockerfileFileV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the
|
||||
//! pattern used in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::DockerfileFileV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`.
|
||||
fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("build/Dockerfile".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-dockerfile-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = dockerfile_text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some("dockerfile".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("dockerfile".into()),
|
||||
code: dockerfile_text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Dockerfile".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("dockerfile".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("dockerfile-file-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// The 5-line `sample.dockerfile` fixture must come out as a single chunk
/// spanning lines 1-5, with symbol `<dockerfile>`, lang `dockerfile` and the
/// `dockerfile-file-v1` chunker version.
#[test]
fn dockerfile_emits_single_chunk() {
    let fixture_path = fixtures_dir().join("sample.dockerfile");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    let doc = dockerfile_doc(&text);
    let chunks = DockerfileFileV1Chunker
        .chunk(&doc, &policy())
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        1,
        "expected 1 chunk, got {}: {chunks:#?}",
        chunks.len()
    );

    // Take the first span apart once, then check its fields one by one.
    let first_span = chunks[0].source_spans.first().expect("at least one span");
    let (line_start, line_end, symbol, lang) = match first_span {
        SourceSpan::Code {
            line_start,
            line_end,
            symbol,
            lang,
        } => (line_start, line_end, symbol, lang),
        other => panic!("expected SourceSpan::Code, got {other:?}"),
    };
    assert_eq!(*line_start, 1, "line_start must be 1");
    assert_eq!(*line_end, 5, "line_end must be 5 (5-line fixture)");
    assert_eq!(
        symbol.as_deref(),
        Some("<dockerfile>"),
        "symbol must be '<dockerfile>'"
    );
    assert_eq!(
        lang.as_deref(),
        Some("dockerfile"),
        "lang must be 'dockerfile'"
    );

    // The chunk must be stamped with this chunker's version label.
    assert_eq!(chunks[0].chunker_version.0, "dockerfile-file-v1");
}
|
||||
94
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
94
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
[
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"8149e12ca002489acb4a0f74c97a061a"
|
||||
],
|
||||
"chunk_id": "ec3cf06ae56c8e9796bbc9196438b7c5",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 18,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;",
|
||||
"token_estimate": 78,
|
||||
"tokenized_korean_text": "# include < stdio . h > # include < stdlib . h > # define MAX _ BUF 4096 typedef enum { OK = 0 , ERR _ PARSE , ERR _ IO , } status _ t ; typedef struct { int id ; char name [ 64 ]; status _ t status ; } record _ t ; static int counter = 0 ;"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"1baaa89f21a47b2f32d6396a24a85454"
|
||||
],
|
||||
"chunk_id": "c2d7a81c898106733ef2e703774a6a4a",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 23,
|
||||
"line_start": 20,
|
||||
"symbol": "parse_record"
|
||||
}
|
||||
],
|
||||
"text": "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}",
|
||||
"token_estimate": 41,
|
||||
"tokenized_korean_text": "int parse _ record ( const char * line , record _ t * out ) { if ( line == NULL || out == NULL ) return ERR _ PARSE ; return OK ; }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"8d0e14cbcc6d1e92d7878ab796ea68b8"
|
||||
],
|
||||
"chunk_id": "0e4d7b131ab64eba03b51903b5d8f96d",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 27,
|
||||
"line_start": 25,
|
||||
"symbol": "print_record"
|
||||
}
|
||||
],
|
||||
"text": "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}",
|
||||
"token_estimate": 35,
|
||||
"tokenized_korean_text": "void print _ record ( const record _ t * r ) { printf (\"[% d ] % s ( status =% d )\\ n \", r -> id , r -> name , r -> status ); }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"9c2ede84423871b615d48c38fefb1853"
|
||||
],
|
||||
"chunk_id": "e076f8edb2ff141d7e99b4106bb95157",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 33,
|
||||
"line_start": 29,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}",
|
||||
"token_estimate": 38,
|
||||
"tokenized_korean_text": "int main ( void ) { record _ t r = { . id = 1 , . name = \" foo \", . status = OK }; print _ record (& r ); return 0 ; }"
|
||||
}
|
||||
]
|
||||
186
crates/kebab-chunk/tests/fixtures/code-sample.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
117
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
117
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
[
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"53292605459065d170cd36c118e20546"
|
||||
],
|
||||
"chunk_id": "50a5b324300d9082eac4ce2a422810e1",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 4,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <string>\n#include <vector>\n\nnamespace kebab {",
|
||||
"token_estimate": 18,
|
||||
"tokenized_korean_text": "# include < string > # include < vector > namespace kebab {"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"f349acad94c9fa4cf9ad1c0a93e83610"
|
||||
],
|
||||
"chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 20,
|
||||
"line_start": 6,
|
||||
"symbol": "kebab::chunk::MdHeadingV1Chunker"
|
||||
}
|
||||
],
|
||||
"text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};",
|
||||
"token_estimate": 95,
|
||||
"tokenized_korean_text": "class MdHeadingV 1 Chunker { public : MdHeadingV 1 Chunker ( ) = default ; ~ MdHeadingV 1 Chunker ( ) = default ; std : : string chunk _ doc ( const std : : string & doc ) { return doc ; } int operator ( ) ( int x ) const { return x * 2 ; } private : int counter _ = 0 ; };"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"8b9811387717d0bd4abf84abcc35b8b1"
|
||||
],
|
||||
"chunk_id": "d9326d252905b665b2adb9a416c20451",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 25,
|
||||
"line_start": 22,
|
||||
"symbol": "kebab::identity"
|
||||
}
|
||||
],
|
||||
"text": "template <typename T>\nT identity(T value) {\n return value;\n}",
|
||||
"token_estimate": 21,
|
||||
"tokenized_korean_text": "template < typename T > T identity ( T value ) { return value ; }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"1754cb6b971f6a4cb292f144a4f0570b"
|
||||
],
|
||||
"chunk_id": "56ee5f991de4a413c016da8dc4acfc35",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 29,
|
||||
"line_start": 27,
|
||||
"symbol": "kebab::global_helper"
|
||||
}
|
||||
],
|
||||
"text": "void global_helper() {\n // free function in kebab namespace\n}",
|
||||
"token_estimate": 22,
|
||||
"tokenized_korean_text": "void global _ helper ( ) { / / free function in kebab namespace }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"14b5f3393d6d25f822f5b70763d24acd"
|
||||
],
|
||||
"chunk_id": "c0d7c043cdd575c530db3909b54cc906",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 34,
|
||||
"line_start": 31,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}",
|
||||
"token_estimate": 23,
|
||||
"tokenized_korean_text": "int main ( ) { kebab : : chunk : : MdHeadingV 1 Chunker c ; return 0 ; }"
|
||||
}
|
||||
]
|
||||
255
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
255
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,255 @@
|
||||
[
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"c182bf37e32c7fc1b868bd617f8eaf66"
|
||||
],
|
||||
"chunk_id": "43de518d946dc18ec040ae20d74e0cff",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 5,
|
||||
"line_start": 1,
|
||||
"symbol": "imports"
|
||||
}
|
||||
],
|
||||
"text": "import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)",
|
||||
"token_estimate": 12,
|
||||
"tokenized_korean_text": "import ( \" fmt \" \" os \" \" strings \" )"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"c9992cdcfdf3c2a7700a4abc4782a8a4"
|
||||
],
|
||||
"chunk_id": "af4c382a83f1e8cdea495d8b33c11abc",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 12,
|
||||
"line_start": 7,
|
||||
"symbol": "ComputeMRR"
|
||||
}
|
||||
],
|
||||
"text": "func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}",
|
||||
"token_estimate": 50,
|
||||
"tokenized_korean_text": "func ComputeMRR ( scores [ ] float 64 ) float 64 { if len ( scores ) == 0 { return 0 . 0 } _ = fmt . Sprintf (\"% v \", scores ) return 1 . 0 / float 64 ( len ( scores ) ) }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5f18dc3e79fe946ba05d32c3bfc00684"
|
||||
],
|
||||
"chunk_id": "4be6d8f180bc19b8651877e5264852ac",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 20,
|
||||
"line_start": 14,
|
||||
"symbol": "MetricsCollector"
|
||||
}
|
||||
],
|
||||
"text": "type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}",
|
||||
"token_estimate": 45,
|
||||
"tokenized_korean_text": "type MetricsCollector struct { Scores [ ] float 64 Labels [ ] string Counts map [ string ] int Totals map [ string ] float 64 Tags [ ] string }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"3009cc022ca832c323393e4f9bcdb388"
|
||||
],
|
||||
"chunk_id": "3ae182f4c6d304ee7f0aaf447142f948",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 30,
|
||||
"line_start": 22,
|
||||
"symbol": "BaseEvaluator"
|
||||
}
|
||||
],
|
||||
"text": "type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "type BaseEvaluator struct { Name string } func ( e * BaseEvaluator ) Evaluate ( data [ ] string ) error { _ = os . Stderr _ = strings . Join ( data , \",\") return nil }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"e0e83d1d7f9327a1902ae9a8f67c1f1c"
|
||||
],
|
||||
"chunk_id": "b962f14980e756bb8ba514e2282756cd",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 38,
|
||||
"line_start": 32,
|
||||
"symbol": "MetricsCollector.Run"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}",
|
||||
"token_estimate": 44,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Run ( inputs [ ] float 64 ) { for _, inp := range inputs { m . Scores = append ( m . Scores , inp , ) } }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"0e6a572bc3fe2bd6d173fe614bd1b763"
|
||||
],
|
||||
"chunk_id": "441c695e990e7f49188068433e313e87",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 46,
|
||||
"line_start": 40,
|
||||
"symbol": "MetricsCollector.Report"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Report ( ) map [ string ] interface {} { return map [ string ] interface {}{ \" mean \": 0 . 0 , \" count \": len ( m . Scores ) , \" tags \": m . Tags , } }"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "7a942d871c588ec69426290561f05179",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 247,
|
||||
"line_start": 48,
|
||||
"symbol": "BigCompute [part 1/5]"
|
||||
}
|
||||
],
|
||||
"text": "func BigCompute(data []int) int {\n\tv0 := 0\n\tif 0 < len(data) {\n\t\tv0 = data[0]\n\t}\n\tv1 := 0\n\tif 1 < len(data) {\n\t\tv1 = data[1]\n\t}\n\tv2 := 0\n\tif 2 < len(data) {\n\t\tv2 = data[2]\n\t}\n\tv3 := 0\n\tif 3 < len(data) {\n\t\tv3 = data[3]\n\t}\n\tv4 := 0\n\tif 4 < len(data) {\n\t\tv4 = data[4]\n\t}\n\tv5 := 0\n\tif 5 < len(data) {\n\t\tv5 = data[5]\n\t}\n\tv6 := 0\n\tif 6 < len(data) {\n\t\tv6 = data[6]\n\t}\n\tv7 := 0\n\tif 7 < len(data) {\n\t\tv7 = data[7]\n\t}\n\tv8 := 0\n\tif 8 < len(data) {\n\t\tv8 = data[8]\n\t}\n\tv9 := 0\n\tif 9 < len(data) {\n\t\tv9 = data[9]\n\t}\n\tv10 := 0\n\tif 10 < len(data) {\n\t\tv10 = data[10]\n\t}\n\tv11 := 0\n\tif 11 < len(data) {\n\t\tv11 = data[11]\n\t}\n\tv12 := 0\n\tif 12 < len(data) {\n\t\tv12 = data[12]\n\t}\n\tv13 := 0\n\tif 13 < len(data) {\n\t\tv13 = data[13]\n\t}\n\tv14 := 0\n\tif 14 < len(data) {\n\t\tv14 = data[14]\n\t}\n\tv15 := 0\n\tif 15 < len(data) {\n\t\tv15 = data[15]\n\t}\n\tv16 := 0\n\tif 16 < len(data) {\n\t\tv16 = data[16]\n\t}\n\tv17 := 0\n\tif 17 < len(data) {\n\t\tv17 = data[17]\n\t}\n\tv18 := 0\n\tif 18 < len(data) {\n\t\tv18 = data[18]\n\t}\n\tv19 := 0\n\tif 19 < len(data) {\n\t\tv19 = data[19]\n\t}\n\tv20 := 0\n\tif 20 < len(data) {\n\t\tv20 = data[20]\n\t}\n\tv21 := 0\n\tif 21 < len(data) {\n\t\tv21 = data[21]\n\t}\n\tv22 := 0\n\tif 22 < len(data) {\n\t\tv22 = data[22]\n\t}\n\tv23 := 0\n\tif 23 < len(data) {\n\t\tv23 = data[23]\n\t}\n\tv24 := 0\n\tif 24 < len(data) {\n\t\tv24 = data[24]\n\t}\n\tv25 := 0\n\tif 25 < len(data) {\n\t\tv25 = data[25]\n\t}\n\tv26 := 0\n\tif 26 < len(data) {\n\t\tv26 = data[26]\n\t}\n\tv27 := 0\n\tif 27 < len(data) {\n\t\tv27 = data[27]\n\t}\n\tv28 := 0\n\tif 28 < len(data) {\n\t\tv28 = data[28]\n\t}\n\tv29 := 0\n\tif 29 < len(data) {\n\t\tv29 = data[29]\n\t}\n\tv30 := 0\n\tif 30 < len(data) {\n\t\tv30 = data[30]\n\t}\n\tv31 := 0\n\tif 31 < len(data) {\n\t\tv31 = data[31]\n\t}\n\tv32 := 0\n\tif 32 < len(data) {\n\t\tv32 = data[32]\n\t}\n\tv33 := 
0\n\tif 33 < len(data) {\n\t\tv33 = data[33]\n\t}\n\tv34 := 0\n\tif 34 < len(data) {\n\t\tv34 = data[34]\n\t}\n\tv35 := 0\n\tif 35 < len(data) {\n\t\tv35 = data[35]\n\t}\n\tv36 := 0\n\tif 36 < len(data) {\n\t\tv36 = data[36]\n\t}\n\tv37 := 0\n\tif 37 < len(data) {\n\t\tv37 = data[37]\n\t}\n\tv38 := 0\n\tif 38 < len(data) {\n\t\tv38 = data[38]\n\t}\n\tv39 := 0\n\tif 39 < len(data) {\n\t\tv39 = data[39]\n\t}\n\tv40 := 0\n\tif 40 < len(data) {\n\t\tv40 = data[40]\n\t}\n\tv41 := 0\n\tif 41 < len(data) {\n\t\tv41 = data[41]\n\t}\n\tv42 := 0\n\tif 42 < len(data) {\n\t\tv42 = data[42]\n\t}\n\tv43 := 0\n\tif 43 < len(data) {\n\t\tv43 = data[43]\n\t}\n\tv44 := 0\n\tif 44 < len(data) {\n\t\tv44 = data[44]\n\t}\n\tv45 := 0\n\tif 45 < len(data) {\n\t\tv45 = data[45]\n\t}\n\tv46 := 0\n\tif 46 < len(data) {\n\t\tv46 = data[46]\n\t}\n\tv47 := 0\n\tif 47 < len(data) {\n\t\tv47 = data[47]\n\t}\n\tv48 := 0\n\tif 48 < len(data) {\n\t\tv48 = data[48]\n\t}\n\tv49 := 0\n\tif 49 < len(data) {\n\t\tv49 = data[49]",
|
||||
"token_estimate": 847,
|
||||
"tokenized_korean_text": "func BigCompute ( data [ ] int ) int { v 0 := 0 if 0 < len ( data ) { v 0 = data [ 0 ] } v 1 := 0 if 1 < len ( data ) { v 1 = data [ 1 ] } v 2 := 0 if 2 < len ( data ) { v 2 = data [ 2 ] } v 3 := 0 if 3 < len ( data ) { v 3 = data [ 3 ] } v 4 := 0 if 4 < len ( data ) { v 4 = data [ 4 ] } v 5 := 0 if 5 < len ( data ) { v 5 = data [ 5 ] } v 6 := 0 if 6 < len ( data ) { v 6 = data [ 6 ] } v 7 := 0 if 7 < len ( data ) { v 7 = data [ 7 ] } v 8 := 0 if 8 < len ( data ) { v 8 = data [ 8 ] } v 9 := 0 if 9 < len ( data ) { v 9 = data [ 9 ] } v 10 := 0 if 10 < len ( data ) { v 10 = data [ 10 ] } v 11 := 0 if 11 < len ( data ) { v 11 = data [ 11 ] } v 12 := 0 if 12 < len ( data ) { v 12 = data [ 12 ] } v 13 := 0 if 13 < len ( data ) { v 13 = data [ 13 ] } v 14 := 0 if 14 < len ( data ) { v 14 = data [ 14 ] } v 15 := 0 if 15 < len ( data ) { v 15 = data [ 15 ] } v 16 := 0 if 16 < len ( data ) { v 16 = data [ 16 ] } v 17 := 0 if 17 < len ( data ) { v 17 = data [ 17 ] } v 18 := 0 if 18 < len ( data ) { v 18 = data [ 18 ] } v 19 := 0 if 19 < len ( data ) { v 19 = data [ 19 ] } v 20 := 0 if 20 < len ( data ) { v 20 = data [ 20 ] } v 21 := 0 if 21 < len ( data ) { v 21 = data [ 21 ] } v 22 := 0 if 22 < len ( data ) { v 22 = data [ 22 ] } v 23 := 0 if 23 < len ( data ) { v 23 = data [ 23 ] } v 24 := 0 if 24 < len ( data ) { v 24 = data [ 24 ] } v 25 := 0 if 25 < len ( data ) { v 25 = data [ 25 ] } v 26 := 0 if 26 < len ( data ) { v 26 = data [ 26 ] } v 27 := 0 if 27 < len ( data ) { v 27 = data [ 27 ] } v 28 := 0 if 28 < len ( data ) { v 28 = data [ 28 ] } v 29 := 0 if 29 < len ( data ) { v 29 = data [ 29 ] } v 30 := 0 if 30 < len ( data ) { v 30 = data [ 30 ] } v 31 := 0 if 31 < len ( data ) { v 31 = data [ 31 ] } v 32 := 0 if 32 < len ( data ) { v 32 = data [ 32 ] } v 33 := 0 if 33 < len ( data ) { v 33 = data [ 33 ] } v 34 := 0 if 34 < len ( data ) { v 34 = data [ 34 ] } v 35 := 0 if 35 < len ( data ) { v 35 = data [ 35 ] } v 36 := 0 if 36 < len ( data ) 
{ v 36 = data [ 36 ] } v 37 := 0 if 37 < len ( data ) { v 37 = data [ 37 ] } v 38 := 0 if 38 < len ( data ) { v 38 = data [ 38 ] } v 39 := 0 if 39 < len ( data ) { v 39 = data [ 39 ] } v 40 := 0 if 40 < len ( data ) { v 40 = data [ 40 ] } v 41 := 0 if 41 < len ( data ) { v 41 = data [ 41 ] } v 42 := 0 if 42 < len ( data ) { v 42 = data [ 42 ] } v 43 := 0 if 43 < len ( data ) { v 43 = data [ 43 ] } v 44 := 0 if 44 < len ( data ) { v 44 = data [ 44 ] } v 45 := 0 if 45 < len ( data ) { v 45 = data [ 45 ] } v 46 := 0 if 46 < len ( data ) { v 46 = data [ 46 ] } v 47 := 0 if 47 < len ( data ) { v 47 = data [ 47 ] } v 48 := 0 if 48 < len ( data ) { v 48 = data [ 48 ] } v 49 := 0 if 49 < len ( data ) { v 49 = data [ 49 ]"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "3f44ba43c9415652e2705bb667776e76",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 447,
|
||||
"line_start": 248,
|
||||
"symbol": "BigCompute [part 2/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv50 := 0\n\tif 50 < len(data) {\n\t\tv50 = data[50]\n\t}\n\tv51 := 0\n\tif 51 < len(data) {\n\t\tv51 = data[51]\n\t}\n\tv52 := 0\n\tif 52 < len(data) {\n\t\tv52 = data[52]\n\t}\n\tv53 := 0\n\tif 53 < len(data) {\n\t\tv53 = data[53]\n\t}\n\tv54 := 0\n\tif 54 < len(data) {\n\t\tv54 = data[54]\n\t}\n\tv55 := 0\n\tif 55 < len(data) {\n\t\tv55 = data[55]\n\t}\n\tv56 := 0\n\tif 56 < len(data) {\n\t\tv56 = data[56]\n\t}\n\tv57 := 0\n\tif 57 < len(data) {\n\t\tv57 = data[57]\n\t}\n\tv58 := 0\n\tif 58 < len(data) {\n\t\tv58 = data[58]\n\t}\n\tv59 := 0\n\tif 59 < len(data) {\n\t\tv59 = data[59]\n\t}\n\tv60 := 0\n\tif 60 < len(data) {\n\t\tv60 = data[60]\n\t}\n\tv61 := 0\n\tif 61 < len(data) {\n\t\tv61 = data[61]\n\t}\n\tv62 := 0\n\tif 62 < len(data) {\n\t\tv62 = data[62]\n\t}\n\tv63 := 0\n\tif 63 < len(data) {\n\t\tv63 = data[63]\n\t}\n\tv64 := 0\n\tif 64 < len(data) {\n\t\tv64 = data[64]\n\t}\n\tv65 := 0\n\tif 65 < len(data) {\n\t\tv65 = data[65]\n\t}\n\tv66 := 0\n\tif 66 < len(data) {\n\t\tv66 = data[66]\n\t}\n\tv67 := 0\n\tif 67 < len(data) {\n\t\tv67 = data[67]\n\t}\n\tv68 := 0\n\tif 68 < len(data) {\n\t\tv68 = data[68]\n\t}\n\tv69 := 0\n\tif 69 < len(data) {\n\t\tv69 = data[69]\n\t}\n\tv70 := 0\n\tif 70 < len(data) {\n\t\tv70 = data[70]\n\t}\n\tv71 := 0\n\tif 71 < len(data) {\n\t\tv71 = data[71]\n\t}\n\tv72 := 0\n\tif 72 < len(data) {\n\t\tv72 = data[72]\n\t}\n\tv73 := 0\n\tif 73 < len(data) {\n\t\tv73 = data[73]\n\t}\n\tv74 := 0\n\tif 74 < len(data) {\n\t\tv74 = data[74]\n\t}\n\tv75 := 0\n\tif 75 < len(data) {\n\t\tv75 = data[75]\n\t}\n\tv76 := 0\n\tif 76 < len(data) {\n\t\tv76 = data[76]\n\t}\n\tv77 := 0\n\tif 77 < len(data) {\n\t\tv77 = data[77]\n\t}\n\tv78 := 0\n\tif 78 < len(data) {\n\t\tv78 = data[78]\n\t}\n\tv79 := 0\n\tif 79 < len(data) {\n\t\tv79 = data[79]\n\t}\n\tv80 := 0\n\tif 80 < len(data) {\n\t\tv80 = data[80]\n\t}\n\tv81 := 0\n\tif 81 < len(data) {\n\t\tv81 = data[81]\n\t}\n\tv82 := 0\n\tif 82 < len(data) {\n\t\tv82 = data[82]\n\t}\n\tv83 
:= 0\n\tif 83 < len(data) {\n\t\tv83 = data[83]\n\t}\n\tv84 := 0\n\tif 84 < len(data) {\n\t\tv84 = data[84]\n\t}\n\tv85 := 0\n\tif 85 < len(data) {\n\t\tv85 = data[85]\n\t}\n\tv86 := 0\n\tif 86 < len(data) {\n\t\tv86 = data[86]\n\t}\n\tv87 := 0\n\tif 87 < len(data) {\n\t\tv87 = data[87]\n\t}\n\tv88 := 0\n\tif 88 < len(data) {\n\t\tv88 = data[88]\n\t}\n\tv89 := 0\n\tif 89 < len(data) {\n\t\tv89 = data[89]\n\t}\n\tv90 := 0\n\tif 90 < len(data) {\n\t\tv90 = data[90]\n\t}\n\tv91 := 0\n\tif 91 < len(data) {\n\t\tv91 = data[91]\n\t}\n\tv92 := 0\n\tif 92 < len(data) {\n\t\tv92 = data[92]\n\t}\n\tv93 := 0\n\tif 93 < len(data) {\n\t\tv93 = data[93]\n\t}\n\tv94 := 0\n\tif 94 < len(data) {\n\t\tv94 = data[94]\n\t}\n\tv95 := 0\n\tif 95 < len(data) {\n\t\tv95 = data[95]\n\t}\n\tv96 := 0\n\tif 96 < len(data) {\n\t\tv96 = data[96]\n\t}\n\tv97 := 0\n\tif 97 < len(data) {\n\t\tv97 = data[97]\n\t}\n\tv98 := 0\n\tif 98 < len(data) {\n\t\tv98 = data[98]\n\t}\n\tv99 := 0\n\tif 99 < len(data) {\n\t\tv99 = data[99]",
|
||||
"token_estimate": 850,
|
||||
"tokenized_korean_text": "} v 50 := 0 if 50 < len ( data ) { v 50 = data [ 50 ] } v 51 := 0 if 51 < len ( data ) { v 51 = data [ 51 ] } v 52 := 0 if 52 < len ( data ) { v 52 = data [ 52 ] } v 53 := 0 if 53 < len ( data ) { v 53 = data [ 53 ] } v 54 := 0 if 54 < len ( data ) { v 54 = data [ 54 ] } v 55 := 0 if 55 < len ( data ) { v 55 = data [ 55 ] } v 56 := 0 if 56 < len ( data ) { v 56 = data [ 56 ] } v 57 := 0 if 57 < len ( data ) { v 57 = data [ 57 ] } v 58 := 0 if 58 < len ( data ) { v 58 = data [ 58 ] } v 59 := 0 if 59 < len ( data ) { v 59 = data [ 59 ] } v 60 := 0 if 60 < len ( data ) { v 60 = data [ 60 ] } v 61 := 0 if 61 < len ( data ) { v 61 = data [ 61 ] } v 62 := 0 if 62 < len ( data ) { v 62 = data [ 62 ] } v 63 := 0 if 63 < len ( data ) { v 63 = data [ 63 ] } v 64 := 0 if 64 < len ( data ) { v 64 = data [ 64 ] } v 65 := 0 if 65 < len ( data ) { v 65 = data [ 65 ] } v 66 := 0 if 66 < len ( data ) { v 66 = data [ 66 ] } v 67 := 0 if 67 < len ( data ) { v 67 = data [ 67 ] } v 68 := 0 if 68 < len ( data ) { v 68 = data [ 68 ] } v 69 := 0 if 69 < len ( data ) { v 69 = data [ 69 ] } v 70 := 0 if 70 < len ( data ) { v 70 = data [ 70 ] } v 71 := 0 if 71 < len ( data ) { v 71 = data [ 71 ] } v 72 := 0 if 72 < len ( data ) { v 72 = data [ 72 ] } v 73 := 0 if 73 < len ( data ) { v 73 = data [ 73 ] } v 74 := 0 if 74 < len ( data ) { v 74 = data [ 74 ] } v 75 := 0 if 75 < len ( data ) { v 75 = data [ 75 ] } v 76 := 0 if 76 < len ( data ) { v 76 = data [ 76 ] } v 77 := 0 if 77 < len ( data ) { v 77 = data [ 77 ] } v 78 := 0 if 78 < len ( data ) { v 78 = data [ 78 ] } v 79 := 0 if 79 < len ( data ) { v 79 = data [ 79 ] } v 80 := 0 if 80 < len ( data ) { v 80 = data [ 80 ] } v 81 := 0 if 81 < len ( data ) { v 81 = data [ 81 ] } v 82 := 0 if 82 < len ( data ) { v 82 = data [ 82 ] } v 83 := 0 if 83 < len ( data ) { v 83 = data [ 83 ] } v 84 := 0 if 84 < len ( data ) { v 84 = data [ 84 ] } v 85 := 0 if 85 < len ( data ) { v 85 = data [ 85 ] } v 86 := 0 if 86 < len ( 
data ) { v 86 = data [ 86 ] } v 87 := 0 if 87 < len ( data ) { v 87 = data [ 87 ] } v 88 := 0 if 88 < len ( data ) { v 88 = data [ 88 ] } v 89 := 0 if 89 < len ( data ) { v 89 = data [ 89 ] } v 90 := 0 if 90 < len ( data ) { v 90 = data [ 90 ] } v 91 := 0 if 91 < len ( data ) { v 91 = data [ 91 ] } v 92 := 0 if 92 < len ( data ) { v 92 = data [ 92 ] } v 93 := 0 if 93 < len ( data ) { v 93 = data [ 93 ] } v 94 := 0 if 94 < len ( data ) { v 94 = data [ 94 ] } v 95 := 0 if 95 < len ( data ) { v 95 = data [ 95 ] } v 96 := 0 if 96 < len ( data ) { v 96 = data [ 96 ] } v 97 := 0 if 97 < len ( data ) { v 97 = data [ 97 ] } v 98 := 0 if 98 < len ( data ) { v 98 = data [ 98 ] } v 99 := 0 if 99 < len ( data ) { v 99 = data [ 99 ]"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "e4763e10f059d97f40c2932761b56c3e",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 647,
|
||||
"line_start": 448,
|
||||
"symbol": "BigCompute [part 3/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv100 := 0\n\tif 100 < len(data) {\n\t\tv100 = data[100]\n\t}\n\tv101 := 0\n\tif 101 < len(data) {\n\t\tv101 = data[101]\n\t}\n\tv102 := 0\n\tif 102 < len(data) {\n\t\tv102 = data[102]\n\t}\n\tv103 := 0\n\tif 103 < len(data) {\n\t\tv103 = data[103]\n\t}\n\tv104 := 0\n\tif 104 < len(data) {\n\t\tv104 = data[104]\n\t}\n\tv105 := 0\n\tif 105 < len(data) {\n\t\tv105 = data[105]\n\t}\n\tv106 := 0\n\tif 106 < len(data) {\n\t\tv106 = data[106]\n\t}\n\tv107 := 0\n\tif 107 < len(data) {\n\t\tv107 = data[107]\n\t}\n\tv108 := 0\n\tif 108 < len(data) {\n\t\tv108 = data[108]\n\t}\n\tv109 := 0\n\tif 109 < len(data) {\n\t\tv109 = data[109]\n\t}\n\tv110 := 0\n\tif 110 < len(data) {\n\t\tv110 = data[110]\n\t}\n\tv111 := 0\n\tif 111 < len(data) {\n\t\tv111 = data[111]\n\t}\n\tv112 := 0\n\tif 112 < len(data) {\n\t\tv112 = data[112]\n\t}\n\tv113 := 0\n\tif 113 < len(data) {\n\t\tv113 = data[113]\n\t}\n\tv114 := 0\n\tif 114 < len(data) {\n\t\tv114 = data[114]\n\t}\n\tv115 := 0\n\tif 115 < len(data) {\n\t\tv115 = data[115]\n\t}\n\tv116 := 0\n\tif 116 < len(data) {\n\t\tv116 = data[116]\n\t}\n\tv117 := 0\n\tif 117 < len(data) {\n\t\tv117 = data[117]\n\t}\n\tv118 := 0\n\tif 118 < len(data) {\n\t\tv118 = data[118]\n\t}\n\tv119 := 0\n\tif 119 < len(data) {\n\t\tv119 = data[119]\n\t}\n\tv120 := 0\n\tif 120 < len(data) {\n\t\tv120 = data[120]\n\t}\n\tv121 := 0\n\tif 121 < len(data) {\n\t\tv121 = data[121]\n\t}\n\tv122 := 0\n\tif 122 < len(data) {\n\t\tv122 = data[122]\n\t}\n\tv123 := 0\n\tif 123 < len(data) {\n\t\tv123 = data[123]\n\t}\n\tv124 := 0\n\tif 124 < len(data) {\n\t\tv124 = data[124]\n\t}\n\tv125 := 0\n\tif 125 < len(data) {\n\t\tv125 = data[125]\n\t}\n\tv126 := 0\n\tif 126 < len(data) {\n\t\tv126 = data[126]\n\t}\n\tv127 := 0\n\tif 127 < len(data) {\n\t\tv127 = data[127]\n\t}\n\tv128 := 0\n\tif 128 < len(data) {\n\t\tv128 = data[128]\n\t}\n\tv129 := 0\n\tif 129 < len(data) {\n\t\tv129 = data[129]\n\t}\n\tv130 := 0\n\tif 130 < len(data) {\n\t\tv130 = 
data[130]\n\t}\n\tv131 := 0\n\tif 131 < len(data) {\n\t\tv131 = data[131]\n\t}\n\tv132 := 0\n\tif 132 < len(data) {\n\t\tv132 = data[132]\n\t}\n\tv133 := 0\n\tif 133 < len(data) {\n\t\tv133 = data[133]\n\t}\n\tv134 := 0\n\tif 134 < len(data) {\n\t\tv134 = data[134]\n\t}\n\tv135 := 0\n\tif 135 < len(data) {\n\t\tv135 = data[135]\n\t}\n\tv136 := 0\n\tif 136 < len(data) {\n\t\tv136 = data[136]\n\t}\n\tv137 := 0\n\tif 137 < len(data) {\n\t\tv137 = data[137]\n\t}\n\tv138 := 0\n\tif 138 < len(data) {\n\t\tv138 = data[138]\n\t}\n\tv139 := 0\n\tif 139 < len(data) {\n\t\tv139 = data[139]\n\t}\n\tv140 := 0\n\tif 140 < len(data) {\n\t\tv140 = data[140]\n\t}\n\tv141 := 0\n\tif 141 < len(data) {\n\t\tv141 = data[141]\n\t}\n\tv142 := 0\n\tif 142 < len(data) {\n\t\tv142 = data[142]\n\t}\n\tv143 := 0\n\tif 143 < len(data) {\n\t\tv143 = data[143]\n\t}\n\tv144 := 0\n\tif 144 < len(data) {\n\t\tv144 = data[144]\n\t}\n\tv145 := 0\n\tif 145 < len(data) {\n\t\tv145 = data[145]\n\t}\n\tv146 := 0\n\tif 146 < len(data) {\n\t\tv146 = data[146]\n\t}\n\tv147 := 0\n\tif 147 < len(data) {\n\t\tv147 = data[147]\n\t}\n\tv148 := 0\n\tif 148 < len(data) {\n\t\tv148 = data[148]\n\t}\n\tv149 := 0\n\tif 149 < len(data) {\n\t\tv149 = data[149]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 100 := 0 if 100 < len ( data ) { v 100 = data [ 100 ] } v 101 := 0 if 101 < len ( data ) { v 101 = data [ 101 ] } v 102 := 0 if 102 < len ( data ) { v 102 = data [ 102 ] } v 103 := 0 if 103 < len ( data ) { v 103 = data [ 103 ] } v 104 := 0 if 104 < len ( data ) { v 104 = data [ 104 ] } v 105 := 0 if 105 < len ( data ) { v 105 = data [ 105 ] } v 106 := 0 if 106 < len ( data ) { v 106 = data [ 106 ] } v 107 := 0 if 107 < len ( data ) { v 107 = data [ 107 ] } v 108 := 0 if 108 < len ( data ) { v 108 = data [ 108 ] } v 109 := 0 if 109 < len ( data ) { v 109 = data [ 109 ] } v 110 := 0 if 110 < len ( data ) { v 110 = data [ 110 ] } v 111 := 0 if 111 < len ( data ) { v 111 = data [ 111 ] } v 112 := 0 if 112 < len ( data ) { v 112 = data [ 112 ] } v 113 := 0 if 113 < len ( data ) { v 113 = data [ 113 ] } v 114 := 0 if 114 < len ( data ) { v 114 = data [ 114 ] } v 115 := 0 if 115 < len ( data ) { v 115 = data [ 115 ] } v 116 := 0 if 116 < len ( data ) { v 116 = data [ 116 ] } v 117 := 0 if 117 < len ( data ) { v 117 = data [ 117 ] } v 118 := 0 if 118 < len ( data ) { v 118 = data [ 118 ] } v 119 := 0 if 119 < len ( data ) { v 119 = data [ 119 ] } v 120 := 0 if 120 < len ( data ) { v 120 = data [ 120 ] } v 121 := 0 if 121 < len ( data ) { v 121 = data [ 121 ] } v 122 := 0 if 122 < len ( data ) { v 122 = data [ 122 ] } v 123 := 0 if 123 < len ( data ) { v 123 = data [ 123 ] } v 124 := 0 if 124 < len ( data ) { v 124 = data [ 124 ] } v 125 := 0 if 125 < len ( data ) { v 125 = data [ 125 ] } v 126 := 0 if 126 < len ( data ) { v 126 = data [ 126 ] } v 127 := 0 if 127 < len ( data ) { v 127 = data [ 127 ] } v 128 := 0 if 128 < len ( data ) { v 128 = data [ 128 ] } v 129 := 0 if 129 < len ( data ) { v 129 = data [ 129 ] } v 130 := 0 if 130 < len ( data ) { v 130 = data [ 130 ] } v 131 := 0 if 131 < len ( data ) { v 131 = data [ 131 ] } v 132 := 0 if 132 < len ( data ) { v 132 = data [ 132 ] } v 133 := 0 if 133 < len ( data ) { v 133 = data [ 133 ] } 
v 134 := 0 if 134 < len ( data ) { v 134 = data [ 134 ] } v 135 := 0 if 135 < len ( data ) { v 135 = data [ 135 ] } v 136 := 0 if 136 < len ( data ) { v 136 = data [ 136 ] } v 137 := 0 if 137 < len ( data ) { v 137 = data [ 137 ] } v 138 := 0 if 138 < len ( data ) { v 138 = data [ 138 ] } v 139 := 0 if 139 < len ( data ) { v 139 = data [ 139 ] } v 140 := 0 if 140 < len ( data ) { v 140 = data [ 140 ] } v 141 := 0 if 141 < len ( data ) { v 141 = data [ 141 ] } v 142 := 0 if 142 < len ( data ) { v 142 = data [ 142 ] } v 143 := 0 if 143 < len ( data ) { v 143 = data [ 143 ] } v 144 := 0 if 144 < len ( data ) { v 144 = data [ 144 ] } v 145 := 0 if 145 < len ( data ) { v 145 = data [ 145 ] } v 146 := 0 if 146 < len ( data ) { v 146 = data [ 146 ] } v 147 := 0 if 147 < len ( data ) { v 147 = data [ 147 ] } v 148 := 0 if 148 < len ( data ) { v 148 = data [ 148 ] } v 149 := 0 if 149 < len ( data ) { v 149 = data [ 149 ]"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "24176c911d0bacf9a29fa7f8251f5036",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 847,
|
||||
"line_start": 648,
|
||||
"symbol": "BigCompute [part 4/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv150 := 0\n\tif 150 < len(data) {\n\t\tv150 = data[150]\n\t}\n\tv151 := 0\n\tif 151 < len(data) {\n\t\tv151 = data[151]\n\t}\n\tv152 := 0\n\tif 152 < len(data) {\n\t\tv152 = data[152]\n\t}\n\tv153 := 0\n\tif 153 < len(data) {\n\t\tv153 = data[153]\n\t}\n\tv154 := 0\n\tif 154 < len(data) {\n\t\tv154 = data[154]\n\t}\n\tv155 := 0\n\tif 155 < len(data) {\n\t\tv155 = data[155]\n\t}\n\tv156 := 0\n\tif 156 < len(data) {\n\t\tv156 = data[156]\n\t}\n\tv157 := 0\n\tif 157 < len(data) {\n\t\tv157 = data[157]\n\t}\n\tv158 := 0\n\tif 158 < len(data) {\n\t\tv158 = data[158]\n\t}\n\tv159 := 0\n\tif 159 < len(data) {\n\t\tv159 = data[159]\n\t}\n\tv160 := 0\n\tif 160 < len(data) {\n\t\tv160 = data[160]\n\t}\n\tv161 := 0\n\tif 161 < len(data) {\n\t\tv161 = data[161]\n\t}\n\tv162 := 0\n\tif 162 < len(data) {\n\t\tv162 = data[162]\n\t}\n\tv163 := 0\n\tif 163 < len(data) {\n\t\tv163 = data[163]\n\t}\n\tv164 := 0\n\tif 164 < len(data) {\n\t\tv164 = data[164]\n\t}\n\tv165 := 0\n\tif 165 < len(data) {\n\t\tv165 = data[165]\n\t}\n\tv166 := 0\n\tif 166 < len(data) {\n\t\tv166 = data[166]\n\t}\n\tv167 := 0\n\tif 167 < len(data) {\n\t\tv167 = data[167]\n\t}\n\tv168 := 0\n\tif 168 < len(data) {\n\t\tv168 = data[168]\n\t}\n\tv169 := 0\n\tif 169 < len(data) {\n\t\tv169 = data[169]\n\t}\n\tv170 := 0\n\tif 170 < len(data) {\n\t\tv170 = data[170]\n\t}\n\tv171 := 0\n\tif 171 < len(data) {\n\t\tv171 = data[171]\n\t}\n\tv172 := 0\n\tif 172 < len(data) {\n\t\tv172 = data[172]\n\t}\n\tv173 := 0\n\tif 173 < len(data) {\n\t\tv173 = data[173]\n\t}\n\tv174 := 0\n\tif 174 < len(data) {\n\t\tv174 = data[174]\n\t}\n\tv175 := 0\n\tif 175 < len(data) {\n\t\tv175 = data[175]\n\t}\n\tv176 := 0\n\tif 176 < len(data) {\n\t\tv176 = data[176]\n\t}\n\tv177 := 0\n\tif 177 < len(data) {\n\t\tv177 = data[177]\n\t}\n\tv178 := 0\n\tif 178 < len(data) {\n\t\tv178 = data[178]\n\t}\n\tv179 := 0\n\tif 179 < len(data) {\n\t\tv179 = data[179]\n\t}\n\tv180 := 0\n\tif 180 < len(data) {\n\t\tv180 = 
data[180]\n\t}\n\tv181 := 0\n\tif 181 < len(data) {\n\t\tv181 = data[181]\n\t}\n\tv182 := 0\n\tif 182 < len(data) {\n\t\tv182 = data[182]\n\t}\n\tv183 := 0\n\tif 183 < len(data) {\n\t\tv183 = data[183]\n\t}\n\tv184 := 0\n\tif 184 < len(data) {\n\t\tv184 = data[184]\n\t}\n\tv185 := 0\n\tif 185 < len(data) {\n\t\tv185 = data[185]\n\t}\n\tv186 := 0\n\tif 186 < len(data) {\n\t\tv186 = data[186]\n\t}\n\tv187 := 0\n\tif 187 < len(data) {\n\t\tv187 = data[187]\n\t}\n\tv188 := 0\n\tif 188 < len(data) {\n\t\tv188 = data[188]\n\t}\n\tv189 := 0\n\tif 189 < len(data) {\n\t\tv189 = data[189]\n\t}\n\tv190 := 0\n\tif 190 < len(data) {\n\t\tv190 = data[190]\n\t}\n\tv191 := 0\n\tif 191 < len(data) {\n\t\tv191 = data[191]\n\t}\n\tv192 := 0\n\tif 192 < len(data) {\n\t\tv192 = data[192]\n\t}\n\tv193 := 0\n\tif 193 < len(data) {\n\t\tv193 = data[193]\n\t}\n\tv194 := 0\n\tif 194 < len(data) {\n\t\tv194 = data[194]\n\t}\n\tv195 := 0\n\tif 195 < len(data) {\n\t\tv195 = data[195]\n\t}\n\tv196 := 0\n\tif 196 < len(data) {\n\t\tv196 = data[196]\n\t}\n\tv197 := 0\n\tif 197 < len(data) {\n\t\tv197 = data[197]\n\t}\n\tv198 := 0\n\tif 198 < len(data) {\n\t\tv198 = data[198]\n\t}\n\tv199 := 0\n\tif 199 < len(data) {\n\t\tv199 = data[199]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 150 := 0 if 150 < len ( data ) { v 150 = data [ 150 ] } v 151 := 0 if 151 < len ( data ) { v 151 = data [ 151 ] } v 152 := 0 if 152 < len ( data ) { v 152 = data [ 152 ] } v 153 := 0 if 153 < len ( data ) { v 153 = data [ 153 ] } v 154 := 0 if 154 < len ( data ) { v 154 = data [ 154 ] } v 155 := 0 if 155 < len ( data ) { v 155 = data [ 155 ] } v 156 := 0 if 156 < len ( data ) { v 156 = data [ 156 ] } v 157 := 0 if 157 < len ( data ) { v 157 = data [ 157 ] } v 158 := 0 if 158 < len ( data ) { v 158 = data [ 158 ] } v 159 := 0 if 159 < len ( data ) { v 159 = data [ 159 ] } v 160 := 0 if 160 < len ( data ) { v 160 = data [ 160 ] } v 161 := 0 if 161 < len ( data ) { v 161 = data [ 161 ] } v 162 := 0 if 162 < len ( data ) { v 162 = data [ 162 ] } v 163 := 0 if 163 < len ( data ) { v 163 = data [ 163 ] } v 164 := 0 if 164 < len ( data ) { v 164 = data [ 164 ] } v 165 := 0 if 165 < len ( data ) { v 165 = data [ 165 ] } v 166 := 0 if 166 < len ( data ) { v 166 = data [ 166 ] } v 167 := 0 if 167 < len ( data ) { v 167 = data [ 167 ] } v 168 := 0 if 168 < len ( data ) { v 168 = data [ 168 ] } v 169 := 0 if 169 < len ( data ) { v 169 = data [ 169 ] } v 170 := 0 if 170 < len ( data ) { v 170 = data [ 170 ] } v 171 := 0 if 171 < len ( data ) { v 171 = data [ 171 ] } v 172 := 0 if 172 < len ( data ) { v 172 = data [ 172 ] } v 173 := 0 if 173 < len ( data ) { v 173 = data [ 173 ] } v 174 := 0 if 174 < len ( data ) { v 174 = data [ 174 ] } v 175 := 0 if 175 < len ( data ) { v 175 = data [ 175 ] } v 176 := 0 if 176 < len ( data ) { v 176 = data [ 176 ] } v 177 := 0 if 177 < len ( data ) { v 177 = data [ 177 ] } v 178 := 0 if 178 < len ( data ) { v 178 = data [ 178 ] } v 179 := 0 if 179 < len ( data ) { v 179 = data [ 179 ] } v 180 := 0 if 180 < len ( data ) { v 180 = data [ 180 ] } v 181 := 0 if 181 < len ( data ) { v 181 = data [ 181 ] } v 182 := 0 if 182 < len ( data ) { v 182 = data [ 182 ] } v 183 := 0 if 183 < len ( data ) { v 183 = data [ 183 ] } 
v 184 := 0 if 184 < len ( data ) { v 184 = data [ 184 ] } v 185 := 0 if 185 < len ( data ) { v 185 = data [ 185 ] } v 186 := 0 if 186 < len ( data ) { v 186 = data [ 186 ] } v 187 := 0 if 187 < len ( data ) { v 187 = data [ 187 ] } v 188 := 0 if 188 < len ( data ) { v 188 = data [ 188 ] } v 189 := 0 if 189 < len ( data ) { v 189 = data [ 189 ] } v 190 := 0 if 190 < len ( data ) { v 190 = data [ 190 ] } v 191 := 0 if 191 < len ( data ) { v 191 = data [ 191 ] } v 192 := 0 if 192 < len ( data ) { v 192 = data [ 192 ] } v 193 := 0 if 193 < len ( data ) { v 193 = data [ 193 ] } v 194 := 0 if 194 < len ( data ) { v 194 = data [ 194 ] } v 195 := 0 if 195 < len ( data ) { v 195 = data [ 195 ] } v 196 := 0 if 196 < len ( data ) { v 196 = data [ 196 ] } v 197 := 0 if 197 < len ( data ) { v 197 = data [ 197 ] } v 198 := 0 if 198 < len ( data ) { v 198 = data [ 198 ] } v 199 := 0 if 199 < len ( data ) { v 199 = data [ 199 ]"
|
||||
},
|
||||
{
|
||||
"aliases": null,
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "438127626378632c03780d10603de32c",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 890,
|
||||
"line_start": 848,
|
||||
"symbol": "BigCompute [part 5/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv200 := 0\n\tif 200 < len(data) {\n\t\tv200 = data[200]\n\t}\n\tv201 := 0\n\tif 201 < len(data) {\n\t\tv201 = data[201]\n\t}\n\tv202 := 0\n\tif 202 < len(data) {\n\t\tv202 = data[202]\n\t}\n\tv203 := 0\n\tif 203 < len(data) {\n\t\tv203 = data[203]\n\t}\n\tv204 := 0\n\tif 204 < len(data) {\n\t\tv204 = data[204]\n\t}\n\tv205 := 0\n\tif 205 < len(data) {\n\t\tv205 = data[205]\n\t}\n\tv206 := 0\n\tif 206 < len(data) {\n\t\tv206 = data[206]\n\t}\n\tv207 := 0\n\tif 207 < len(data) {\n\t\tv207 = data[207]\n\t}\n\tv208 := 0\n\tif 208 < len(data) {\n\t\tv208 = data[208]\n\t}\n\tv209 := 0\n\tif 209 < len(data) {\n\t\tv209 = data[209]\n\t}\n\treturn len(data)\n}",
|
||||
"token_estimate": 191,
|
||||
"tokenized_korean_text": "} v 200 := 0 if 200 < len ( data ) { v 200 = data [ 200 ] } v 201 := 0 if 201 < len ( data ) { v 201 = data [ 201 ] } v 202 := 0 if 202 < len ( data ) { v 202 = data [ 202 ] } v 203 := 0 if 203 < len ( data ) { v 203 = data [ 203 ] } v 204 := 0 if 204 < len ( data ) { v 204 = data [ 204 ] } v 205 := 0 if 205 < len ( data ) { v 205 = data [ 205 ] } v 206 := 0 if 206 < len ( data ) { v 206 = data [ 206 ] } v 207 := 0 if 207 < len ( data ) { v 207 = data [ 207 ] } v 208 := 0 if 208 < len ( data ) { v 208 = data [ 208 ] } v 209 := 0 if 209 < len ( data ) { v 209 = data [ 209 ] } return len ( data ) }"
|
||||
}
|
||||
]
|
||||
186
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
186
crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
186
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
186
crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
186
crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json
vendored
Normal file
186
crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
33
crates/kebab-chunk/tests/fixtures/sample.c
vendored
Normal file
33
crates/kebab-chunk/tests/fixtures/sample.c
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define MAX_BUF 4096
|
||||
|
||||
typedef enum {
|
||||
OK = 0,
|
||||
ERR_PARSE,
|
||||
ERR_IO,
|
||||
} status_t;
|
||||
|
||||
typedef struct {
|
||||
int id;
|
||||
char name[64];
|
||||
status_t status;
|
||||
} record_t;
|
||||
|
||||
static int counter = 0;
|
||||
|
||||
int parse_record(const char *line, record_t *out) {
|
||||
if (line == NULL || out == NULL) return ERR_PARSE;
|
||||
return OK;
|
||||
}
|
||||
|
||||
void print_record(const record_t *r) {
|
||||
printf("[%d] %s (status=%d)\n", r->id, r->name, r->status);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
record_t r = { .id = 1, .name = "foo", .status = OK };
|
||||
print_record(&r);
|
||||
return 0;
|
||||
}
|
||||
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace kebab {
|
||||
namespace chunk {
|
||||
|
||||
class MdHeadingV1Chunker {
|
||||
public:
|
||||
MdHeadingV1Chunker() = default;
|
||||
~MdHeadingV1Chunker() = default;
|
||||
|
||||
std::string chunk_doc(const std::string& doc) {
|
||||
return doc;
|
||||
}
|
||||
|
||||
int operator()(int x) const {
|
||||
return x * 2;
|
||||
}
|
||||
|
||||
private:
|
||||
int counter_ = 0;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
T identity(T value) {
|
||||
return value;
|
||||
}
|
||||
|
||||
} // namespace chunk
|
||||
|
||||
void global_helper() {
|
||||
// free function in kebab namespace
|
||||
}
|
||||
|
||||
} // namespace kebab
|
||||
|
||||
int main() {
|
||||
kebab::chunk::MdHeadingV1Chunker c;
|
||||
return 0;
|
||||
}
|
||||
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
FROM rust:1.94-slim AS builder
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cargo build --release
|
||||
CMD ["/app/target/release/kebab"]
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user