Compare commits
383 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a283e56c5c | |||
| 47ef6532f7 | |||
| 03b0745e9d | |||
| e7cb20990a | |||
| bebf6e4ac7 | |||
| 736d791056 | |||
| 6c9c8df43e | |||
| 0263667684 | |||
| 4918983d9c | |||
| aeaa18a564 | |||
| c91ff909ce | |||
| 8dee610a97 | |||
| d71ed2516b | |||
| 095c9f37a2 | |||
| 16ddb1dfc3 | |||
| 72c99c452c | |||
| cbcae69abf | |||
| 7505645008 | |||
| e2ae9a4589 | |||
| 1dfab6dfc5 | |||
| fc5103642e | |||
| e03d03cb26 | |||
| 16aadea222 | |||
| a48c405826 | |||
| 21e02d8a93 | |||
| a64c31ee94 | |||
| ec96648956 | |||
| ecaf224381 | |||
| b1c5feb3f3 | |||
| ca8c0645fb | |||
| c7af6612b7 | |||
| acb4fa6c65 | |||
| 8bfa4ba76e | |||
| ad0ccf4ccf | |||
| b351523e51 | |||
| a48b055358 | |||
| 581e1d5d55 | |||
| c17d6e67a8 | |||
| af8fd34716 | |||
| 369aeb3d24 | |||
| 99f8cfa691 | |||
| d85d7348a5 | |||
| edac3ae737 | |||
| 6ec4e6809f | |||
| 1011c75fff | |||
| 8f7b6ee538 | |||
| 76841af7d3 | |||
| 980e20fd8d | |||
| cd79ed326c | |||
| 9dbf9d781d | |||
| 9501edd82b | |||
| 4b4a4c0b32 | |||
| f2cc325cf3 | |||
| b7e022a5e3 | |||
| bd7c4fd7ef | |||
| 4dcb4a45d6 | |||
| 6d86214060 | |||
| 6bbb8f854b | |||
| 2a4df4d48d | |||
| 16f3d6eef2 | |||
| fa89c7b561 | |||
| a4c81fed86 | |||
| 5b7c02fe13 | |||
| 88c5b83dea | |||
| 2619b7bff7 | |||
| e9b520216e | |||
| a8fd76499c | |||
| 0282a81c67 | |||
| f3587b7143 | |||
| 483b1ec06b | |||
| d279f343e7 | |||
| b56469f010 | |||
| 6ba8cb2c88 | |||
| afa8af0f88 | |||
| b9d20d23d1 | |||
| 86b4e1ebd0 | |||
| 825543549d | |||
| bcb8b93751 | |||
| 116b3e6377 | |||
| 69b53d1c97 | |||
| a271352e33 | |||
| cde4d75f6b | |||
| bddcd53688 | |||
| 2a207f9868 | |||
| cc31868d24 | |||
| 0df47febf0 | |||
| b12a616ab2 | |||
| 848b75c069 | |||
| 467a974901 | |||
| 098413922b | |||
| 695010ea7a | |||
| 8bb7c276d0 | |||
| 01a03463a6 | |||
| b6ad947378 | |||
| 1529e6d991 | |||
| 5ad1f98227 | |||
| a58cae2ff3 | |||
| 7a1dff1684 | |||
| 0988f66331 | |||
| 82e02aa4fe | |||
| db4af0cc72 | |||
| ab20202241 | |||
| a51e6395c0 | |||
| fe4c854673 | |||
| 1de3f4ffca | |||
| 7fbfec647d | |||
| ca8c83b1ba | |||
| 6c611990d8 | |||
| 166b1404e4 | |||
| 2d0168b7ab | |||
| 4afcaf96d2 | |||
| 16c4579399 | |||
| 40d7faee71 | |||
| a3bb2580bf | |||
| 2429189447 | |||
| d93b757cf1 | |||
| 571996938c | |||
| be79bdb83d | |||
| 4e76f103c1 | |||
| 4fd672193f | |||
| 1454321b12 | |||
| 649ec35108 | |||
| dece5e89fc | |||
| 3cb49f1f9b | |||
| f5ff823984 | |||
| b82eaec21a | |||
| 6daa43375b | |||
| 85efeeca3e | |||
| 2b4ba8e104 | |||
| b08941d6ab | |||
| 6bf4e82e62 | |||
| a0c7fa3d1a | |||
| ebc6bf45c4 | |||
| d8fdc815be | |||
| 9f2a56d091 | |||
| fe20be8195 | |||
| 028d9ad4ea | |||
| a3513c9110 | |||
| f2a76cfe94 | |||
| 8c56ef3010 | |||
| 5d9ea588ed | |||
| 53ec9b4dc5 | |||
| 21b52bc285 | |||
| 97fd895a10 | |||
| d13eb87401 | |||
| 26f3a7756c | |||
| 881f949fcb | |||
| c5de5f812b | |||
| f94e0c4a9b | |||
| 923b959610 | |||
| b63af20b72 | |||
| e8f44a57e3 | |||
| 4b4a8cbb3a | |||
| 4dc1c10be1 | |||
| bd86f61c9c | |||
| b134ae9dd5 | |||
| 597d8b70ad | |||
| b106120e93 | |||
| 43366b1b15 | |||
| 70507e94ca | |||
| 7bbdc89ae3 | |||
| 7c24734cc7 | |||
| 9a36a06f97 | |||
| 35c987df1c | |||
| d9ec7b8dc3 | |||
| 4e451c9f7c | |||
| 6482bf1321 | |||
| 5977c8cdf1 | |||
| 89d334a92b | |||
| 09333d0b05 | |||
| 685007789a | |||
| 445b096215 | |||
| 415227bf76 | |||
| f9dc0f749f | |||
| bef0c98867 | |||
| f8a4c79727 | |||
| f60304beb4 | |||
| 6a9551e0fa | |||
| 46e99470eb | |||
| 9b44e27dfe | |||
| 854a180365 | |||
| 5bba95fd71 | |||
| 2c7fa7142a | |||
| d9c7aabce1 | |||
| 10b0e2f4f2 | |||
| 28f513795e | |||
| 760eee89c8 | |||
| f763049923 | |||
| 8cf73d1f43 | |||
| a58ee10dfb | |||
| e674ff474b | |||
| 241ded59df | |||
| 436fd015a2 | |||
| d9acda517a | |||
| b4d9e60816 | |||
| 90726ab283 | |||
| 1d4e301e5e | |||
| 48197687b7 | |||
| c9e05941c5 | |||
| 4c5ccd5447 | |||
| b9ee09f176 | |||
| 4672cba6c6 | |||
| fd918a60ce | |||
| 9f003ef1cd | |||
| 8d81bc1071 | |||
| c2cd3a7ab7 | |||
| fb3952d54f | |||
| aeeff3635b | |||
| 9d7faab650 | |||
| bcd1e37dab | |||
| e7a4330798 | |||
| 574e1b1ca1 | |||
| c1e82cca92 | |||
| 2c05dbd0dd | |||
| 96766406aa | |||
| 710945c4b0 | |||
| d4395a306b | |||
| bd48baa19a | |||
| b02ac8200e | |||
| 336962715a | |||
| 1a224bf983 | |||
| a210bf5d52 | |||
| 429287f6cb | |||
| 08495eb425 | |||
| 98cf4e8a04 | |||
| 4030f04f37 | |||
| 7c27633df2 | |||
| 3712d005cc | |||
| 7c85de065a | |||
| a0ccc7b021 | |||
| a8fd6994d2 | |||
| 505b3889fb | |||
| 772575d8f0 | |||
| 00ffe9c792 | |||
| 681c48b2a3 | |||
| 546c1564b0 | |||
| 79ad6e376f | |||
| 6ffbe0a5a3 | |||
| ab3408cb49 | |||
| b807fd5aa5 | |||
| 93436f9eca | |||
| 11ce7847a1 | |||
| 1d88dccf8a | |||
| 1eb0bbecb3 | |||
| 44fbffff26 | |||
| 63aece3ea1 | |||
| 28a8bbeace | |||
| 52a97303dc | |||
| 71fb2cbcb3 | |||
| 85855ef596 | |||
| da25ce330b | |||
| 5bfea3c28b | |||
| b6756f8ce3 | |||
| 016f380428 | |||
| bf28a1e4d9 | |||
| 24221826ed | |||
| 8a2f7affa6 | |||
| f28a422f79 | |||
| c56242d04f | |||
| 17c48a0ee6 | |||
| 64a009314c | |||
| ddfe7ba099 | |||
| 104363a0db | |||
| 6188a50c1c | |||
| 94e6146013 | |||
| 12c7dc9efb | |||
| cd1d4fb807 | |||
| 7150c376bb | |||
| 6280abf2df | |||
| 192da45dbf | |||
| cf35f36f88 | |||
| ed34f2e03f | |||
| 624b44c46b | |||
| caf690dc72 | |||
| 1640ecf288 | |||
| 90e77631a8 | |||
| fa251db48f | |||
| 3114c31841 | |||
| 271329efbd | |||
| f2867540d2 | |||
| e118844256 | |||
| 41c5edc517 | |||
| d02149c010 | |||
| 0c69b9621b | |||
| 0d69d85757 | |||
| a67300317b | |||
| abb05ebc23 | |||
| 26fdc4f344 | |||
| 3f5e0e6e90 | |||
| 578a60e3bb | |||
| 64f518e08e | |||
| fa9f91ead4 | |||
| 9ee89c2a94 | |||
| 13a3361ba2 | |||
| 0def913abd | |||
| ff9d5f5f86 | |||
| 70a5068c0d | |||
| 93ddece111 | |||
| 67559fb3ce | |||
| d79e432916 | |||
| 0ee18149e7 | |||
| 8a68289499 | |||
| 6ac7fea7b9 | |||
| fe123c0c6d | |||
| 753b1ff5e5 | |||
| 8dcedc4b11 | |||
| 8781c6112b | |||
| 14197b5e02 | |||
| 584247f1ea | |||
| a0c0dca321 | |||
| 667495ae6a | |||
| 08d72a12e0 | |||
| 1969c8e3b5 | |||
| c6207d196e | |||
| 840c6c40a6 | |||
| b81574afa9 | |||
| 6beff35a2f | |||
| 75a4207aa1 | |||
| 86aa180ad7 | |||
| 802c573c07 | |||
| 438870ee25 | |||
| 192835e5bf | |||
| 1034de25a2 | |||
| d1560be80d | |||
| b2a2902e38 | |||
| 03cd41c48f | |||
| 926042049c | |||
| e0a29225da | |||
| b541567946 | |||
| a58d400abd | |||
| 8add684ffc | |||
| 7a90df1485 | |||
| 46f408dc0f | |||
| 49e60fb314 | |||
| 6bc7a83d3c | |||
| df3c5b8caf | |||
| 5051ea7534 | |||
| 88d7fbc182 | |||
| 0b7d8af759 | |||
| 9342b9543f | |||
| a8aa03042f | |||
| 9d4a60aac5 | |||
| 8ce7a911ee | |||
| 75c1c7b911 | |||
| b5c12ecb6f | |||
| a1192ce3b2 | |||
| 17ee400fd5 | |||
| 217dddb4ba | |||
| 308666dbd5 | |||
| 522ae7b8bc | |||
| 166e1ddfaf | |||
| 226ce8b744 | |||
| 22d4161728 | |||
| 51004ac593 | |||
| 8996e73282 | |||
| 22dba09857 | |||
| aaa90b1754 | |||
| 077f92f41e | |||
| 5ce7f60932 | |||
| 47857b2622 | |||
| 1e4cff879b | |||
| 2d7a566624 | |||
| 813bdd1a16 | |||
| ff1bedbef5 | |||
| 30e03c7a12 | |||
| 2ce6ae47c5 | |||
| ebc4ef2eea | |||
| 7bda1509b7 | |||
| 61d48d67a3 | |||
| f4c840b994 | |||
| 15244b7494 | |||
| a7f7ab9f93 | |||
| 1b19e33a4f | |||
| 9c9e391b15 | |||
| f95cd55484 | |||
| ab288135e9 | |||
| c19aa006d0 | |||
| f1a4f67e12 | |||
| 6463c52827 | |||
| 2559d0d95a | |||
| 4524830306 | |||
| 8cdd3903c7 | |||
| 8b89961ada |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
.superpowers/
|
||||
.worktrees/
|
||||
.claude/
|
||||
.omc/
|
||||
/target
|
||||
**/*.rs.bk
|
||||
Cargo.lock.bak
|
||||
|
||||
64
CLAUDE.md
64
CLAUDE.md
@@ -81,11 +81,71 @@ Bump 자체는 단순 minor / patch 한 줄 수정 (`Cargo.toml` workspace `vers
|
||||
Release 절차:
|
||||
|
||||
1. `gitea-release v<X.Y.Z>` (gitea-ops skill) 으로 tag + push + release notes.
|
||||
2. release notes 는 사용자 도그푸딩에 영향 가는 surface 변경 위주 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등.
|
||||
3. 프리-1.0 (`0.x.y`) 단계: minor bump 시 wire schema additive / surface 변경 누적, patch bump 시 bug fix only.
|
||||
2. release notes 는 사용자 도그푸딩에 영향이 가는 surface 변경을 위주로 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등 — 다룬다. 이때 추가된 기능과 변경사항은 유저가 이해할 수 있도록 친절하고 자세하게 풀어서 설명해야 하며, 단순히 commit subject 를 나열하는 형태로 끝내면 안 된다. 필요하다면 도그푸딩이나 테스트 결과도 함께 적어 둔다.
|
||||
3. 프리-1.0 (`0.x.y`) 단계 bump 규칙 — **기능(behavior) 또는 인터페이스(interface) 변경 여부**로 판정:
|
||||
- **minor bump** (`0.x.0`): 기능 또는 인터페이스에 *실질적* 변경이 있을 때. 인터페이스 = 신규/변경/삭제된 CLI subcommand·flag, config 키, wire schema 의 **breaking** 변경, 임베딩/검색/RAG 등 사용자가 받는 **결과·동작**의 변화, V00X migration, frozen 설계 변경. 기능 = 새 source 형식·검색 모드·백엔드 등 *할 수 있는 일*의 추가/변경.
|
||||
- **patch bump** (`0.x.y`): 기능·인터페이스 변경이 **없을** 때. bug fix, 내부 refactor, 성능 개선, 로깅/진행표시 등 **관측성(observability) 개선**, **additive-only wire 변경**(backward-compat 신규 필드/이벤트라 기존 소비자 무영향), 문서. ← 즉 "결과가 같고 새 명령/플래그/config 도 없으면 patch".
|
||||
- 경계 예: 진행 로그에 phase/파일명 추가 + additive wire 이벤트(asset_phase) = **patch** (검색·색인 결과 불변, 새 명령/플래그/config 없음). arctic 임베더 provider + 신규 config 키 = **minor** (인터페이스 추가). 별칭 기능 제거 + migration = **minor** (동작·인터페이스 변경).
|
||||
|
||||
**bump 시점 = release 시점 같은 commit**. 즉 commit `chore: bump version 0.x → 0.y` 직후 같은 commit 에 tag. v0.1.0 (`2319206`) 처럼 bump 없이 tag 만 찍는 패턴은 후속 release 가 대상 commit 을 헷갈리게 함 — pre-release snapshot 은 SHA reference 로 충분.
|
||||
|
||||
## Dogfood trigger
|
||||
|
||||
도그푸딩 = 새 binary 를 실제 KB / 실제 query 로 돌려보고 user-visible 동작이 spec 의 의도와 일치하는지 확인하는 종단 검증. unit / integration test 가 못 잡는 회귀 (UX 어색함, performance regression, 의외의 token 처리, embedding drift, RAG hallucination) 를 catch 함. PR 머지 전 또는 머지 직후 release notes 작성 전에 실시.
|
||||
|
||||
### 도그푸딩이 필요한 시점
|
||||
|
||||
다음 트리거 중 하나라도 hit 시 도그푸딩 필수. **모두 release-level 또는 user-visible behavior 변경 임**.
|
||||
|
||||
**Schema / migration**:
|
||||
- 신규 V00X migration (예: V007 trigram, V008 OCR mirror, V009 morphological) — `corpus_revision` cascade + auto-backfill 정책의 사용자 경험 확인.
|
||||
- frozen design contract 변경 (`docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §X 갱신) — verbatim CI diff-check 외의 user-visible side effect 확인.
|
||||
|
||||
**Wire schema / CLI surface**:
|
||||
- 신규 `--json` 필드, exit code 변경, 또는 schema major bump (v1 → v2) — agent / external integration 의 호환성 검증.
|
||||
- `kebab` 의 subcommand 또는 flag 추가/삭제/rename — agent skill / muscle memory 영향.
|
||||
|
||||
**Search / RAG behavior**:
|
||||
- FTS5 tokenizer / chunker / embedder 모델 / RAG prompt template 변경 — 같은 query 의 hit ordering, snippet, RAG citation 패턴이 자연스럽게 변화하는지.
|
||||
- score gate, RRF fusion ratio, NLI threshold 같은 ranking 파라미터 default 변경.
|
||||
|
||||
**Performance**:
|
||||
- ingest / search / ask latency 의 의도된 변화 (예: lindera tokenize, OCR 추가, multi-hop RAG) — actual wall-clock 측정 + release notes 에 명시.
|
||||
- 대용량 KB (수천 doc / 만 chunk) 의 first-boot eager backfill 시간이 사용자 hang 인지에 영향 안 가는지.
|
||||
|
||||
**Language / locale**:
|
||||
- 한국어 / 일본어 / 중국어 lexical 동작 변경 (V007 trigram, V009 morphological, future N-gram).
|
||||
- 영어 substring 매칭 같은 ad-hoc 부산물의 회귀.
|
||||
|
||||
**File / asset surface**:
|
||||
- 신규 source 형식 (PDF OCR, audio, video) — extractor / chunker 의 실제 corpus 동작.
|
||||
- `.kebabignore` / `_external/` 같은 workspace 정책 변경.
|
||||
|
||||
**Release-level**: 위 트리거 중 하나가 hit 되어 `Cargo.toml` workspace `version` bump 가 필요하면, **bump commit 이전에 도그푸딩 evidence 가 HOTFIXES + release notes 에 명시** 되어 있어야 함. evidence 없는 release 는 사용자가 "왜 bump 했는지" 추적 불가.
|
||||
|
||||
### 도그푸딩 데이터 보관소
|
||||
|
||||
모든 도그푸딩 source 문서 + KB state + 로그는 `/build/dogfood/` 한 디렉토리에 누적 보관한다. **분류는 문서 의미 / 종류 / 형식 기준만** — kebab version, 생성 시점, scenario name 같은 prefix 금지 (`v0.20.1-dogfood/`, `dogfood-v018/` 같은 디렉토리 신설 X). 자세한 layout 은 `/build/dogfood/README.md` 참조.
|
||||
|
||||
- `/build/dogfood/corpus/` — source 문서 (read-only). format 별 분류 (`markdown/`, `code/`, `html/`, `images/`, `pdf/`, `manifest/`, `resources/`) + 각 format 내 category 별 (예: `markdown/{korean,english,bilingual,tech-docs,coding-md-corpus,topics,notes,edge-cases}`, `code/{rust,python,...}`). 새 fixture 는 적절한 category subdir 에 추가.
|
||||
- `/build/dogfood/kb/` — 도그푸딩 run 의 KB 출력 (SQLite + LanceDB + assets + models). 매 run 마다 reset 가능. 별 KB 디렉토리 신설 X.
|
||||
- `/build/dogfood/logs/` — 누적 실행 로그 (ndjson + stderr + summary).
|
||||
- `/build/dogfood/config.toml` — canonical 도그푸딩 config (없으면 `kebab init` 후 path override).
|
||||
- `/build/dogfood/_archive/` — regeneratable stale state (이전 run 의 sqlite/lancedb, XDG snapshot). 디스크 압박 시 wipe 가능.
|
||||
|
||||
`/tmp/kebab-smoke/`, `/tmp/kebab-*`, `/build/cache/dogfood*`, `/home/altair823/KnowledgeBase`, `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.local/state/kebab/` 같은 위치 신규 사용 금지 — 모두 `/build/dogfood/` 로 일관. ad-hoc fixture 가 필요하면 `corpus/<format>/<category>/` 에 추가.
|
||||
|
||||
### 도그푸딩 결과 기록
|
||||
|
||||
도그푸딩 evidence 는 두 곳에 cascade:
|
||||
|
||||
1. **`tasks/HOTFIXES.md` 의 dated entry** — 시나리오 별 hit count 표 + snippet evidence + known limitation. 미래에 spec drift 의심 시 git history 외 immediate reference 가 됨.
|
||||
2. **`docs/release-notes/v<X.Y.Z>-draft.md`** (또는 gitea release body) — 사용자 도그푸딩 영향에 영향이 가는 surface 변경을 4 단락 (변경 사실 / trade-off / mitigation / upgrade 절차) 으로 풀어서 설명. evidence link.
|
||||
|
||||
도그푸딩 단계에서 *발견된 bug* (spec 과 실제 동작의 mismatch, performance regression, UX 어색함) 는 즉시 fix → re-dogfood. fix 가 별 PR 으로 빠지면 머지 후 HOTFIXES 에 dated entry.
|
||||
|
||||
DOGFOOD scenario catalog (§1~§13) 는 `docs/DOGFOOD.md`. 신규 release 마다 §관련 section 의 scenario list 갱신 + 신규 scenario 추가.
|
||||
|
||||
## Naming + paths
|
||||
|
||||
- Crate prefix: `kebab-` (kebab-case package, `kebab_` snake_case in Rust modules).
|
||||
|
||||
1442
Cargo.lock
generated
1442
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
121
Cargo.toml
121
Cargo.toml
@@ -2,17 +2,17 @@
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/kebab-core",
|
||||
"crates/kebab-parse-types",
|
||||
"crates/kebab-config",
|
||||
"crates/kebab-source-fs",
|
||||
"crates/kebab-parse-md",
|
||||
"crates/kebab-normalize",
|
||||
"crates/kebab-chunk",
|
||||
"crates/kebab-store-sqlite",
|
||||
"crates/kebab-store-vector",
|
||||
"crates/kebab-search",
|
||||
"crates/kebab-embed",
|
||||
"crates/kebab-embed-local",
|
||||
"crates/kebab-embed-candle",
|
||||
"crates/kebab-embed-ollama",
|
||||
"crates/kebab-llm",
|
||||
"crates/kebab-llm-local",
|
||||
"crates/kebab-rag",
|
||||
@@ -24,6 +24,7 @@ members = [
|
||||
"crates/kebab-tui",
|
||||
"crates/kebab-mcp",
|
||||
"crates/kebab-parse-code",
|
||||
"crates/kebab-nli",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -31,7 +32,95 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.11.1"
|
||||
version = "0.26.2" # v0.26.2 — ingest 설정 변경 시 영향 자산 자동 재색인: ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/[ingest.code])의 결정적 서명을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 해당 설정 변경 시 `--force-reingest` 없이 영향 자산만 자동 재색인. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout 등)은 제외(과도 무효화 회피). doc_id 는 base parser_version 으로 안정 유지(orphan churn 회피). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정) → patch. — CLAUDE.md §Release
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
# informational (function size), or carry intentional truncation we accept
|
||||
# (numeric casts in tokenizer/ONNX inputs, hash modular reduction, etc).
|
||||
[workspace.lints.clippy]
|
||||
pedantic = { level = "warn", priority = -1 }
|
||||
# Intentional u32 ↔ i64 casts in kebab-nli (ONNX i64 inputs from tokenizer u32 ids).
|
||||
# u64 ↔ usize across kebab-store-sqlite row counts. Wide truncation is auditable
|
||||
# at use site, not lint-wide.
|
||||
cast_possible_truncation = "allow"
|
||||
cast_possible_wrap = "allow"
|
||||
cast_sign_loss = "allow"
|
||||
cast_precision_loss = "allow"
|
||||
# Doc markdown style is cosmetic; we run rustdoc on demand.
|
||||
doc_markdown = "allow"
|
||||
missing_errors_doc = "allow"
|
||||
missing_panics_doc = "allow"
|
||||
# Informational only — splitting a long pipeline function isn't always cleaner.
|
||||
too_many_lines = "allow"
|
||||
# `Foo::default()` is concise and idiomatic here; `<Foo as Default>::default()`
|
||||
# adds noise without surfacing intent.
|
||||
default_trait_access = "allow"
|
||||
# Module name prefix on public items keeps the wire/log surface readable
|
||||
# (`refusal_reason::no_chunks` etc).
|
||||
module_name_repetitions = "allow"
|
||||
# We use `#[must_use]` deliberately on public results, not blanket.
|
||||
must_use_candidate = "allow"
|
||||
# `String` arg sometimes signals "I'll consume this" — let signature decide.
|
||||
needless_pass_by_value = "allow"
|
||||
# Idiomatic single-line bindings stay; let-else expansion isn't always clearer.
|
||||
manual_let_else = "allow"
|
||||
# `use` after `let` is a common kebab pattern (scoped imports next to use site).
|
||||
items_after_statements = "allow"
|
||||
# Naming pairs like `chunk_id` / `chunks_id` are intentional domain terms.
|
||||
similar_names = "allow"
|
||||
# `iter.map(format!).collect::<String>()` is idiomatic when the per-element
|
||||
# string is genuinely independent — `fold` only wins on accumulation patterns.
|
||||
format_collect = "allow"
|
||||
# Exhaustive `match` with explicit variant arms (vs `_`) catches future
|
||||
# variant additions at compile time (kebab core's `RefusalReason` pattern).
|
||||
match_wildcard_for_single_variants = "allow"
|
||||
# Copy types under `&self` keep call-site discipline; auto-deref noise > tiny perf gain.
|
||||
trivially_copy_pass_by_ref = "allow"
|
||||
# `unnecessary_wraps` flags helpers that could drop `Result`, but keeping the
|
||||
# Result allows future error variants without churning callers.
|
||||
unnecessary_wraps = "allow"
|
||||
# NLI score / RRF fusion / similarity threshold comparisons are intentional —
|
||||
# floats live in the `[0, 1]` band and are compared with explicit thresholds.
|
||||
float_cmp = "allow"
|
||||
# File-extension dispatch is keyed on ASCII conventions; case sensitivity
|
||||
# is part of the spec for `.md`, `.pdf`, etc.
|
||||
case_sensitive_file_extension_comparisons = "allow"
|
||||
# Config / opts structs intentionally bundle boolean flags (ingest options,
|
||||
# search modes, etc) — splitting them into enums would obscure the wire shape.
|
||||
struct_excessive_bools = "allow"
|
||||
# `bytecount` crate would be a new dep just for one-off ASCII counts.
|
||||
naive_bytecount = "allow"
|
||||
# `#[ignore]` annotations on tests document via the test name + nearby comment.
|
||||
ignore_without_reason = "allow"
|
||||
# `format!` push patterns are a hot path for kebab-tui's progressive rendering;
|
||||
# `write!` rewrite needs a verified-equal benchmark before swapping.
|
||||
format_push_string = "allow"
|
||||
# Builder-style `with_*` methods return `Self`; the existing `#[must_use]`
|
||||
# discipline lives on aggregate constructors, not every chainable setter.
|
||||
return_self_not_must_use = "allow"
|
||||
# Match arms grouped by side-effect over body equality (e.g. snake_case wire
|
||||
# label tables) — fanning them out keeps adding a new variant trivial.
|
||||
match_same_arms = "allow"
|
||||
# Remaining style-only warnings: trailing `continue` is sometimes clearer than
|
||||
# rewriting, `_x` underscored bindings document intent at the use site, and
|
||||
# `!(a == b)` reads better than `a != b` when paired with a complementary check.
|
||||
needless_continue = "allow"
|
||||
used_underscore_binding = "allow"
|
||||
nonminimal_bool = "allow"
|
||||
# Other one-off cosmetic items: large literal formatting, doc link quoting,
|
||||
# `Clone::clone_from` swap, `str::replace` chaining, `Iterator::any` ergonomics.
|
||||
unreadable_literal = "allow"
|
||||
many_single_char_names = "allow"
|
||||
doc_link_with_quotes = "allow"
|
||||
assigning_clones = "allow"
|
||||
collapsible_str_replace = "allow"
|
||||
trivial_regex = "allow"
|
||||
elidable_lifetime_names = "allow"
|
||||
range_plus_one = "allow"
|
||||
explicit_iter_loop = "allow"
|
||||
implicit_hasher = "allow"
|
||||
ref_option = "allow"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
@@ -54,6 +143,7 @@ proptest = "1"
|
||||
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
|
||||
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
|
||||
lru = "0.12"
|
||||
lopdf = "0.32"
|
||||
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
|
||||
# in its default set (which also pulls `hf-hub` for first-run model
|
||||
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
|
||||
@@ -94,6 +184,31 @@ tree-sitter-rust = "0.24"
|
||||
tree-sitter-python = "0.25.0"
|
||||
tree-sitter-typescript = "0.23.2"
|
||||
tree-sitter-javascript = "0.25.0"
|
||||
# Go grammar for code ingest (kebab-parse-code, p10-1C-Go).
|
||||
tree-sitter-go = "0.25.0"
|
||||
# JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK).
|
||||
tree-sitter-java = "0.23.5"
|
||||
tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1 (ts 0.26 compat)
|
||||
# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
|
||||
tree-sitter-c = "0.24.2"
|
||||
tree-sitter-cpp = "0.23.4"
|
||||
# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
|
||||
# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
|
||||
# stays single-versioned across the workspace. ort `default-features=false`
|
||||
# drops the bundled binary downloader (fastembed already provides one);
|
||||
# tokenizers `default-features=false, onig` swaps the default `esaxx` regex
|
||||
# backend for `onig` so the build doesn't need libstdc++ headers (verified
|
||||
# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
|
||||
# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
|
||||
# pure-Rust TLS stack.
|
||||
ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
|
||||
tokenizers = { version = "0.21", default-features = false, features = ["onig"] }
|
||||
hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
|
||||
ndarray = "0.16"
|
||||
# Korean morphological tokenizer (FTS v0.20.x, §6.1). lindera-ko-dic bundles
|
||||
# the KO-DIC dictionary as an embedded blob via the embed-ko-dic feature.
|
||||
lindera = "3"
|
||||
lindera-ko-dic = "3"
|
||||
|
||||
# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
|
||||
# behavior are unchanged — only DWARF debug info is reduced (line
|
||||
|
||||
104
HANDOFF.md
104
HANDOFF.md
@@ -4,7 +4,7 @@
|
||||
|
||||
## 한 줄 요약
|
||||
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF 모두 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공 — 사용자가 `?` 로 ask, `/` 로 search, Library Enter / Search `i` 로 inspect, Search `g` 로 editor jump. 다음 후보 = P9-5 (desktop tauri) 또는 보류 중인 P8 (audio) 의 시스템 dep brainstorm.
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체 머지 완료 (현재 **v0.18.0**). `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin / C / C++) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. **v0.17.0 cut (2026-05-24)**: 한국어 trigram FTS5 tokenizer (PR #159) + C typedef alias unit (PR #160) + `code_lang_chunk_breakdown` additive (PR #161). **v0.17.1 cut (2026-05-25)**: 확장 도그푸딩 후 `[models.llm] request_timeout_secs` config 노브 (PR #162) + sudo 없이 ollama 설치 + `kebab ask --stream` UX 권장 docs (PR #163). **v0.17.2 cut (2026-05-25)**: v0.17.1 post-dogfood polish — `[image.ocr] request_timeout_secs` 별 노브 (PR #164, v0.17.1 미진행 closure) + `heading_path` FTS5 column filter 로 text-only 매칭 + raw-mode escape hatch (PR #165, 2026-05-24 v0.17.0 trigram entry 의 JSON 노이즈 closure). **v0.18.0 cut (2026-05-26)**: fb-41 multi-hop RAG + NLI verification ship (PR #176-180) — `kebab ask --multi-hop` 의 decompose → decide → synthesize loop + mDeBERTa-v3 XNLI ONNX post-synthesize entailment 검사. dogfood S7 caffeine hallucination 의 silent LLM-self-judge ceiling 해결 (nli_score 0.0035 graceful refuse). 추가 `chore: workspace-wide cleanup + post-PR9 refactor` (PR #181) — clippy::pedantic baseline + H1 config wiring + 9 new tests. 자세한 영향은 [v0.17.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.0) + [v0.17.1 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.1) + [v0.17.2 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.2) + [v0.18.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.18.0). 구조적으로 남은 component 는 P9-5 (desktop tauri) 하나뿐, P8 (audio) 는 사용자 보류.
|
||||
|
||||
## Phase 로드맵
|
||||
|
||||
@@ -17,21 +17,39 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료.
|
||||
| **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
|
||||
| **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
|
||||
| **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
|
||||
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
|
||||
| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
|
||||
| **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
|
||||
| **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
|
||||
| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, tree-sitter-rust, `code-rust-ast-v1` — v0.7.0), **1B 🟡 PR 오픈** (Python `code-python-ast-v1` + TypeScript `code-ts-ast-v1` + JavaScript `code-js-ast-v1` — 3 언어 dogfooding 가능, v0.8.0 대기) |
|
||||
| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** |
|
||||
|
||||
P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
## Component 카운트
|
||||
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. v0.18.0 cut 시점에 fb-41 multi-hop RAG + NLI verification (PR-9 5 sub-PRs) 가 P9 추가 component 로 ship — `kebab-nli` 신규 crate (mDeBERTa-v3 XNLI ONNX verifier) + `kebab-rag::ask_multi_hop` (decompose/decide/synthesize loop + step 8.5 NLI hook). per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
|
||||
## 머지 후 발견된 버그 / 결정 (요약)
|
||||
|
||||
- **candle 임베딩 백엔드 다변화** (2026-06-01, Track 1, v0.22.0): `provider = "candle"` opt-in 추가 — 같은 `multilingual-e5-large` 모델을 순수 Rust(candle)로 돌려 듀얼소켓 NUMA 서버의 onnxruntime 48-스레드 double-free 를 회피. `[models.embedding].num_threads`(+env `KEBAB_EMBED_THREADS`)로 CPU 스레드 캡. fastembed default 동작·벡터 불변, `embedding_version` 유지(재색인 0). Phase 0 스파이크 패리티 cosine 1.000000. 상세 HOTFIXES 동일 일자.
|
||||
- **config 마이그레이션** (2026-05-31, PR #198): `kebab config migrate` 추가 — 기존 config.toml 에 빠진 섹션을 주석과 함께 채우고 deprecated 정리(멱등·`.bak`·dry-run, 값/주석 보존). `schema_version` 1→2, `init` 도 섹션 주석 포함, doctor 에 `config_migration` 체크. 상세 HOTFIXES 동일 일자.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만:
|
||||
|
||||
- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`.
|
||||
- **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`.
|
||||
- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
|
||||
- **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`.
|
||||
- **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`.
|
||||
- **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. (1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29).
|
||||
- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
|
||||
- **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
|
||||
- **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 의 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
|
||||
- **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).
|
||||
- **2026-05-25 v0.17.1 post-dogfood (PR #162 + #163)** — 확장 도그푸딩 (16 GB CPU only, gemma4:e4b 시도) 에서 발견된 두 follow-up 한 묶음. (1) `crates/kebab-llm-local/src/ollama.rs::REQUEST_TIMEOUT` hard 300s → `[models.llm] request_timeout_secs` config + env override (additive, default 300, `=0` 은 disable 아닌 "즉시 timeout" 이라 doc 명시). (2) README + SMOKE 에 sudo / systemd 없이 ollama 설치 + ≤4B Q4 권장 모델 + `kebab ask --stream` UX 권장 docs. additive only — 옛 config / wire 호환. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25).
|
||||
- **2026-05-24 v0.17.0 PR-C `code_lang_chunk_breakdown` additive (closure of 2026-05-22 LOW)** — `schema.v1.stats` 에 chunk 수 집계 신규 키. 기존 `code_lang_breakdown` (doc count) 와 sister. 또 기존 두 필드 JSON schema description 의 "chunk count" 오기재 → "doc count" 로 정정. wire additive — schema_version bump 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-C).
|
||||
- **2026-05-24 v0.17.0 PR-B C typedef alias unit (closure of 2026-05-21)** — `kebab-parse-code::c::extract_blocks` 의 `type_definition` 분기로 inner anonymous struct/enum/union → declarator 의 typedef alias 이름으로 synthetic unit 방출. `PARSER_VERSION code-c-v1` → `code-c-v2` bump + 같은-asset/다른-doc_id 케이스용 `purge_workspace_path_for_parser_bump` cascade (`stale_chunk_ids_for_workspace_path_except_doc_id` + `purge_document_at_workspace_path_except_doc_id` helper 신규). 사용자 작업 불필요 (다음 ingest 가 자동 재처리). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-B).
|
||||
- **2026-05-24 v0.17.0 PR-A 한국어 trigram tokenizer 채택 (closure of 2026-05-22 한국어 lexical)** — `chunks_fts` 가 FTS5 `unicode61` → `trigram` 으로 V007 migration (자동 backfill, re-ingest 불필요). `lexical.rs::build_match_string` trigram-aware 재설계 — multi-token 한국어 query (`해시 충돌`) 가 whole-phrase 후보로 hit, 한영 혼합 (`Rust 충돌은`) 도 OR-combined. 2자 이하 query 는 0-hit + CLI/TUI/wire `hint` 안내. 영어 lexical 도 substring 매칭으로 바뀜 (recall ↑ / 단어 경계 ↓). `kebab.sqlite` 크기 ~2-5배 증가 (trigram index). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24).
|
||||
- **2026-05-22 P10 종합 도그푸딩 round 2 (한국어 lexical 검색 한계)** — `kebab search --mode lexical` 의 한국어 query 가 FTS5 `unicode61` 토크나이저에서 거의 0 hit (어절 단위 토큰화 → 부분 매칭 불가). 기본 hybrid 모드는 `multilingual-e5-small` vector 가 carry 해 한국어 검색 정상. **closure**: 위 2026-05-24 v0.17.0 entry.
|
||||
- **2026-05-20 P10-1B (Rust 1A symbol path 비일관 + expression-level 함수 미방출)** — (a) Rust `code-rust-ast-v1` 은 file-scope nesting 만 (workspace path prefix 없음), 1B 의 Python/TypeScript/JavaScript 는 workspace 경로 → module path prefix 사용 (비일관 수용, retrofit = chunker_version bump + reindex 필요, 사용자 명시 요청까지 보류); (b) TS/JS 의 `const foo = () => {...}` 같은 expression-level 함수는 `<top-level>` glue 로 처리됨 (declaration-level 단위만 1B 1차 범위). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20) 두 항목.
|
||||
- **2026-05-19 P10-1A-2 (code_rust_ast_v1.rs + SourceType)** — `AST_CHUNK_MAX_LINES` 상수가 `IngestCodeCfg.ast_chunk_max_lines` 를 읽지 않고 모듈 상수 200 고정 (Chunker trait 이 per-medium config 미노출); `SourceType::Code` variant 부재로 code 파일이 `SourceType::Note` 로 분류됨 — 두 항목 모두 `tasks/HOTFIXES.md` (2026-05-19) 에 기록.
|
||||
- **2026-05-07 fb-26 (progress.rs)** — `Aborted` unconditional writeln (TTY duplicate) + `Completed` TTY no summary fixed; `KEBAB_PROGRESS=plain` env + quiet suppression added
|
||||
@@ -81,13 +99,66 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
## 다음 task 후보
|
||||
|
||||
- **P9-2 TUI search** — `App.search` slot 채움. Library 의 `/` 가 enable 됨.
|
||||
- **P9-3 TUI ask** — `App.ask` slot 채움. `?` enable.
|
||||
- **P9-4 TUI inspect** — `App.inspect` slot 채움. `Enter` enable.
|
||||
- **P9-5 desktop tauri** — 별도 분기. PDF citation rendering UI 가치 큼.
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 후순위.
|
||||
구조적으로 미완인 component 는 P9-5 하나뿐. 나머지는 도그푸딩 follow-up (아래 "P10 dogfooding 백로그") 또는 사용자 결정 대기.
|
||||
|
||||
P9-2/3/4 는 P9-1 의 parallel-safety contract (sub-state slot 패턴) 덕에 병렬 진행 가능 — 같은 `App` 손대지 않음.
|
||||
- **P9-5 desktop tauri** — 마지막 남은 P9 component. `kebab-desktop` crate + Tauri 앱, 별도 분기. PDF citation rendering UI 가치 큼. 사용자 우선순위 (P9 우선 · 책/PDF 위주) 와 부합.
|
||||
- **P10 도그푸딩 round 2 follow-up** — ✅ v0.17.0 cut (2026-05-24) 으로 세 항목 모두 closure (한국어 trigram PR-A + C typedef alias PR-B + code_lang_chunk_breakdown additive PR-C). 상세 cross-link: 아래 "P10 dogfooding 백로그" 절 + `tasks/HOTFIXES.md` (2026-05-24 PR-A/B/C).
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 보류.
|
||||
- **fb-41 multi-hop reasoning** — ⏳ 미구현, XL, eval 인프라 선행 + brainstorm 필요.
|
||||
- **Rust symbol path retrofit** — Rust `code-rust-ast-v1` symbol 이 file-scope-only (1B+ 는 module prefix). `code-rust-ast-v2` bump + Rust corpus re-ingest 비용 → 사용자 명시 요청까지 보류. HOTFIXES `2026-05-20`.
|
||||
|
||||
### v0.20.0 sub-item 1 (PDF scanned OCR) 머지 후 priorities (2026-05-28, 사용자 결정)
|
||||
|
||||
PR #189 (2026-05-28 머지, commit `09333d0`) 으로 PDF scanned OCR (qwen2.5vl:3b vision LLM) + 4 round bugfix (#2/#3/#4/#6/#7/#9/#10/#11/#13/#14) + ingest log feature 가 main 으로 진입. 다음 작업 순서 = **C → B → A → G**.
|
||||
|
||||
- **C — 한국어 morphological tokenizer (Bug #8 follow-up)** ✅ **v0.20.1 머지 완료**.
|
||||
- V007 trigram 의 ≥3 char query 제약 (HOTFIXES `2026-05-22`) — '한국' 같은 2-char 한국어 query 0 hit → V009 migration + lindera-ko-dic tokenizer + tokenized_korean_text column + first-boot eager backfill 으로 해소. branch `feat/korean-morphological-tokenizer` (8 commit + 5 follow-up).
|
||||
- scope: search index 재빌드 cascade (corpus_revision bump) + V007 trigram 보존 (backward-compat).
|
||||
- 사용자 surface: `kebab search` 의 한국어 2자 query ('한국', '서울') 매칭. README + SKILL + release notes 반영.
|
||||
|
||||
- **B — OCR dense page coverage** ⏳ C 다음.
|
||||
- metro-korea.pdf page 8/13 timeout (180s, dense newspaper article). vision LLM 의 output token 과대 → 정상 timeout.
|
||||
- 가능한 path: (a) per-page `max_pixels` 동적 조정 (high-resolution page 만 축소), (b) column-level sub-region OCR (newspaper layout 분할 후 OCR call 분리), (c) model upgrade (qwen2.5vl:7b — Ollama 모델 변경 + max_pixels trade-off), (d) OCR timeout 점진 축소 (180s → 120s → 90s) — round 마다 p90 측정 후.
|
||||
- mojibake.pdf `pdf_ocr_pages: 0` (round 1 부터 동일) — text-detect path fallback 강화 검토.
|
||||
- 별 sub-item.
|
||||
|
||||
- **A — v0.20 의 deferred sub-items (frozen design contract)** ⏳ B 다음.
|
||||
- **sub-item 2** — Multi-region image dispatch (`OcrText.regions` bbox 분리) — image OCR + PDF column-aware OCR.
|
||||
- **sub-item 3** — PDF normalize integration (`ParsedPdfPage` production caller + `build_canonical_document_from_pdf_pages` + cross-page reference graph).
|
||||
- **TODO #4** — Per-page image / table extraction (PDF figure / table extract).
|
||||
- **TODO #5** — Enricher trait 도입 — OCR + caption 의 `Extractor` trait 통합 (post-extract enrichment 의 generalization).
|
||||
- 각 sub-item 별 spec/plan/executor cycle.
|
||||
|
||||
- **G — v0.20.1 patch release + release notes** ⏳ A 머지 후 (또는 C/B 시점에 따라 조기 cut).
|
||||
- CLAUDE.md release 룰 — sub-item 1 base + bugfix1-4 + log feature + logging r2 누적 → minor surface 변경 다수 + wire schema additive minor + config 신규 → **v0.20.1 patch bump + release notes**.
|
||||
- 핵심 surface (사용자 도그푸딩 가이드 형식):
|
||||
- **한국어 2자 query 지원** (`kebab search` 에서 '한국', '서울' 같은 2자 단어 매칭 — V009 morphological tokenizer).
|
||||
- OCR timeout default 180s (HOTFIXES 2026-05-28).
|
||||
- `[logging]` config section (default enabled) + `{state_dir}/logs/ingest-{run_id}.ndjson` 자동 생성.
|
||||
- `[logging] keep_recent_runs` (100) + `retention_days` (30) — OR-on-stale cleanup.
|
||||
- `ingest_progress.v1.pdf_ocr_finished` 의 4 추가 field (image_byte_size, image_width, image_height, failure_reason) — image_w/h 가 round 2 (PR #190) 에서 실제 capture.
|
||||
- `schema.v1.models` 의 `active_parsers` + `active_chunkers` (additive minor).
|
||||
- V008 migration — `pdf_ocr_events` table (per-OCR-call historical record).
|
||||
- 새 wire schemas — `ocr_stats.v1` + `ocr_failures.v1` (CLI inspect 의 emit).
|
||||
- CLI `kebab inspect ocr-stats` + `kebab inspect ocr-failures` — sweet-spot 점진 분석.
|
||||
- CLI `--media code` first-class, empty query → `invalid_input`, `--config` missing → `config_not_found` + exit 2.
|
||||
- capabilities.streaming_ask + single_file_ingest 가 true (이전 false 거짓 정정).
|
||||
- bump 작업: workspace `Cargo.toml` version → 0.20.1, tag, gitea-release.
|
||||
|
||||
### v0.20 후속 bug catalog (non-blocking known)
|
||||
|
||||
본 PR #189 dogfood 에서 **falsified** 또는 **design constraint** 로 분류 — fix 안 함:
|
||||
- Bug #8 (V007 trigram 2-char query 한계) → 위 C 항목.
|
||||
- Bug #12 (Code block wire `.code` field, `.text` 가 아닌 jq fallback artifact) — falsified.
|
||||
- ask 한국어 query phrasing-sensitive refusal — RAG corner case / NLI gate behavior. 별도 brainstorm.
|
||||
|
||||
### Logging feature enhancements — ✅ closed (PR #190, 2026-05-28 merged commit `7bbdc89a`)
|
||||
|
||||
logging round 2 (PR #190) 으로 4 enhancement 모두 closed:
|
||||
- ✅ `image_width` + `image_height` capture (raster JPEG decode).
|
||||
- ✅ SQLite mirror (V008 `pdf_ocr_events` table + dual-write).
|
||||
- ✅ CLI query (`kebab inspect ocr-stats` + `ocr-failures` — `ocr_stats.v1` + `ocr_failures.v1` wire schemas).
|
||||
- ✅ log retention (`keep_recent_runs` + `retention_days` — file + SQLite cleanup).
|
||||
|
||||
### P9 dogfooding 백로그 (fb-26 ~ fb-42) — release 분할
|
||||
|
||||
@@ -96,11 +167,20 @@ P9-2/3/4 는 P9-1 의 parallel-safety contract (sub-state slot 패턴) 덕에
|
||||
- **0.3.0 — agent foundation** ✅ cut 2026-05-07: fb-26 (log), fb-27 (introspection/error wire), fb-28 (readonly/quiet). ~~fb-29 (daemon)~~ → 🚫 **deferred** — fb-30 stdio MCP 가 동일 가치를 daemon 복잡도 없이 제공.
|
||||
- **0.4.0 — agent integration (MCP)** ✅ cut: fb-30 (MCP stdio), fb-31 (single-file/stdin ingest).
|
||||
- **0.5.0 — agent surface refinement (additive)** ✅ cut 2026-05-10: fb-32 (stale doc indicator), fb-33 (streaming ask), fb-34 (output budget controls), fb-35 (verbatim fetch), fb-36 (search filter args), fb-37 (trace + stats). 모두 wire schema additive minor.
|
||||
- **0.6.0 — RAG quality** 🟡 진행: fb-38 (score semantics) ✅ 머지 (2026-05-10), fb-40 (fact-grounded answer / rag-v2 prompt) ✅ 머지 (2026-05-10), fb-39 (retrieval precision tuning, embedding_version cascade) — 미진행 (eval golden set 선행 필요).
|
||||
- **0.7.0 또는 P+**: fb-41 (multi-hop reasoning, XL), fb-42 (bulk multi-query / rerank, Nice).
|
||||
- **0.6.0 — RAG quality** ✅ 대부분 머지 (2026-05-10): fb-38 (score semantics) ✅, fb-39 (eval foundation — `precision_at_k_chunk` metric) ✅, fb-39b (embedding upgrade — multilingual-e5-large default) ✅, fb-40 (fact-grounded answer / rag-v2 prompt) ✅. 잔여 = fb-39 의 retrieval precision lever 실제 적용 (eval golden set 확장 선행 필요).
|
||||
- **0.7.0 또는 P+**: fb-41 (multi-hop reasoning, XL) — ⏳ 미구현 · brainstorm 필요; fb-42 (bulk multi-query) ✅ 머지 (2026-05-10, bulk only — rerank hint 은 deferred).
|
||||
|
||||
각 fb spec frontmatter 의 `target_version` 필드가 source of truth. INDEX.md 의 release subheader 도 동일 grouping.
|
||||
|
||||
### P10 dogfooding 백로그 (2026-05-22 round 2)
|
||||
|
||||
P10 종합 도그푸딩 round 2 (`/build/cache/dogfood-p10b/`, OSS 8 repo + 한국어 위키 문서 10편) 에서 발견된 follow-up 후보. 자세한 내용 + 우선순위 근거는 `tasks/HOTFIXES.md` (2026-05-22).
|
||||
|
||||
- **한국어 lexical tokenizer** — ✅ v0.17.0 (2026-05-24) PR-A 머지 (#159). V007 trigram migration 자동 backfill + `build_match_string` 재설계 + CLI/TUI/wire hint. HOTFIXES `2026-05-24 PR-A` 참조.
|
||||
- **code_lang_chunk_breakdown chunk 단위 집계 (LOW)** — ✅ v0.17.0 (2026-05-24) PR-C 머지 (#161). `schema.v1.stats` additive 필드. HOTFIXES `2026-05-24 PR-C` 참조.
|
||||
- **C typedef-wrapped struct (LOW)** — ✅ v0.17.0 (2026-05-24) PR-B 머지 (#160). `type_definition` 분기 + `PARSER_VERSION code-c-v2` bump + orphan purge cascade. HOTFIXES `2026-05-24 PR-B` 참조.
|
||||
- **ranking glue chunk 편향 (deferred)** — 자동 heuristic 은 user intent misalignment 위험. 사용자 명시 요청 전까지 surface 변경 0 유지. 1주+ 실사용 후 재 brainstorm.
|
||||
|
||||
## 검증된 운영 동작 (release binary, fastembed enabled)
|
||||
|
||||
P7-3 머지 직후 25 시나리오 smoke 통과 — markdown + image + PDF 5 자산 워크스페이스에서 doctor / ingest / list / inspect / search (lex/vec/hybrid) / re-ingest / byte-edit re-ingest / corrupt PDF / RAG ask + page citation 모두. 자세한 시나리오 표는 conversation 기록 참조; 워크스페이스에 직접 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md).
|
||||
|
||||
321
README.md
321
README.md
@@ -1,121 +1,196 @@
|
||||
# kebab — Local-first Knowledge Base
|
||||
# kebab — Local-first Knowledge Base + RAG
|
||||
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown / PDF / 이미지를 한 곳에 색인하고, 의미 검색 + page-단위 citation 포함 LLM 답변을 단일 binary 로 제공한다. 모든 추론은 로컬 (Ollama / fastembed) 에서 돌아간다. 대상 하드웨어: M4 48GB MacBook 1대, 사용자 1명.
|
||||
|
||||
## 사전 요구
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 + resolver 3 사용). [rustup](https://rustup.rs) 권장.
|
||||
- **Ollama** — `kebab ask` 와 이미지 OCR/caption 가 사용. `https://ollama.com/download` 에서 설치 후 `ollama serve` 실행. 기본 LLM 은 gemma4 계열 (`ollama pull gemma4:e4b`) — OCR / caption 도 같은 family 라 모델 하나만 pull 하면 됨. 더 큰 variant 원하면 `gemma4:26b` 등으로 config override. config 의 `[models.llm].endpoint` 에 host:port 명시.
|
||||
- **빌드 디스크** — 첫 빌드 시 `target/` 가 6–10 GB (Lance + DataFusion + fastembed). 여유 확인.
|
||||
- **fastembed 모델** — 첫 `kebab ingest` 시 `multilingual-e5-large` (~1.3 GB, fb-39b) 자동 다운로드. `config.toml` 에서 `model = "multilingual-e5-small"` 로 명시하면 이전 모델 사용.
|
||||
|
||||
## 설치
|
||||
|
||||
표준 경로는 `cargo install` — `~/.cargo/bin/kebab` 가 PATH 에 있는지만 확인하면 끝.
|
||||
|
||||
```bash
|
||||
# 1) repo clone
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
|
||||
# 2) binary 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 3) PATH 확인 (아직 추가 안 했으면 ~/.bashrc / ~/.zshrc 에 추가)
|
||||
which kebab # → /Users/<you>/.cargo/bin/kebab 같은 경로
|
||||
kebab --version # → kebab 0.1.0
|
||||
```
|
||||
|
||||
git URL 직접 install 도 가능 (clone 없이):
|
||||
|
||||
```bash
|
||||
cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked
|
||||
```
|
||||
|
||||
업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`.
|
||||
|
||||
제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed), **`kebab reset --orphans-only`** (현재 walker scope 밖에 있는 stored doc 만 정리 — `config.workspace.include` 좁히거나 sub-dir 옮긴 후 explicit reconcile; fs 의 file 은 건드리지 않음) 등.
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown · PDF · 이미지 · 소스코드를 한 곳에 색인하고, 하이브리드 의미 검색과 근거 인용을 포함한 LLM 답변을 **단일 binary** 로 제공한다. 모든 추론은 로컬 (Ollama + fastembed) 에서 돌아간다.
|
||||
|
||||
## Quick start
|
||||
|
||||
사전 요구는 두 가지뿐이다.
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 사용). [rustup](https://rustup.rs).
|
||||
- **Ollama** — `kebab ask` 와 이미지/PDF OCR 가 사용. [공식 설치 안내](https://ollama.com/download) 참고 후 `ollama serve` 실행. 기본 LLM family 는 gemma4 (`ollama pull gemma4:e4b`) — OCR/caption 도 같은 family 라 모델 하나면 된다. CPU-only 환경이면 소형 모델 (예: `gemma3:4b`) 을 권장.
|
||||
|
||||
```bash
|
||||
# 첫 실행 — XDG 경로에 데이터 디렉토리 + config.toml 생성
|
||||
# 1) 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 2) 데이터 디렉토리 + config.toml 생성 (XDG 경로)
|
||||
kebab init
|
||||
|
||||
# config 손보고 — workspace.root, 모델 endpoint 등 설정 (지원 형식: md / png / jpg / pdf / rs / py / ts / js)
|
||||
# 3) config 최소 손보기 — workspace.root (색인할 폴더) 와 LLM endpoint
|
||||
${EDITOR:-vi} ~/.config/kebab/config.toml
|
||||
|
||||
# 색인 (Markdown / 이미지 / PDF 모두 한 번에)
|
||||
# 4) 색인 (Markdown · PDF · 이미지 · 소스코드 한 번에)
|
||||
kebab ingest
|
||||
|
||||
# 검색 (citation 의 source_span 이 매체별로 line / region / page)
|
||||
kebab search "Markdown chunking 규칙" --mode hybrid
|
||||
# 5) 검색 (hybrid = lexical + vector RRF, citation 포함)
|
||||
kebab search "Markdown chunking 규칙"
|
||||
|
||||
# 질문 (Ollama 필요, PDF 인용 시 page 번호 surface)
|
||||
# 6) 질문 (RAG 답변 + 근거 인용, Ollama 필요)
|
||||
kebab ask "내 KB 설계에서 저장소 전략은?"
|
||||
|
||||
# Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중)
|
||||
kebab tui
|
||||
|
||||
# 헬스 체크 (config 경로 / 데이터 디렉토리 쓰기 가능 여부)
|
||||
kebab doctor
|
||||
```
|
||||
|
||||
격리된 임시 워크스페이스로 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md) — `--config <path>` 로 분리. 이미지 / PDF fixture 가 필요하면 두 example 바이너리 (`cargo run --release --example gen_smoke_pdf -p kebab-parse-pdf` / `gen_smoke_png -p kebab-parse-image`) 로 시스템 dep 없이 in-tree 생성 가능.
|
||||
clone 없이 git URL 로 바로 설치할 수도 있다: `cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked`. 업데이트는 동일 명령에 `--force`. 제거는 `cargo uninstall kebab-cli` (데이터는 보존 — 데이터까지 지우려면 `kebab reset --all --yes`).
|
||||
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>` 또는 `cargo build --release && ./target/release/kebab <subcommand>`.
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>`. 격리된 임시 워크스페이스로 검증하는 절차는 [docs/SMOKE.md](docs/SMOKE.md) (`--config <path>` 로 분리).
|
||||
|
||||
## 핵심 기능
|
||||
|
||||
### 하이브리드 검색 + citation
|
||||
|
||||
lexical (FTS5 BM25) 과 vector (cosine) 두 채널을 **RRF fusion** 으로 합쳐 검색한다. 모든 hit 은 출처 위치를 매체별로 정확히 담는다 — Markdown/코드는 line, 이미지는 region, PDF 는 page. `--tag` · `--media` · `--lang` · `--path-glob` 등 다양한 필터와 `--max-tokens` · `--cursor` 같은 agent budget flag 를 지원한다.
|
||||
|
||||
### 파생물 캐시 (자동)
|
||||
|
||||
embedding 벡터를 청크 **내용 해시** 로 캐싱한다 (`derivation_cache`). 재색인·갱신 시 내용이 같은 청크는 재계산을 건너뛴다. 캐시 키에 모델·차원 버전이 포함돼 버전 변경 시 자동 무효화된다 (cascade 안전). 별도 설정 없이 투명하게 동작한다. (현재 TTL/LRU 자동 정리는 미구현 — 누적된 캐시는 `kebab reset` 으로만 정리.)
|
||||
|
||||
### 외부 계산 + 로컬 검색 워크플로
|
||||
|
||||
search/ask 는 원본 파일 없이 KB 산출물만으로 동작한다 (청크 본문이 SQLite 에 저장되고 문서 경로는 상대경로로 기록됨). 비싼 색인(임베딩·OCR)을 성능 좋은 머신에서 수행한 뒤(예: Apple Silicon 맥에서 candle Metal GPU), **두 산출물만** 다른 머신(예: NUMA 서버)으로 복사하면 그대로 검색·질문할 수 있다.
|
||||
|
||||
**무엇을 복사하나 — `[storage]` 에서 정의된 두 경로:**
|
||||
|
||||
| 복사 대상 | config 키 (`[storage]`) | 기본 경로 | 내용 |
|
||||
|-----------|------------------------|-----------|------|
|
||||
| `kebab.sqlite` | `sqlite = "{data_dir}/kebab.sqlite"` | `{data_dir}/kebab.sqlite` | 문서·청크·본문·FTS5·메타 |
|
||||
| `lancedb/` | `vector_dir = "{data_dir}/lancedb"` | `{data_dir}/lancedb/` | 임베딩 벡터 |
|
||||
|
||||
`{data_dir}` 는 `[storage].data_dir` (예: `~/.local/share/kebab`). `models/`(`model_dir`)·`assets/`(`asset_dir`)는 **복사 불필요** — 모델은 각 머신이 자기 캐시를 받고, asset 원본 바이트는 검색·질문에 쓰이지 않는다 (단일파일/`stdin` 색인의 원본 재읽기·재색인까지 보존하려면 `assets/` 도 함께 복사).
|
||||
|
||||
```bash
|
||||
# ingest 가 끝난(쓰기 없는) 상태에서 복사
|
||||
rsync -a <src-data_dir>/kebab.sqlite user@server:<dst-data_dir>/
|
||||
rsync -a <src-data_dir>/lancedb/ user@server:<dst-data_dir>/lancedb/
|
||||
```
|
||||
|
||||
조건: **양쪽 동일 `kebab` 버전 + 동일 임베딩 모델/차원** (`[models.embedding].model`·`dimensions`). provider 는 달라도 됨 (예: 맥 `candle`/Metal ↔ 서버 `candle`/CPU 또는 `fastembed` — 같은 모델이면 벡터 호환). 복사는 반드시 ingest 가 돌지 않을 때.
|
||||
|
||||
### 멀티미디어 색인
|
||||
|
||||
Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go/Java/Kotlin/C/C++ AST) · 리소스(YAML/Dockerfile/TOML/JSON/XML 등)를 확장자에 따라 자동으로 적절한 chunker 에 라우팅한다. embedded text 가 없는 scanned PDF 는 `[pdf.ocr]` 로 page-단위 OCR (opt-in). 전체 확장자→chunker 매핑은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
|
||||
### RAG (근거 인용 + 거절)
|
||||
|
||||
검색 결과를 근거로 LLM 답변을 생성하고 [#번호] 인용을 단다. 근거가 부족하면 답을 지어내지 않고 거절한다. compound 질문은 `--multi-hop` 으로 분해→synthesize. 답변의 groundedness 는 mDeBERTa XNLI 로 검증할 수 있다 (`[rag] nli_threshold`, default off).
|
||||
|
||||
### TUI
|
||||
|
||||
`kebab tui` 는 Ratatui 셸 — Library / Search / Ask / Inspect 패널을 vim-style 모드로 다룬다. 키 매핑은 앱 내 `F1` cheatsheet 가 권위 소스다.
|
||||
|
||||
## 명령
|
||||
|
||||
| 명령 | 동작 |
|
||||
|------|------|
|
||||
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
|
||||
| `kebab ingest [<path>]` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1` — 모두 tree-sitter AST chunker). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = "<lang>"` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`). |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor <opaque>] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. **code corpus filters (p10-1A-1):** `--repo` 는 반복 가능 (`--repo kebab --repo other`) OR 매칭. `--code-lang` 는 반복 또는 comma 다중 값 (`--code-lang rust,python`), 알 수 없는 값은 빈 hits. `--media code` 는 Tier 1/2/3 모든 code chunk 포함. 1A-1 시점에서는 indexed 된 code chunk 가 없어 filter 가 항상 빈 결과 — 1A-2 (Rust AST chunker) 머지 이후 실효. |
|
||||
| `kebab ingest [<path>]` | 워크스페이스 스캔 후 새/변경 문서 색인 (idempotent · incremental, `--force-reingest` 로 강제 재처리). 미지원 확장자는 자동 skip. 진행바는 현재 **파일명** · 느린 **phase(ocr/caption/embed)+모델명** · **경과초**`(Ns)` · 문서별 청크 수 · phase별 소요시간(parse/chunk/ocr/caption/embed/store)을 표시하고, 종료 시 **최장 소요 파일 top-5** 를 요약한다 (`--json` 은 `asset_phase`/`asset_chunked`/`asset_timings` 이벤트로, 사람용 요약은 미출력) |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능 — `_external/` 로 deterministic copy) |
|
||||
| `kebab ingest-stdin --title <T>` | stdin 의 markdown 본문 ingest |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 필터/budget flag 는 `--help` |
|
||||
| `kebab ask "<query>" [flags]` | RAG 답변 + 근거 인용 (Ollama 필요). `--session` (multi-turn) · `--stream` · `--multi-hop` |
|
||||
| `kebab list docs` | 색인된 문서 목록 |
|
||||
| `kebab inspect doc <id>` / `kebab inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab fetch chunk <id> [--context N]` / `kebab fetch doc <id> [--max-tokens N]` / `kebab fetch span <doc_id> <ls> <le> [--max-tokens N]` | (p9-fb-35) verbatim text fetch from indexed corpus. wire = `fetch_result.v1` (kind discriminator). chunk: target + ±N ordinal-context chunks. doc: full normalized markdown. span: 1-based line range (PDF/audio rejected as `error.v1.code = span_not_supported`). chars/4 budget on doc/span. |
|
||||
| `kebab ask "<query>" [--show-citations / --hide-citations] [--session <id>] [--stream]` | RAG 답변 + 근거 인용. 답변 후 `근거:` block 으로 full path / line range / score 한 줄씩 (default ON — `--hide-citations` 로 끄기, pipe 시 유용). 근거 부족 시 거절. Ollama 필요. `--session <id>` 로 multi-turn — 첫 호출에서 SQLite `chat_sessions` 에 자동 생성, 이후 호출은 prior turns 를 history 로 받아 follow-up. session id 는 사용자 지정 (e.g. `kb-rust-async-2026-05`) — `kebab reset --data-only` 로 모든 session wipe. **`--stream` (p9-fb-33)** 로 ndjson `answer_event.v1` event (retrieval_done → token* → final) 를 stderr 에 흘리고 stdout 마지막 줄에 기존 `answer.v1` — agent 가 token 즉시 소비 가능 |
|
||||
| `kebab doctor` | 설정/모델/DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중). Library 에서 `r` 키로 background ingest 시작 — 화면 하단 status bar 가 진행 표시, 완료/abort 시 final 라인 잠시 유지 후 자동 hide. ingest 진행 중 `Esc` / `Ctrl-C` 가 cancel signal (그 외에는 quit). vim-style mode (header 우측 `-- NORMAL --` / `-- INSERT --`) — Library/Inspect 는 자동 NORMAL, Search/Ask 는 자동 INSERT. `i` 로 Normal→Insert (모든 pane — p9-fb-21), `Esc` 로 Insert→Normal 어디서나. mode-authoritative dispatch — Search 의 `j/k/o/g`, Ask 의 `e/j/k` 는 NORMAL 모드에서만 명령으로 동작, INSERT 에서는 입력 문자로 typing. (Search 의 chunk inspect 키는 `i`→`o` 로 rebind — `i` 가 universal Insert toggle.) **`F1` 로 cheatsheet popup** (현재 pane 의 키 매핑 + global 토글 표) — `Esc` / `F1` 로 닫기. Search 패널은 200ms debounce 후 background worker 가 검색 — 키 입력으로 UI freeze 안 됨, 사용자가 계속 타이핑하면 stale 결과 자동 폐기 (generation counter). Ask 패널은 multi-turn — 같은 conversation 안에서 Q1/A1, Q2/A2 transcript 누적, 다음 질문이 이전 턴을 history 로 받아 답변. 답변 본문은 markdown 렌더 (bold/italic/inline code/heading/list/code fence/table/blockquote, raw `**bold**` 가 실제 굵게 표시). `Ctrl-L` 로 새 conversation 시작. Search 의 `g` 키가 `$EDITOR` (기본 `vi`) 로 hit 의 citation 위치 열기 — 종료 후 TUI 화면이 자동으로 깨끗이 redraw. CLI `kebab ask` 는 raw markdown 그대로 (terminal 호환성 위해). Library 의 doc-list 가 한글 / 일본어 / 중국어 (CJK) 제목을 wide-char 정확한 column width 로 truncate — 한글 제목이 한 줄을 넘기지 않음 (CJK 1 자 = 2 col). Search/Ask/Filter 입력의 cursor 가 wide char 위에서 column 단위로 정렬 — 한글 입력 시 caret 이 글자 옆에 정확히 놓임. `← / →` 로 입력 문자열 중간 cursor 이동 (한글 한 글자 = 2 column 이라도 한 번에 이동), `Home / End` 로 양 끝 점프, `Delete` 로 cursor 위치 char 삭제 — 모든 input pane (Ask / Search / Library filter overlay) 동일 (p9-fb-22). Ask 트랜스크립트는 새 답변이 viewport 아래로 누적될 때 자동으로 tail 을 따라감 (auto-scroll); `j` / `k` 로 위로 스크롤하면 freeze, `Shift-G` 로 다시 bottom + auto-tail 재개. 화면 하단 hint line 은 한국어 동사구로 (`"위로"` / `"아래로"` / `"필터"` / `"타이핑 검색어"` / `"Esc 로 NORMAL 모드"` / `"i 입력모드"` 등) + 현재 (pane, mode) 조합에 맞춰 자동 분기, **첫 fragment 가 항상 `F1 도움말`** (cheatsheet 발견성 보장). 모든 모드에서 항상 떠 있는 상태바 — `kebab v<version> │ <pane> │ <docs> docs │ <state>` (state: streaming/searching/indexing/idle, ingest 진행 중에는 progress 가 같은 자리에 흡수됨). Ask 진입 시 conversation id 8 자 prefix 도 함께 표시. Ask 트랜스크립트와 Inspect 양쪽에서 `PgUp / PgDn` 으로 10 줄씩 페이지 스크롤. Library 의 doc list 위에는 `TITLE / TAGS / UPDATED / CHUNKS` 컬럼 헤더 행 표시 (display-width 정렬, Hangul / CJK 안전). |
|
||||
| `kebab reset [--all / --data-only / --vector-only / --config-only] [--yes]` | XDG 데이터 wipe. **Irreversible.** TTY 면 confirm prompt, 아니면 `--yes` 필수. `--vector-only` 는 SQLite `embedding_records` 도 함께 truncate (orphan 방지) |
|
||||
| `kebab eval run / compare` | golden query 회귀 측정 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats 한 번에. `--json` 은 `schema.v1` wire; 사람 모드는 서식 출력. **stats 에 (p9-fb-37) `media_breakdown` (5 keys: markdown / pdf / image / audio / other) + `lang_breakdown` (BCP-47 코드, NULL 은 literal `"null"`) + `index_bytes` (sqlite + lancedb on-disk 합계) + `stale_doc_count` (`config.search.stale_threshold_days` 초과 doc 수) 추가.** |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능). 바이트는 `<workspace.root>/_external/<hash12>.<ext>` 로 copy. `.kebabignore` 매치 시 stderr warn 후 진행 (explicit ingest 가 bypass intent). |
|
||||
| `kebab ingest-stdin --title <T> [--source-uri <URI>]` | stdin 의 markdown 본문 ingest. frontmatter (title + source_uri) 자동 prepend. v1 markdown only. |
|
||||
| `kebab mcp` | MCP (Model Context Protocol) stdio server. agent host (Claude Code / Cursor / OpenAI Agents) 가 spawn 하여 tool 호출 (`search` / `bulk_search` / `ask` / `fetch` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`). `--config` honor. |
|
||||
| `kebab inspect doc <id>` / `inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab fetch chunk\|doc\|span <id> [flags]` | indexed corpus 에서 verbatim text fetch |
|
||||
| `kebab eval run \| aggregate \| compare \| variants` | golden query 회귀 측정 + 변형 일관성 진단 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats |
|
||||
| `kebab doctor` | 설정 / 모델 / DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library / Search / Ask / Inspect) |
|
||||
| `kebab mcp` | MCP stdio server (`search` / `bulk_search` / `ask` / `fetch` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`) |
|
||||
| `kebab reset [--all \| --data-only \| --vector-only \| --config-only \| --orphans-only] [--yes]` | XDG 데이터 wipe (**irreversible**) |
|
||||
|
||||
모든 명령에 `--json` 플래그. 출력은 frozen wire schema v1 (`schema_version` 항상 포함, 예: `ingest_report.v1`, `ingest_progress.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`, `reset_report.v1`, `schema.v1`). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 unchanged).
|
||||
모든 명령에 `--json` 플래그가 있고, 출력은 frozen **wire schema v1** 을 따른다 (`schema_version` 항상 포함). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 불변). 글로벌 flag: `--readonly` (write-path 비활성화), `--quiet` (human stderr 억제), env `KEBAB_PROGRESS=plain`. 전체 flag·wire 의미는 `kebab <cmd> --help` 와 [docs/wire-schema/v1/](docs/wire-schema/v1/). 외부 agent 통합(Claude Code skill / MCP)은 [docs/mcp-usage.md](docs/mcp-usage.md) 와 [integrations/](integrations/).
|
||||
|
||||
글로벌 플래그: `--readonly` (또는 `KEBAB_READONLY=1`) — 모든 write-path 명령 (`ingest` / `ingest-file` / `ingest-stdin` / `reset`) 을 비활성화, exit 1. `--quiet` — 진행 바 / hint 등 human-readable stderr 억제 (exit code / stdout 출력은 그대로). `KEBAB_PROGRESS=plain` — TTY 가 없는 환경에서도 진행 상황을 plain-text 한 줄씩 stderr 로 출력 (spinner 대신).
|
||||
## Configuration
|
||||
|
||||
### Score 해석 (fb-38)
|
||||
`~/.config/kebab/config.toml` 은 `kebab init` 가 XDG 경로에 생성한다. 핵심 노브만 정리한다 (전체 절은 생성된 파일 주석 참고, 예시는 [docs/SMOKE.md](docs/SMOKE.md)).
|
||||
|
||||
`search_hit.v1.score` 는 **ranking signal** 이지 confidence 가 아니다. `score_kind` 필드로 의미 선언:
|
||||
```toml
|
||||
[workspace]
|
||||
root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경로 가능.
|
||||
# 상대 경로의 base 는 config.toml 위치 (cwd 무관).
|
||||
|
||||
| `score_kind` | 의미 | 범위 |
|
||||
|--------------|------|------|
|
||||
| `rrf` (hybrid) | RRF normalized | `[0, 1]`, ceiling = 1.0 (양 채널 rank=1) |
|
||||
| `bm25` (lexical) | raw BM25 | unbounded (≥ 0) |
|
||||
| `cosine` (vector) | cosine sim | `[-1, 1]` |
|
||||
|
||||
#### RRF 수식 (hybrid mode)
|
||||
|
||||
```
|
||||
chunk c 의 raw RRF = Σ_m 1 / (k_rrf + rank_m(c))
|
||||
|
||||
여기서 m ∈ {lexical, vector}, k_rrf = config.search.rrf_k (default 60).
|
||||
양 채널 모두 rank=1 일 때 raw RRF = 2 / (k_rrf + 1) ≈ 0.0328.
|
||||
|
||||
normalize: rrf_score = raw_rrf / (2 / (k_rrf + 1))
|
||||
→ rrf_score ∈ [0, 1]. 양쪽 rank=1 → 1.0, 한 쪽만 등장 → ≈ 0.5 천장.
|
||||
[models.embedding]
|
||||
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)
|
||||
# / "ollama"(원격 HTTP) / "none"(lexical-only).
|
||||
# candle 는 같은 모델·같은 벡터를 순수 Rust 로 돌려
|
||||
# NUMA 서버의 onnxruntime 48-스레드 double-free 를 피하는
|
||||
# opt-in 백엔드 (e5 는 재색인 불필요).
|
||||
model = "multilingual-e5-large" # 다국어 sentence embedding (1024-dim).
|
||||
# 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드.
|
||||
# candle provider 는 safetensors (~2GB) 다운로드.
|
||||
# candle/ollama 는 "snowflake-arctic-embed-l-v2.0"
|
||||
# (설명형 query 의 recall 보강) 도 지원 — 아래 참고.
|
||||
dimensions = 1024 # config 와 LanceDB stored dim 불일치 시 검색 0건.
|
||||
num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto=#cores).
|
||||
# env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은
|
||||
# numactl 과 조합. fastembed provider 는 무시.
|
||||
# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용 HTTP endpoint.
|
||||
# 생략 시 [models.llm].endpoint 로 폴백.
|
||||
# fastembed/candle provider 는 무시.
|
||||
```
|
||||
|
||||
`rrf_score = 0.5` 의 의미: chunk 가 한 채널 (lexical 또는 vector) 에서만 rank 1 로 등장. confidence 50% 가 아님 — RRF 수식의 산술적 천장.
|
||||
**arctic-embed-l-v2.0 (설명형 query recall 보강)**: 기본 e5-large 대신
|
||||
Snowflake `arctic-embed-l-v2.0` 임베더를 쓸 수 있다 (1024-dim, opt-in). 측정에서
|
||||
설명형/약어/영문 용어 query 의 recall@10 이 e5 대비 향상됐다. 두 경로:
|
||||
|
||||
agent 가 trust threshold 가 필요하면 top-level `score` 가 아닌 nested `retrieval.lexical_score` (BM25 raw) / `retrieval.vector_score` (cosine raw) 사용.
|
||||
```toml
|
||||
# (A) candle 백엔드 — 순수 Rust, in-process (NUMA 안전, Metal GPU 가능):
|
||||
[models.embedding]
|
||||
provider = "candle"
|
||||
model = "snowflake-arctic-embed-l-v2.0" # CLS pooling, query 에 "query: " 접두어
|
||||
# (문서는 무접두어). safetensors ~2GB 다운로드.
|
||||
|
||||
## 논리 아키텍처
|
||||
# (B) ollama 백엔드 — 원격/로컬 Ollama 데몬에 위임 (POST /api/embed):
|
||||
[models.embedding]
|
||||
provider = "ollama"
|
||||
model = "snowflake-arctic-embed2" # Ollama 모델 태그 (ollama pull 필요)
|
||||
endpoint = "http://127.0.0.1:11434" # 생략 시 [models.llm].endpoint
|
||||
```
|
||||
|
||||
> ⚠️ e5 → arctic 전환은 `embedding_version` cascade 를 트리거한다 (모델이 다르면
|
||||
> 벡터도 다름). 기존 e5 KB 와 혼용 불가 — 전환 시 **재색인** 필요 (`kebab reset`
|
||||
> 후 재 ingest). 기본값은 e5 라 기존 사용자는 영향 없음.
|
||||
|
||||
**Apple Silicon GPU 가속 (candle / macOS)**: M-시리즈 맥에서 candle 임베딩을
|
||||
GPU(Metal)로 돌리면 CPU 대비 대용량 ingest 가 크게 빨라진다. 빌드 또는 설치 시
|
||||
`embed_metal` feature 를 켠다:
|
||||
|
||||
```bash
|
||||
# 빌드만:
|
||||
cargo build --release --features embed_metal
|
||||
# 전역 설치 (~/.cargo/bin/kebab):
|
||||
cargo install --path crates/kebab-cli --features embed_metal --locked
|
||||
```
|
||||
|
||||
벡터는 CPU candle 과 동일 모델이라 호환되므로, 맥에서 GPU 로 색인한
|
||||
`kebab.sqlite` + `lancedb/` 를 그대로 Linux 서버(CPU candle)로 복사해 질의할 수
|
||||
있다. 색인 로그에 `candle device = Metal (GPU)` 가 보이면 GPU 사용 중. metal
|
||||
feature 는 macOS 전용 (Linux/서버는 기본 CPU 빌드).
|
||||
|
||||
```toml
|
||||
|
||||
[models.llm]
|
||||
endpoint = "http://localhost:11434" # Ollama host:port
|
||||
model = "gemma4:e4b"
|
||||
# request_timeout_secs = 300 # 큰 모델은 늘림. 0 은 disable 이 아니라 "즉시 timeout".
|
||||
|
||||
[search]
|
||||
stale_threshold_days = 30 # search hit / citation 의 stale 플래그 기준 (0 = off).
|
||||
|
||||
[rag]
|
||||
prompt_template_version = "rag-v3" # 답변 언어 = 질문 언어. rag-v1/v2 는 legacy.
|
||||
nli_threshold = 0.0 # >0 (예: 0.5) 면 mDeBERTa XNLI groundedness 검증.
|
||||
```
|
||||
|
||||
- **파생물 캐시** — embedding 결과를 내용 해시로 자동 캐싱한다 (위 「핵심 기능」 참고). 설정 항목 없음.
|
||||
- **`[ingest.code]`** — code ingest 의 skip 정책 (`skip_generated_header`, `max_file_bytes`, `extra_skip_globs`). `.gitignore` 자동 honor, `.kebabignore` 는 추가 layer.
|
||||
- **`[pdf.ocr]`** — scanned PDF 의 page-단위 OCR (default off / opt-in, page 당 ~수십 초 cost). 활성화 후 v0.19 시절 색인분은 `kebab ingest --force-reingest` 로 재처리.
|
||||
- **`--config <path>`** — 임시 워크스페이스 / 격리 테스트용 (CLI · TUI 모두 honor).
|
||||
- **`kebab config migrate`** — 새 버전에서 추가된 config 섹션을 기존 `config.toml` 에 설명 주석과 함께 채워 넣는다 (사용자가 손본 값·주석·순서는 보존, 멱등, 변경 시 자동 `.bak` 백업). `--dry-run` 으로 변경 미리보기. `kebab doctor` 가 갱신 필요 시 안내한다. `kebab init` 으로 새로 생성되는 config.toml 도 섹션별 주석을 포함한다.
|
||||
- **`KEBAB_*` env** — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN` 등).
|
||||
- **XDG layout**: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
|
||||
## 아키텍처
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
@@ -132,7 +207,7 @@ flowchart TB
|
||||
|
||||
subgraph Pipeline["도메인 + 파이프라인"]
|
||||
parse["parse-md / parse-pdf / parse-image / parse-code"]
|
||||
chunker["chunker (md-heading-v1, pdf-page-v1, code-rust-ast-v1, code-python-ast-v1, code-ts-ast-v1, code-js-ast-v1)"]
|
||||
chunker["chunker (md / pdf / code-AST / manifest)"]
|
||||
embedder["embedder (fastembed multilingual-e5-large)"]
|
||||
retriever["retriever (lexical / vector / hybrid RRF)"]
|
||||
rag["RAG pipeline"]
|
||||
@@ -174,70 +249,22 @@ flowchart TB
|
||||
rag --> ollama
|
||||
```
|
||||
|
||||
`kebab-app` 가 facade — UI binary 가 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 자세한 crate-level 의존성 + 디렉토리 + 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
v0.21.0 기준 핵심 설계:
|
||||
|
||||
## Configuration
|
||||
- **crate facade** — `kebab-app` 가 유일한 facade다. UI binary (`kebab-cli` / `kebab-tui`) 는 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 각 user-facing 엔트리는 `*_with_config(cfg, …)` 동반 함수로 explicit config 를 thread 한다.
|
||||
- **chunk_id 는 위치 기반** — chunk 의 정체성은 문서 내 위치(ordinal + span)다. 반면 파생물 캐시 키는 **내용 해시**라, 내용이 같으면 위치·문서가 달라도 동일 캐시를 재사용한다.
|
||||
- **wire schema v1** — 모든 `--json` 출력은 `schema_version` 을 담는 frozen contract다. 깨는 변경은 `*.v2` major bump을 요구한다.
|
||||
- **versioning cascade** — `parser_version` / `chunker_version` / `embedding_version` / `prompt_template_version` / `index_version` 변경은 downstream record(청크·임베딩·캐시·eval)를 무효화한다.
|
||||
|
||||
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절.
|
||||
- `[models.embedding]` —
|
||||
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
|
||||
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
|
||||
- `[ui] theme = "dark" | "light"` 로 TUI 팔레트 선택 (default `"dark"`, 알 수 없는 값은 dark fallback).
|
||||
- `[search] stale_threshold_days = 30` (p9-fb-32) — search hit / RAG citation 의 `stale` 플래그 기준 (default 30 일, `0` 으로 비활성화). 옛 config 의 `workspace.include = [...]` 은 silently 무시 + 단발 deprecation warning (p9-fb-25).
|
||||
- `[ingest.code]` (p10-1A-1) — code ingest 의 skip 정책 + chunker 기본값.
|
||||
- `skip_generated_header = true` — 첫 ~512 byte 의 generated marker (`@generated` / `DO NOT EDIT` 등) 감지 시 skip.
|
||||
- `max_file_bytes = 262144` (256 KiB) / `max_file_lines = 5000` — 파일당 cap, 초과 시 skip.
|
||||
- `extra_skip_globs = []` — 사용자 추가 skip 패턴 (`.gitignore` 문법).
|
||||
- `.gitignore` honor: 자동 적용. `.kebabignore` 는 추가 layer. 우선순위: built-in safety net (`node_modules/` / `target/` / `__pycache__/` / `.venv/` / `venv/` / `env/`) > `.gitignore` > `.kebabignore`.
|
||||
- `[rag] prompt_template_version` (default `"rag-v2"`) — RAG system prompt version. `"rag-v1"` 은 legacy backwards-compat (사용자 명시 시 유지). v2 강화 규칙: (1) fact 인용 시 [#번호] 앞에 chunk 속 원문 큰따옴표 표기, (2) 학습 지식 동원 금지, (3) 근거 모호 시 "확실하지 않다" 명시.
|
||||
- `--config <path>` flag — 임시 워크스페이스 / 격리 테스트 시 사용. CLI / TUI 모두 honor.
|
||||
- `KEBAB_*` env — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN`, `KEBAB_COMMIT_HASH` 등).
|
||||
- XDG layout: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
- `workspace.root` 경로 형식: 절대 (`/foo/bar`) / tilde (`~/KnowledgeBase`, default) / env (`${XDG_DATA_HOME}/kebab`) / 상대 (`./notes`, `notes`, `../shared/x`) 모두 가능. **상대 경로의 base 는 config.toml 자체가 위치한 디렉토리** — 사용자의 `cwd` 와 무관 (`--config /tmp/cfg.toml` + `root = "kb"` → `/tmp/kb`). p9-fb-05 정책.
|
||||
|
||||
config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.
|
||||
|
||||
## 외부 AI 통합
|
||||
|
||||
`--json` 출력 + frozen wire schema v1 가 stable contract. 통합 옵션:
|
||||
|
||||
- **Claude Code skill** — repo 의 [`integrations/claude-code/`](integrations/claude-code/) 가 ship-ready skill. `cp -r integrations/claude-code/kebab ~/.claude/skills/` 한 번이면 새 Claude Code 세션부터 자동 trigger (내부 시스템 / 위키 lookup / 사내 runbook 질문). multi-turn 은 `kebab ask --session <id> --json` 으로 영속 — skill 이 conversation id 관리하면 외부 agent 도 `--repl` 없이 stateful 대화 가능 (p9-fb-18).
|
||||
- **Codex / 기타 agent host** — `--json` + frozen wire schema v1 가 stable contract. 동일 패턴으로 ~50줄 wrapper 작성 가능. `integrations/<host>/` 에 추가 PR 환영.
|
||||
- **MCP server** — stdio JSON-RPC 로 `kebab-app` facade 1:1 노출. `kebab mcp` 참조.
|
||||
- **HTTP wrapper** — `kebab serve --bind 127.0.0.1:7711` (P+, local-only 가치 신중).
|
||||
|
||||
## MCP 사용
|
||||
|
||||
`kebab mcp` 가 stdio MCP server. 8 tool: `search` / `bulk_search` (p9-fb-42 — N query 한 번에) / `ask` / `fetch` (p9-fb-35) / `schema` / `doctor` / `ingest_file` / `ingest_stdin`.
|
||||
|
||||
Claude Code 빠른 등록 (`~/.claude/mcp.json` 또는 host 동등 위치):
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"kebab": {
|
||||
"command": "kebab",
|
||||
"args": ["mcp"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
자세한 사용법 (Cursor / OpenAI Agents / Copilot CLI config, per-tool 입출력 예시, troubleshooting, multi-turn ask + session 관리, performance / security) — **[docs/mcp-usage.md](docs/mcp-usage.md)** 참조.
|
||||
crate-level 의존성 그래프 · 디렉토리 트리 · 확장자→chunker 전체 매핑 · 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md), 진척도는 [HANDOFF.md](HANDOFF.md).
|
||||
|
||||
## 비-목표
|
||||
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / 모든 파일 포맷의 완벽한 parsing / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding / `kebab://` protocol handler — frozen 설계 §11 / §0 참조.
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding — frozen 설계 §0 / §11 참조.
|
||||
|
||||
## 라이선스
|
||||
## 버전 / 라이선스 / 참고
|
||||
|
||||
`MIT OR Apache-2.0` (workspace `Cargo.toml` 의 `license` 필드).
|
||||
|
||||
## 참고
|
||||
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md)
|
||||
- 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)
|
||||
- Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md)
|
||||
- 머지 후 hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md)
|
||||
- Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md)
|
||||
- **버전**: v0.21.0 (`kebab --version` 으로 확인).
|
||||
- **라이선스**: `MIT OR Apache-2.0`.
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md) · 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) · Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md) · Hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md) · Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md) · MCP 사용: [docs/mcp-usage.md](docs/mcp-usage.md)
|
||||
|
||||
@@ -12,17 +12,22 @@ kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-source-fs = { path = "../kebab-source-fs" }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-types = { path = "../kebab-parse-types" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
kebab-chunk = { path = "../kebab-chunk" }
|
||||
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
|
||||
kebab-store-vector = { path = "../kebab-store-vector" }
|
||||
kebab-search = { path = "../kebab-search" }
|
||||
kebab-embed = { path = "../kebab-embed" }
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
kebab-embed-candle = { path = "../kebab-embed-candle" }
|
||||
kebab-embed-ollama = { path = "../kebab-embed-ollama" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
kebab-rag = { path = "../kebab-rag" }
|
||||
# p9-fb-41 PR-9c-2: facade construction of OnnxNliVerifier when
|
||||
# `[rag] nli_threshold > 0`. Trait-only consumption via kebab-rag's
|
||||
# `with_verifier`; no kebab-nli internals leak into kebab-app code
|
||||
# beyond the construction site in `open_with_config`.
|
||||
kebab-nli = { path = "../kebab-nli" }
|
||||
# P6-4: image extractor + OCR + caption adapters live here. App
|
||||
# threads them into the per-asset dispatch (see `ingest_one_asset`
|
||||
# image branch). Trait-only consumption — no `kebab-parse-image`
|
||||
@@ -32,6 +37,11 @@ kebab-parse-image = { path = "../kebab-parse-image" }
|
||||
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
|
||||
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
|
||||
lopdf = { workspace = true }
|
||||
# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
|
||||
# jpeg feature added explicitly (F3 closure-r1) rather than relying on
|
||||
# feature unification via kebab-parse-image.
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# p10-1A-2: Rust AST extractor lives here. App threads it into the
|
||||
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
|
||||
@@ -41,6 +51,7 @@ blake3 = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
@@ -58,21 +69,40 @@ unicode-normalization = "0.1"
|
||||
ignore = "0.4"
|
||||
# p9-fb-34: opaque pagination cursor encodes payload as base64.
|
||||
base64 = { workspace = true }
|
||||
# Enhancement 3 (v0.20.x r2): direct SQL queries for inspect_ocr_stats/failures.
|
||||
rusqlite = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
# doc-side expansion (Phase 2) Task 4: ExpansionGenerator unit tests build
|
||||
# MockLanguageModel (gated behind kebab-llm's `mock` feature, default OFF in
|
||||
# [dependencies]). Enabling it here turns it on for the test build only.
|
||||
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
|
||||
rusqlite = { workspace = true }
|
||||
filetime = "0.2"
|
||||
tempfile = { workspace = true }
|
||||
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
|
||||
# / caption HTTP calls. Async runtime to host the mock server only;
|
||||
# the kb-app code under test stays sync.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
|
||||
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
|
||||
# to the same major (0.32) so byte output is identical between the two
|
||||
# fixture surfaces.
|
||||
lopdf = "0.32"
|
||||
lopdf = { workspace = true }
|
||||
# error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
|
||||
# reqwest::Error (private constructor) — built from a connect-refused call.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
|
||||
[features]
|
||||
# Marker feature — spec §6.3 Option A (단순): lindera 는 kebab-chunk 가 default dep 으로 소유.
|
||||
# disable path 없음; 이 feature 는 spec §6.3 명시를 honor 하는 role 만.
|
||||
default = ["fts_korean_morphological"]
|
||||
fts_korean_morphological = []
|
||||
# opt-in (macOS): candle embedder runs on the Apple Silicon GPU. See kebab-embed-candle.
|
||||
embed_metal = ["kebab-embed-candle/metal"]
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
@@ -40,11 +40,19 @@ use anyhow::{Context, Result, anyhow};
|
||||
use lru::LruCache;
|
||||
|
||||
use kebab_core::{
|
||||
Answer, DocumentStore, Embedder, IndexVersion, LanguageModel, Retriever, SearchHit,
|
||||
SearchMode, SearchOpts, SearchQuery, VectorStore,
|
||||
Answer, DocumentStore, Embedder, ExtractContext, Extractor, IndexVersion, LanguageModel,
|
||||
MediaType, Retriever, SearchHit, SearchMode, SearchOpts, SearchQuery, VectorStore,
|
||||
};
|
||||
use kebab_embed_candle::CandleEmbedder;
|
||||
use kebab_embed_local::FastembedEmbedder;
|
||||
use kebab_embed_ollama::OllamaEmbedder;
|
||||
use kebab_llm_local::OllamaLanguageModel;
|
||||
use kebab_parse_code::{
|
||||
CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor,
|
||||
KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor,
|
||||
};
|
||||
use kebab_parse_image::ImageExtractor;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use kebab_rag::{AskOpts, RagPipeline};
|
||||
use kebab_search::{HybridRetriever, LexicalRetriever, VectorRetriever};
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
@@ -73,6 +81,14 @@ pub struct SearchResponse {
|
||||
/// p9-fb-37: present when caller passed `SearchOpts.trace = true`.
|
||||
/// Consumers that ignore trace should leave this `None`.
|
||||
pub trace: Option<kebab_core::SearchTrace>,
|
||||
/// v0.17.0 A5 Step 4b: human / agent-readable advisory string set
|
||||
/// when the empty hit list is likely due to a query shorter than the
|
||||
/// FTS5 trigram tokenizer's 3-char minimum. `None` otherwise. CLI
|
||||
/// surfaces it on stderr (text mode); MCP / `--json` consumers
|
||||
/// surface it however they prefer. See
|
||||
/// `docs/superpowers/specs/2026-05-22-korean-trigram-tokenizer-design.md`
|
||||
/// §3.3.
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// Facade state — see module docs for lifetime rules.
|
||||
@@ -84,6 +100,12 @@ pub struct SearchResponse {
|
||||
pub struct App {
|
||||
pub(crate) config: kebab_config::Config,
|
||||
pub(crate) sqlite: Arc<SqliteStore>,
|
||||
/// post-v0.18.0 extractor-dispatch-unification: polymorphic Extractor
|
||||
/// registry. App init 시 1회 등록되어 `extract_for(...)` 가 lookup
|
||||
/// 한다. 현재 11 entry (ImageExtractor + PdfTextExtractor + 9 AST).
|
||||
/// MarkdownExtractor 는 별 PR 에서 추가 — markdown ingest path 는
|
||||
/// 본 PR 에서 free-function 그대로 유지.
|
||||
pub(crate) extractors: Vec<Box<dyn Extractor + Send + Sync>>,
|
||||
/// Memoized embedder — built lazily on first `embedder()` call when
|
||||
/// embeddings are enabled. `OnceLock` keeps the struct `Sync` and
|
||||
/// the build path cold-only-once.
|
||||
@@ -102,6 +124,17 @@ pub struct App {
|
||||
/// `corpus_revision` snapshot embedded in `SearchCacheKey`
|
||||
/// invalidates every entry the moment a new ingest commit lands.
|
||||
search_cache: Option<Mutex<LruCache<SearchCacheKey, Vec<SearchHit>>>>,
|
||||
/// p9-fb-41 PR-9c-2: NLI verifier built eagerly at
|
||||
/// `open_with_config` time when `config.rag.nli_threshold > 0`,
|
||||
/// consumed by `RagPipeline::with_verifier` on every `ask` /
|
||||
/// `ask_with_session` call. `None` when the gate is disabled
|
||||
/// (default, threshold = 0) — multi-hop skips step 8.5 entirely
|
||||
/// and single-pass never touches the verifier.
|
||||
///
|
||||
/// Built eagerly (not lazy) so the `open_with_config` `?`
|
||||
/// propagation surfaces NLI model construction errors at App
|
||||
/// boot time, before any user query runs.
|
||||
pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>>,
|
||||
}
|
||||
|
||||
/// p9-fb-19: cache key for `App::search`. Includes every field that
|
||||
@@ -158,20 +191,108 @@ impl App {
|
||||
sqlite
|
||||
.run_migrations()
|
||||
.context("kb-app: run SqliteStore migrations")?;
|
||||
// V009 의 tokenized_korean_text column 의 first-boot eager backfill.
|
||||
// 신규 ingest 의 chunks_ai trigger 가 이미 채우므로 NULL row 가 없으면 즉시 0 반환 (idempotent).
|
||||
// V007 → V009 업그레이드 시 KB 크기 비례 (~10000 chunk 당 ~30-60s).
|
||||
let backfill_count = sqlite
|
||||
.backfill_tokenized_korean_text(
|
||||
|done, total| {
|
||||
if total > 0 && done % 500 == 0 {
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill: {done}/{total}"
|
||||
);
|
||||
}
|
||||
},
|
||||
kebab_chunk::tokenize_korean_morphological,
|
||||
)
|
||||
.unwrap_or_else(|e| {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill failed: {e}"
|
||||
);
|
||||
0
|
||||
});
|
||||
if backfill_count > 0 {
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
"korean tokenizer backfill complete: {backfill_count} chunks updated"
|
||||
);
|
||||
}
|
||||
// p9-fb-19: build the LRU cache from config. Capacity 0 →
|
||||
// `None` (cache disabled — every search hits the retrievers).
|
||||
let search_cache = NonZeroUsize::new(config.search.cache_capacity)
|
||||
.map(|cap| Mutex::new(LruCache::new(cap)));
|
||||
// post-v0.18.0 extractor-dispatch-unification: build the 11-entry
|
||||
// Extractor registry. All entries are state-less unit structs with
|
||||
// zero-cost `new()`, so init cost is effectively 0 and side effects
|
||||
// are 0 — `pipeline_verifier` fallible `?` below may bail but the
|
||||
// already-constructed `extractors` Vec drops without cost. Markdown
|
||||
// is NOT registered (see field doc).
|
||||
let extractors: Vec<Box<dyn Extractor + Send + Sync>> = vec![
|
||||
Box::new(ImageExtractor::new()),
|
||||
Box::new(PdfTextExtractor::new()),
|
||||
Box::new(RustAstExtractor::new()),
|
||||
Box::new(PythonAstExtractor::new()),
|
||||
Box::new(TypescriptAstExtractor::new()),
|
||||
Box::new(JavascriptAstExtractor::new()),
|
||||
Box::new(GoAstExtractor::new()),
|
||||
Box::new(JavaAstExtractor::new()),
|
||||
Box::new(KotlinAstExtractor::new()),
|
||||
Box::new(CAstExtractor::new()),
|
||||
Box::new(CppAstExtractor::new()),
|
||||
];
|
||||
// p9-fb-41 PR-9c-2: build the NLI verifier when the gate is
|
||||
// enabled. App carries it on `RagPipeline` via
|
||||
// `with_verifier` so the rag crate doesn't have to know about
|
||||
// kebab-nli construction. Failure (`?`) surfaces as a user-
|
||||
// facing error at App boot — never a panic in the pipeline's
|
||||
// `expect("verifier must be Some when nli_threshold > 0.0")`.
|
||||
let pipeline_verifier: Option<Arc<dyn kebab_nli::NliVerifier>> = if config.rag.nli_threshold
|
||||
> 0.0
|
||||
{
|
||||
let v = kebab_nli::OnnxNliVerifier::new(&config)
|
||||
.context("kebab-app: construct OnnxNliVerifier (config.rag.nli_threshold > 0)")?;
|
||||
Some(Arc::new(v))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
config,
|
||||
sqlite: Arc::new(sqlite),
|
||||
extractors,
|
||||
embedder: OnceLock::new(),
|
||||
vector: OnceLock::new(),
|
||||
llm: OnceLock::new(),
|
||||
search_cache,
|
||||
pipeline_verifier,
|
||||
})
|
||||
}
|
||||
|
||||
/// Polymorphic dispatcher for the [`Extractor`] trait. Looks up the
|
||||
/// first Extractor whose `supports(media)` returns true and invokes
|
||||
/// `extract(ctx, bytes)` on it.
|
||||
///
|
||||
/// Errors with `anyhow!("no Extractor for media_type {media:?}")`
|
||||
/// when no matching Extractor is registered. Callers in
|
||||
/// `ingest_one_*_asset` reach this only after the outer 4-arm
|
||||
/// dispatch (`MediaType::Markdown` / `Image` / `Pdf` / `Code(lang)`)
|
||||
/// has matched, so a miss is a programming error — NOT a user-
|
||||
/// facing skip.
|
||||
pub(crate) fn extract_for(
|
||||
&self,
|
||||
media: &MediaType,
|
||||
ctx: &ExtractContext<'_>,
|
||||
bytes: &[u8],
|
||||
) -> Result<kebab_core::CanonicalDocument> {
|
||||
let extractor = self
|
||||
.extractors
|
||||
.iter()
|
||||
.find(|e| e.supports(media))
|
||||
.ok_or_else(|| anyhow!("no Extractor for media_type {media:?}"))?;
|
||||
extractor.extract(ctx, bytes)
|
||||
}
|
||||
|
||||
/// Run a [`SearchQuery`] through the configured retriever stack and
|
||||
/// return the top-k hits. p9-fb-19: result is served from the
|
||||
/// in-process LRU cache when the same `(query_norm, mode, k,
|
||||
@@ -235,7 +356,9 @@ impl App {
|
||||
// so other in-flight searches can use the cache concurrently.
|
||||
drop(guard);
|
||||
let hits = self.search_uncached(query)?;
|
||||
let mut guard = cache.lock().unwrap_or_else(|e| e.into_inner());
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.put(key, hits.clone());
|
||||
Ok(hits)
|
||||
}
|
||||
@@ -315,11 +438,7 @@ impl App {
|
||||
///
|
||||
/// `SearchResponse.next_cursor` and `truncated` are independent
|
||||
/// signals — see `SearchResponse` doc for details.
|
||||
pub fn search_with_opts(
|
||||
&self,
|
||||
query: SearchQuery,
|
||||
opts: SearchOpts,
|
||||
) -> Result<SearchResponse> {
|
||||
pub fn search_with_opts(&self, query: SearchQuery, opts: SearchOpts) -> Result<SearchResponse> {
|
||||
use crate::cursor;
|
||||
|
||||
let corpus_revision = self.sqlite.corpus_revision().to_string();
|
||||
@@ -404,12 +523,11 @@ impl App {
|
||||
// Apply offset + k_effective truncation (mirrors non-trace path).
|
||||
let drop_n = offset.min(traced_hits.len());
|
||||
traced_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
traced_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = traced_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Snippet truncation if opts.snippet_chars set (mirror non-trace path).
|
||||
if opts.snippet_chars.is_some() {
|
||||
for h in hits.iter_mut() {
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > snippet_chars {
|
||||
h.snippet = trim_to_chars(&h.snippet, snippet_chars);
|
||||
}
|
||||
@@ -418,11 +536,13 @@ impl App {
|
||||
|
||||
// Trace path skips the budget loop. Caller will inspect
|
||||
// `hits.len()` and `trace.timing` rather than paginate.
|
||||
let hint: Option<String> = None;
|
||||
return Ok(SearchResponse {
|
||||
hits,
|
||||
next_cursor: None,
|
||||
truncated: false,
|
||||
trace: Some(trace),
|
||||
hint,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -434,15 +554,14 @@ impl App {
|
||||
// Skip offset.
|
||||
let drop_n = offset.min(all_hits.len());
|
||||
all_hits.drain(..drop_n);
|
||||
let mut hits: Vec<SearchHit> =
|
||||
all_hits.into_iter().take(k_effective).collect();
|
||||
let mut hits: Vec<SearchHit> = all_hits.into_iter().take(k_effective).collect();
|
||||
|
||||
// Apply snippet_chars override if shorter than what the
|
||||
// retriever returned (retriever already honored
|
||||
// `config.search.snippet_chars`; this only kicks in when the
|
||||
// caller asked for *less*).
|
||||
if opts.snippet_chars.is_some() {
|
||||
for h in hits.iter_mut() {
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > snippet_chars {
|
||||
h.snippet = trim_to_chars(&h.snippet, snippet_chars);
|
||||
}
|
||||
@@ -456,15 +575,11 @@ impl App {
|
||||
// Step 1: shorten snippets progressively to a 60-char floor.
|
||||
const SNIPPET_FLOOR: usize = 60;
|
||||
let mut current_snippet_cap = snippet_chars;
|
||||
while estimate_chars(&hits) > max_chars
|
||||
&& current_snippet_cap > SNIPPET_FLOOR
|
||||
{
|
||||
current_snippet_cap =
|
||||
(current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
for h in hits.iter_mut() {
|
||||
while estimate_chars(&hits) > max_chars && current_snippet_cap > SNIPPET_FLOOR {
|
||||
current_snippet_cap = (current_snippet_cap / 2).max(SNIPPET_FLOOR);
|
||||
for h in &mut hits {
|
||||
if h.snippet.chars().count() > current_snippet_cap {
|
||||
h.snippet =
|
||||
trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
h.snippet = trim_to_chars(&h.snippet, current_snippet_cap);
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
@@ -505,11 +620,13 @@ impl App {
|
||||
None
|
||||
};
|
||||
|
||||
let hint: Option<String> = None;
|
||||
Ok(SearchResponse {
|
||||
hits,
|
||||
next_cursor,
|
||||
truncated,
|
||||
trace: None,
|
||||
hint,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -518,11 +635,27 @@ impl App {
|
||||
pub fn ask(&self, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
let retriever = self.build_retriever(opts.mode)?;
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
let pipeline = self.build_pipeline(retriever, llm);
|
||||
pipeline.ask(query, opts)
|
||||
}
|
||||
|
||||
/// p9-fb-41 PR-9c-2: shared pipeline builder used by [`Self::ask`]
|
||||
/// and [`Self::ask_with_session`]. Attaches the App-built NLI
|
||||
/// verifier (when `cfg.rag.nli_threshold > 0`) via
|
||||
/// `RagPipeline::with_verifier`, keeping the construction site in
|
||||
/// a single place so the two call paths can't drift.
|
||||
fn build_pipeline(
|
||||
&self,
|
||||
retriever: Arc<dyn Retriever>,
|
||||
llm: Arc<dyn LanguageModel>,
|
||||
) -> RagPipeline {
|
||||
let pipeline = RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
match &self.pipeline_verifier {
|
||||
Some(v) => pipeline.with_verifier(v.clone()),
|
||||
None => pipeline,
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-18: shared retriever-stack builder used by [`Self::ask`]
|
||||
/// and [`Self::ask_with_session`]. Lexical mode uses the FTS5
|
||||
/// retriever directly; vector / hybrid require embeddings (and
|
||||
@@ -587,12 +720,7 @@ impl App {
|
||||
/// returns; on persistence error, the answer is still returned
|
||||
/// (don't lose the user's compute) but the error is logged so
|
||||
/// the operator notices.
|
||||
pub fn ask_with_session(
|
||||
&self,
|
||||
session_id: &str,
|
||||
query: &str,
|
||||
opts: AskOpts,
|
||||
) -> Result<Answer> {
|
||||
pub fn ask_with_session(&self, session_id: &str, query: &str, opts: AskOpts) -> Result<Answer> {
|
||||
use kebab_core::traits::{ChatSessionRepo, ChatSessionRow, ChatTurnRow};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
@@ -625,17 +753,13 @@ impl App {
|
||||
|
||||
// p9-fb-18 R1: shared retriever builder removes the prior
|
||||
// copy of `ask`'s 35-line stack — see [`Self::build_retriever`].
|
||||
// p9-fb-41 PR-9c-2: shared `build_pipeline` attaches the NLI
|
||||
// verifier when the gate is enabled.
|
||||
let retriever = self.build_retriever(opts.mode)?;
|
||||
let llm = self.llm()?;
|
||||
let pipeline =
|
||||
RagPipeline::new(self.config.clone(), retriever, llm, self.sqlite.clone());
|
||||
let answer = pipeline.ask_with_history(
|
||||
query,
|
||||
history,
|
||||
session_id.to_string(),
|
||||
next_index,
|
||||
opts,
|
||||
)?;
|
||||
let pipeline = self.build_pipeline(retriever, llm);
|
||||
let answer =
|
||||
pipeline.ask_with_history(query, history, session_id.to_string(), next_index, opts)?;
|
||||
|
||||
// Auto-create the session header on first use. Title from
|
||||
// the first question (≤40 chars after trim).
|
||||
@@ -676,7 +800,8 @@ impl App {
|
||||
turn_index: next_index,
|
||||
question: query.to_string(),
|
||||
answer: answer.answer.clone(),
|
||||
citations_json: serde_json::to_string(&answer.citations).unwrap_or_else(|_| "[]".to_string()),
|
||||
citations_json: serde_json::to_string(&answer.citations)
|
||||
.unwrap_or_else(|_| "[]".to_string()),
|
||||
created_at: now_unix,
|
||||
};
|
||||
if let Err(e) = self.sqlite.append_turn(&turn_row) {
|
||||
@@ -710,10 +835,31 @@ impl App {
|
||||
if let Some(e) = self.embedder.get() {
|
||||
return Ok(Some(e.clone()));
|
||||
}
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = Arc::new(
|
||||
FastembedEmbedder::new(&self.config)
|
||||
.context("kb-app: load FastembedEmbedder")?,
|
||||
);
|
||||
// Provider branch (Track 1 spec §3 + arctic-embedder spec). The
|
||||
// `embeddings_disabled()` check above already handled `"none"`; here we
|
||||
// route the live providers. `fastembed`/`onnx`/(empty) keep the default
|
||||
// onnxruntime path (vectors unchanged — `embedding_version` is
|
||||
// preserved); `candle` selects the pure-Rust NUMA-safe backend (e5 or
|
||||
// arctic via its model registry); `ollama` offloads to a remote
|
||||
// `/api/embed` daemon.
|
||||
let provider = self.config.models.embedding.provider.as_str();
|
||||
let emb: Arc<dyn Embedder + Send + Sync> = match provider {
|
||||
"fastembed" | "onnx" | "" => Arc::new(
|
||||
FastembedEmbedder::new(&self.config).context("kb-app: load FastembedEmbedder")?,
|
||||
),
|
||||
"candle" => Arc::new(
|
||||
CandleEmbedder::new(&self.config).context("kb-app: load CandleEmbedder")?,
|
||||
),
|
||||
"ollama" => Arc::new(
|
||||
OllamaEmbedder::new(&self.config).context("kb-app: load OllamaEmbedder")?,
|
||||
),
|
||||
other => {
|
||||
return Err(anyhow!(
|
||||
"kb-app: unknown embedding provider {other:?}; expected one of \
|
||||
`fastembed` (default), `candle`, `ollama`, or `none` (lexical-only)"
|
||||
));
|
||||
}
|
||||
};
|
||||
// `set` returns Err if another thread won the race; in that case
|
||||
// the loser still returns the (now-cached) winner via `get()`.
|
||||
let _ = self.embedder.set(emb.clone());
|
||||
@@ -788,7 +934,9 @@ impl App {
|
||||
/// clear` admin command). No-op when the cache is disabled.
|
||||
pub fn clear_search_cache(&self) {
|
||||
if let Some(cache) = self.search_cache.as_ref() {
|
||||
let mut guard = cache.lock().unwrap_or_else(|e| e.into_inner());
|
||||
let mut guard = cache
|
||||
.lock()
|
||||
.unwrap_or_else(std::sync::PoisonError::into_inner);
|
||||
guard.clear();
|
||||
}
|
||||
}
|
||||
@@ -809,8 +957,8 @@ impl App {
|
||||
/// git tree) correctly keep `repo: None` — `Metadata.repo` is already
|
||||
/// `None` for those, so the assignment is a no-op.
|
||||
fn backfill_repo(&self, hits: &mut [SearchHit]) {
|
||||
use std::collections::HashMap;
|
||||
use kebab_core::DocumentId;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// doc_id → Option<String> where None means "not found / no repo"
|
||||
let mut cache: HashMap<DocumentId, Option<String>> = HashMap::new();
|
||||
@@ -819,26 +967,24 @@ impl App {
|
||||
if hit.repo.is_some() {
|
||||
continue;
|
||||
}
|
||||
let repo_val = cache
|
||||
.entry(hit.doc_id.clone())
|
||||
.or_insert_with(|| {
|
||||
// Deliberately non-aborting: a failed store lookup for
|
||||
// one hit must not abort the whole search response. Log
|
||||
// the error so it's observable rather than silently
|
||||
// dropped (review #140 round 1).
|
||||
match self.sqlite.get_document(&hit.doc_id) {
|
||||
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
doc_id = %hit.doc_id,
|
||||
error = %e,
|
||||
"backfill_repo: get_document failed; leaving hit.repo = None"
|
||||
);
|
||||
None
|
||||
}
|
||||
let repo_val = cache.entry(hit.doc_id.clone()).or_insert_with(|| {
|
||||
// Deliberately non-aborting: a failed store lookup for
|
||||
// one hit must not abort the whole search response. Log
|
||||
// the error so it's observable rather than silently
|
||||
// dropped (review #140 round 1).
|
||||
match self.sqlite.get_document(&hit.doc_id) {
|
||||
Ok(opt) => opt.and_then(|doc| doc.metadata.repo),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
doc_id = %hit.doc_id,
|
||||
error = %e,
|
||||
"backfill_repo: get_document failed; leaving hit.repo = None"
|
||||
);
|
||||
None
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
if let Some(r) = repo_val {
|
||||
hit.repo = Some(r.clone());
|
||||
}
|
||||
@@ -849,10 +995,7 @@ impl App {
|
||||
/// "switch to --mode lexical" error when embeddings are disabled.
|
||||
fn require_embeddings(
|
||||
&self,
|
||||
) -> Result<(
|
||||
Arc<dyn Embedder + Send + Sync>,
|
||||
Arc<LanceVectorStore>,
|
||||
)> {
|
||||
) -> Result<(Arc<dyn Embedder + Send + Sync>, Arc<LanceVectorStore>)> {
|
||||
let emb = self.embedder()?.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"embeddings disabled (config.models.embedding.provider == \"none\" \
|
||||
@@ -874,8 +1017,16 @@ impl App {
|
||||
/// the active config. This token surfaces in `SearchHit.index_version`
|
||||
/// and on snapshot tests; including the chunker version pins it to
|
||||
/// the chunking policy in effect.
|
||||
///
|
||||
/// V009 (2026-05-28): FTS5 tokenizer 가 trigram → unicode61 + 한국어
|
||||
/// 형태소 분해 column 로 갱신됨. `fts5-v009-korean-morphological`
|
||||
/// suffix 가 V007 baseline 과 구별되어 eval runner 의 config
|
||||
/// snapshot 및 search cache 무효화에 picks up 된다.
|
||||
fn lexical_index_version(config: &kebab_config::Config) -> IndexVersion {
|
||||
IndexVersion(format!("lex:{}", config.chunking.chunker_version))
|
||||
IndexVersion(format!(
|
||||
"lex:{}:fts5-v009-korean-morphological",
|
||||
config.chunking.chunker_version
|
||||
))
|
||||
}
|
||||
|
||||
/// p9-fb-37: stand-in for the vector retriever in the trace path when
|
||||
@@ -979,6 +1130,223 @@ fn backfill_code_lang(hits: &mut [SearchHit]) {
|
||||
}
|
||||
}
|
||||
|
||||
// ── v0.20.x r2 Enhancement 3: OCR stats + failures inspect ──────────────
|
||||
|
||||
/// Wire type for `kebab inspect ocr-stats --json` (`ocr_stats.v1`).
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrStatsV1 {
|
||||
pub schema_version: &'static str,
|
||||
pub total_events: u64,
|
||||
pub total_runs: u64,
|
||||
pub success_count: u64,
|
||||
pub failure_count: u64,
|
||||
pub success_rate: f64,
|
||||
pub p50_ms: Option<u64>,
|
||||
pub p90_ms: Option<u64>,
|
||||
pub p99_ms: Option<u64>,
|
||||
pub max_ms: Option<u64>,
|
||||
pub by_engine: std::collections::BTreeMap<String, u64>,
|
||||
pub by_doc: Vec<OcrStatsByDoc>,
|
||||
}
|
||||
|
||||
/// Per-doc breakdown row inside `OcrStatsV1`.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrStatsByDoc {
|
||||
pub doc_id: String,
|
||||
pub failure_count: u64,
|
||||
pub success_count: u64,
|
||||
pub p90_ms: Option<u64>,
|
||||
}
|
||||
|
||||
/// Wire type for `kebab inspect ocr-failures --json` (`ocr_failures.v1`).
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrFailuresV1 {
|
||||
pub schema_version: &'static str,
|
||||
pub doc_id: Option<String>,
|
||||
pub failure_count: u64,
|
||||
pub failures: Vec<OcrFailureRow>,
|
||||
}
|
||||
|
||||
/// Single failure row inside `OcrFailuresV1`.
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct OcrFailureRow {
|
||||
pub ts: String,
|
||||
pub page: u32,
|
||||
pub ms: u64,
|
||||
pub reason: String,
|
||||
pub image_byte_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
/// Corpus-wide OCR statistics from the `pdf_ocr_events` SQLite mirror.
|
||||
pub fn inspect_ocr_stats(&self) -> Result<OcrStatsV1> {
|
||||
self.inspect_ocr_stats_with_config(&self.config)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_ocr_stats_with_config(&self, _cfg: &kebab_config::Config) -> Result<OcrStatsV1> {
|
||||
use crate::ingest_log::percentiles;
|
||||
let conn = self.sqlite.read_conn();
|
||||
|
||||
// 1. Aggregate counters
|
||||
let (total_events, success_count, failure_count, total_runs): (u64, u64, u64, u64) = conn
|
||||
.query_row(
|
||||
"SELECT COUNT(*), \
|
||||
SUM(CASE WHEN success=1 THEN 1 ELSE 0 END), \
|
||||
SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \
|
||||
COUNT(DISTINCT run_id) \
|
||||
FROM pdf_ocr_events",
|
||||
[],
|
||||
|r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
|
||||
)
|
||||
.unwrap_or((0, 0, 0, 0));
|
||||
|
||||
let success_rate = if total_events == 0 {
|
||||
0.0
|
||||
} else {
|
||||
success_count as f64 / total_events as f64
|
||||
};
|
||||
|
||||
// 2. Latency percentiles from successful events
|
||||
let samples: Vec<u64> = {
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT ms FROM pdf_ocr_events WHERE success=1 ORDER BY ms")
|
||||
.context("prepare ms query")?;
|
||||
stmt.query_map([], |r| r.get::<_, u64>(0))
|
||||
.context("query ms")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
let (p50_ms, p90_ms, p99_ms, max_ms) = percentiles(&samples);
|
||||
|
||||
// 3. Engine breakdown
|
||||
let mut by_engine = std::collections::BTreeMap::new();
|
||||
{
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT ocr_engine, COUNT(*) FROM pdf_ocr_events GROUP BY ocr_engine")
|
||||
.context("prepare engine query")?;
|
||||
let rows = stmt
|
||||
.query_map([], |r| Ok((r.get::<_, String>(0)?, r.get::<_, u64>(1)?)))
|
||||
.context("query engine")?;
|
||||
for row in rows.filter_map(Result::ok) {
|
||||
by_engine.insert(row.0, row.1);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Top-10 docs by failure count
|
||||
let by_doc: Vec<OcrStatsByDoc> = {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT doc_id, \
|
||||
SUM(CASE WHEN success=0 THEN 1 ELSE 0 END), \
|
||||
SUM(CASE WHEN success=1 THEN 1 ELSE 0 END) \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE doc_id IS NOT NULL \
|
||||
GROUP BY doc_id \
|
||||
ORDER BY 2 DESC \
|
||||
LIMIT 10",
|
||||
)
|
||||
.context("prepare by_doc query")?;
|
||||
stmt.query_map([], |r| {
|
||||
Ok(OcrStatsByDoc {
|
||||
doc_id: r.get(0)?,
|
||||
failure_count: r.get(1)?,
|
||||
success_count: r.get(2)?,
|
||||
p90_ms: None, // per-doc p90 deferred (open question #3)
|
||||
})
|
||||
})
|
||||
.context("query by_doc")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
|
||||
Ok(OcrStatsV1 {
|
||||
schema_version: "ocr_stats.v1",
|
||||
total_events,
|
||||
total_runs,
|
||||
success_count,
|
||||
failure_count,
|
||||
success_rate,
|
||||
p50_ms,
|
||||
p90_ms,
|
||||
p99_ms,
|
||||
max_ms,
|
||||
by_engine,
|
||||
by_doc,
|
||||
})
|
||||
}
|
||||
|
||||
/// Recent OCR failure rows, optionally filtered by `doc_id`.
|
||||
pub fn inspect_ocr_failures(
|
||||
&self,
|
||||
doc_id: Option<&str>,
|
||||
limit: usize,
|
||||
) -> Result<OcrFailuresV1> {
|
||||
self.inspect_ocr_failures_with_config(&self.config, doc_id, limit)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn inspect_ocr_failures_with_config(
|
||||
&self,
|
||||
_cfg: &kebab_config::Config,
|
||||
doc_id: Option<&str>,
|
||||
limit: usize,
|
||||
) -> Result<OcrFailuresV1> {
|
||||
let conn = self.sqlite.read_conn();
|
||||
let failures: Vec<OcrFailureRow> = if let Some(did) = doc_id {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE success=0 AND doc_id=? \
|
||||
ORDER BY ts DESC \
|
||||
LIMIT ?",
|
||||
)
|
||||
.context("prepare failures by doc_id")?;
|
||||
stmt.query_map(rusqlite::params![did, limit as i64], |r| {
|
||||
Ok(OcrFailureRow {
|
||||
ts: r.get(0)?,
|
||||
page: r.get(1)?,
|
||||
ms: r.get(2)?,
|
||||
reason: r.get(3)?,
|
||||
image_byte_size: r.get(4)?,
|
||||
})
|
||||
})
|
||||
.context("query failures by doc_id")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
} else {
|
||||
let mut stmt = conn
|
||||
.prepare(
|
||||
"SELECT ts, page, ms, COALESCE(reason,'unknown'), image_byte_size \
|
||||
FROM pdf_ocr_events \
|
||||
WHERE success=0 \
|
||||
ORDER BY ts DESC \
|
||||
LIMIT ?",
|
||||
)
|
||||
.context("prepare failures corpus-wide")?;
|
||||
stmt.query_map(rusqlite::params![limit as i64], |r| {
|
||||
Ok(OcrFailureRow {
|
||||
ts: r.get(0)?,
|
||||
page: r.get(1)?,
|
||||
ms: r.get(2)?,
|
||||
reason: r.get(3)?,
|
||||
image_byte_size: r.get(4)?,
|
||||
})
|
||||
})
|
||||
.context("query failures corpus-wide")?
|
||||
.filter_map(Result::ok)
|
||||
.collect()
|
||||
};
|
||||
Ok(OcrFailuresV1 {
|
||||
schema_version: "ocr_failures.v1",
|
||||
doc_id: doc_id.map(String::from),
|
||||
failure_count: failures.len() as u64,
|
||||
failures,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1077,3 +1445,128 @@ mod tests_trace {
|
||||
assert!(resp.trace.is_some(), "trace populated when opts.trace=true");
|
||||
}
|
||||
}
|
||||
|
||||
/// post-v0.18.0 extractor-dispatch-unification: in-crate unit tests for
|
||||
/// the `App.extractors` registry + `App::extract_for` polymorphic
|
||||
/// dispatch. In-crate (not `tests/`) because `extractors` + `extract_for`
|
||||
/// are `pub(crate)` — integration tests cannot reach them.
|
||||
///
|
||||
/// Spec §5.1 + plan §2 Step 10 — 3 test class:
|
||||
/// 1. registry length = 11 (image + pdf + 9 AST).
|
||||
/// 2. mutually-exclusive `supports()` grid over 16 sample MediaTypes.
|
||||
/// 3. `extract_for` returns `Err("no Extractor ...")` for registry-NOT-cover
|
||||
/// MediaType (Audio).
|
||||
#[cfg(test)]
|
||||
mod tests_extractor_dispatch {
|
||||
use super::*;
|
||||
use kebab_core::{AudioType, ExtractConfig, ImageType};
|
||||
|
||||
/// helper: tempdir-isolated App for tests (mirrors `tests_trace`'s
|
||||
/// `open_app_with_temp_dir` pattern).
|
||||
fn open_app_with_temp_dir() -> (tempfile::TempDir, App) {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
|
||||
// Bring up migrations.
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
let app = App::open_with_config(cfg).unwrap();
|
||||
(dir, app)
|
||||
}
|
||||
|
||||
/// Registry length invariant: 11 Extractor (image + pdf + 9 AST).
|
||||
/// Markdown is NOT registered (free-function path — defer to a
|
||||
/// separate PR per spec §3.4).
|
||||
#[test]
|
||||
fn registry_has_eleven_extractors() {
|
||||
let (_dir, app) = open_app_with_temp_dir();
|
||||
assert_eq!(
|
||||
app.extractors.len(),
|
||||
11,
|
||||
"registry must hold 11 Extractors (image + pdf + 9 AST). \
|
||||
markdown 은 별 PR."
|
||||
);
|
||||
}
|
||||
|
||||
/// 11 Extractor 의 `supports()` 가 16 sample MediaType 에 대해
|
||||
/// mutually exclusive — 어떤 두 Extractor 도 동일 MediaType 에
|
||||
/// 대해 true 반환 안 됨.
|
||||
#[test]
|
||||
fn supports_grid_is_mutually_exclusive() {
|
||||
let (_dir, app) = open_app_with_temp_dir();
|
||||
let samples = vec![
|
||||
MediaType::Markdown,
|
||||
MediaType::Pdf,
|
||||
MediaType::Image(ImageType::Png),
|
||||
MediaType::Image(ImageType::Jpeg),
|
||||
MediaType::Code("rust".into()),
|
||||
MediaType::Code("python".into()),
|
||||
MediaType::Code("typescript".into()),
|
||||
MediaType::Code("javascript".into()),
|
||||
MediaType::Code("go".into()),
|
||||
MediaType::Code("java".into()),
|
||||
MediaType::Code("kotlin".into()),
|
||||
MediaType::Code("c".into()),
|
||||
MediaType::Code("cpp".into()),
|
||||
MediaType::Code("yaml".into()), // registry NOT cover
|
||||
MediaType::Code("shell".into()), // registry NOT cover
|
||||
MediaType::Audio(AudioType::Wav), // registry NOT cover
|
||||
];
|
||||
for sample in &samples {
|
||||
let hits: Vec<_> = app
|
||||
.extractors
|
||||
.iter()
|
||||
.filter(|e| e.supports(sample))
|
||||
.collect();
|
||||
assert!(
|
||||
hits.len() <= 1,
|
||||
"mutually exclusive violated for {sample:?}: {} hits",
|
||||
hits.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// `extract_for` 가 registry NOT cover MediaType (Audio) 에 대해
|
||||
/// `Err("no Extractor for media_type ...")` 반환. Audio MediaType
|
||||
/// 사용으로 RawAsset 의 actual content 의존 회피 — registry NOT
|
||||
/// cover → 즉시 Err.
|
||||
#[test]
|
||||
fn extract_for_unsupported_media_errors() {
|
||||
let (_dir, app) = open_app_with_temp_dir();
|
||||
|
||||
// Minimal RawAsset. Actual content never read — Audio MediaType
|
||||
// 는 registry NOT cover → `extract_for` 가 dispatch loop 안에서
|
||||
// 바로 Err 반환. RawAsset field set 은 `crates/kebab-core/src/
|
||||
// asset.rs:62-73` 와 정합 (8 field).
|
||||
let asset = kebab_core::RawAsset {
|
||||
asset_id: kebab_core::AssetId("00".repeat(16)),
|
||||
source_uri: kebab_core::SourceUri::File("/tmp/dummy.wav".into()),
|
||||
workspace_path: kebab_core::WorkspacePath("dummy.wav".to_string()),
|
||||
media_type: MediaType::Audio(AudioType::Wav),
|
||||
byte_len: 0,
|
||||
checksum: kebab_core::Checksum("00".repeat(32)),
|
||||
discovered_at: time::OffsetDateTime::now_utc(),
|
||||
// AssetStorage::Inline 미존재 — actual variant `Copied { path }`
|
||||
// 사용 (kebab-core/src/asset.rs:55-60).
|
||||
stored: kebab_core::AssetStorage::Copied {
|
||||
path: std::path::PathBuf::from("/tmp/dummy.wav"),
|
||||
},
|
||||
};
|
||||
|
||||
let workspace_root: std::path::PathBuf = std::path::PathBuf::from("/tmp");
|
||||
let cfg = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root: &workspace_root,
|
||||
config: &cfg,
|
||||
};
|
||||
let result = app.extract_for(&MediaType::Audio(AudioType::Wav), &ctx, &[]);
|
||||
assert!(result.is_err(), "Audio 는 registry 미포함 → Err 기대");
|
||||
let err_msg = format!("{:#}", result.unwrap_err());
|
||||
assert!(
|
||||
err_msg.contains("no Extractor"),
|
||||
"unexpected err: {err_msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,6 +96,11 @@ fn serialize_search_response(r: &SearchResponse) -> Value {
|
||||
None => Value::Null,
|
||||
};
|
||||
map.insert("trace".to_string(), trace_v);
|
||||
// v0.17.0 A5 Step 4b: only emit `hint` when set — matches
|
||||
// the CLI wire wrapper's additive emit pattern.
|
||||
if let Some(hint) = &r.hint {
|
||||
map.insert("hint".to_string(), Value::String(hint.clone()));
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
@@ -121,7 +126,10 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
let text = obj
|
||||
.get("query")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing required field: query")?
|
||||
.ok_or(
|
||||
"missing required field: query \
|
||||
(expected {\"query\":\"<text>\",\"mode\":\"lexical|vector|hybrid\",\"k\":3,...})",
|
||||
)?
|
||||
.to_string();
|
||||
|
||||
let mode = match obj.get("mode").and_then(|v| v.as_str()) {
|
||||
@@ -134,9 +142,8 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
|
||||
let k = obj
|
||||
.get("k")
|
||||
.and_then(|v| v.as_u64())
|
||||
.map(|n| n as usize)
|
||||
.unwrap_or(0); // 0 → use config default in app
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map_or(0, |n| n as usize); // 0 → use config default in app
|
||||
|
||||
let trust_min = match obj.get("trust_min").and_then(|v| v.as_str()) {
|
||||
None => None,
|
||||
@@ -204,14 +211,17 @@ fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
let opts = SearchOpts {
|
||||
max_tokens: obj
|
||||
.get("max_tokens")
|
||||
.and_then(|v| v.as_u64())
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
snippet_chars: obj
|
||||
.get("snippet_chars")
|
||||
.and_then(|v| v.as_u64())
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
|
||||
trace: obj.get("trace").and_then(|v| v.as_bool()).unwrap_or(false),
|
||||
trace: obj
|
||||
.get("trace")
|
||||
.and_then(serde_json::Value::as_bool)
|
||||
.unwrap_or(false),
|
||||
};
|
||||
|
||||
Ok((
|
||||
@@ -295,4 +305,17 @@ mod tests {
|
||||
assert!(items[1].error.is_some());
|
||||
assert_eq!(items[1].error.as_ref().unwrap()["code"], "invalid_input");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn missing_query_error_message_includes_shape_hint() {
|
||||
let cfg = open_temp();
|
||||
let raw = vec![serde_json::json!({"mode": "lexical"})];
|
||||
let (items, _summary) = bulk_search_with_config(cfg, raw).unwrap();
|
||||
let err = items[0].error.as_ref().unwrap();
|
||||
let msg = err["message"].as_str().unwrap();
|
||||
assert!(
|
||||
msg.contains("query") && msg.contains("mode"),
|
||||
"missing shape hint in error message: {msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
61
crates/kebab-app/src/derivation_payload.rs
Normal file
61
crates/kebab-app/src/derivation_payload.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Derivation-cache payload encoding helpers (design 2026-05-31 §3.3).
|
||||
//!
|
||||
//! - embedding: `dimensions × f32` little-endian bytes (1024×4 = 4096 B/chunk).
|
||||
//! - alias / korean_tokens: UTF-8 as-is (handled inline by the caller — no
|
||||
//! helper needed, `String::as_bytes` / `String::from_utf8`).
|
||||
|
||||
/// Encode an embedding vector as a little-endian `f32` byte string (§3.3).
|
||||
pub fn encode_embedding(vector: &[f32]) -> Vec<u8> {
|
||||
let mut out = Vec::with_capacity(vector.len() * 4);
|
||||
for &v in vector {
|
||||
out.extend_from_slice(&v.to_le_bytes());
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Decode a little-endian `f32` byte string back into a vector (§3.3).
|
||||
///
|
||||
/// Returns `None` if the payload length is not a multiple of 4 (corrupt
|
||||
/// entry) — the caller treats this as a cache miss and recomputes, so a bad
|
||||
/// payload never produces a wrong vector.
|
||||
pub fn decode_embedding(payload: &[u8]) -> Option<Vec<f32>> {
|
||||
if payload.len() % 4 != 0 {
|
||||
return None;
|
||||
}
|
||||
Some(
|
||||
payload
|
||||
.chunks_exact(4)
|
||||
.map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn roundtrips_vector() {
|
||||
let v = vec![0.0_f32, 1.5, -2.25, 3.125e10, f32::MIN, f32::MAX];
|
||||
let bytes = encode_embedding(&v);
|
||||
assert_eq!(bytes.len(), v.len() * 4);
|
||||
assert_eq!(decode_embedding(&bytes), Some(v));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_vector_roundtrips() {
|
||||
assert_eq!(encode_embedding(&[]), Vec::<u8>::new());
|
||||
assert_eq!(decode_embedding(&[]), Some(vec![]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn misaligned_payload_is_none() {
|
||||
assert_eq!(decode_embedding(&[1, 2, 3]), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn little_endian_layout_is_fixed() {
|
||||
// 1.0_f32 == 0x3F800000, little-endian bytes [0x00,0x00,0x80,0x3F].
|
||||
assert_eq!(encode_embedding(&[1.0]), vec![0x00, 0x00, 0x80, 0x3F]);
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,6 @@
|
||||
|
||||
pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
|
||||
|
||||
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
|
||||
pub use kebab_llm_local::LlmError;
|
||||
pub use kebab_config::ConfigInvalid;
|
||||
pub use kebab_store_sqlite::NotIndexed;
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
|
||||
use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};
|
||||
|
||||
// p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
|
||||
// and surfaced through `StructuredError` (an anyhow-friendly wrapper
|
||||
@@ -65,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
hint: Some("check `--config <path>` and TOML syntax".to_string()),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "config_not_found".to_string(),
|
||||
message: s.to_string(),
|
||||
details: json!({
|
||||
"path": s.path.to_string_lossy(),
|
||||
}),
|
||||
hint: Some(
|
||||
"verify --config <path>; pass an existing toml file or omit --config to use XDG default"
|
||||
.to_string(),
|
||||
),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<NotIndexed>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
@@ -91,7 +105,7 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
}
|
||||
let mut details = json!({});
|
||||
if verbose {
|
||||
let chain: Vec<String> = err.chain().map(|c| c.to_string()).collect();
|
||||
let chain: Vec<String> = err.chain().map(std::string::ToString::to_string).collect();
|
||||
details = json!({"chain": chain});
|
||||
}
|
||||
ErrorV1 {
|
||||
@@ -158,7 +172,10 @@ mod tests {
|
||||
});
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "config_invalid");
|
||||
assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
|
||||
assert_eq!(
|
||||
v1.details.get("path").and_then(|p| p.as_str()),
|
||||
Some("/tmp/x.toml")
|
||||
);
|
||||
assert!(v1.hint.is_some());
|
||||
}
|
||||
|
||||
@@ -182,7 +199,8 @@ mod tests {
|
||||
// the resulting LlmError::Unreachable maps to "model_unreachable".
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(std::time::Duration::from_millis(500))
|
||||
.build().unwrap();
|
||||
.build()
|
||||
.unwrap();
|
||||
let err = client.get("http://127.0.0.1:1").send().unwrap_err();
|
||||
let llm = LlmError::Unreachable {
|
||||
endpoint: "http://127.0.0.1:1".to_string(),
|
||||
@@ -198,7 +216,10 @@ mod tests {
|
||||
let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
|
||||
let v1 = classify(&anyhow::Error::new(llm), false);
|
||||
assert_eq!(v1.code, "model_not_pulled");
|
||||
assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
|
||||
assert_eq!(
|
||||
v1.details.get("model").and_then(|p| p.as_str()),
|
||||
Some("gemma4:e4b")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -235,7 +256,10 @@ mod tests {
|
||||
// (single source of truth). classify must not pattern-match on
|
||||
// anyhow string contents — that would create two sources of
|
||||
// truth. The bare anyhow string falls through to "generic".
|
||||
assert_ne!(v1.code, "stale_cursor", "classify must not produce stale_cursor from bare anyhow string");
|
||||
assert_ne!(
|
||||
v1.code, "stale_cursor",
|
||||
"classify must not produce stale_cursor from bare anyhow string"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let already = existing
|
||||
.lines()
|
||||
.any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
if already {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -50,18 +48,14 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
if !existing.is_empty() && !existing.ends_with('\n') {
|
||||
file.write_all(b"\n")?;
|
||||
}
|
||||
writeln!(file, "{}", KEBABIGNORE_LINE)?;
|
||||
writeln!(file, "{KEBABIGNORE_LINE}")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
|
||||
/// destination file already exists with the expected hash, the existing
|
||||
/// file is reused (no second write). Returns the destination path.
|
||||
pub fn copy_to_external(
|
||||
external_dir: &Path,
|
||||
bytes: &[u8],
|
||||
ext: &str,
|
||||
) -> Result<PathBuf> {
|
||||
pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
|
||||
let hash = blake3::hash(bytes);
|
||||
let hex = hash.to_hex();
|
||||
let prefix = &hex.as_str()[..12];
|
||||
@@ -82,11 +76,7 @@ pub fn copy_to_external(
|
||||
/// Internal `yaml_quote` always uses double-quoted YAML form with backslash
|
||||
/// escapes for `"` / `\` / control chars — agent-supplied titles with
|
||||
/// special characters are safe.
|
||||
pub fn inject_frontmatter(
|
||||
body: &str,
|
||||
title: &str,
|
||||
source_uri: Option<&str>,
|
||||
) -> Result<String> {
|
||||
pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
|
||||
let head = body.trim_start();
|
||||
if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
|
||||
anyhow::bail!(
|
||||
|
||||
@@ -50,14 +50,14 @@ impl App {
|
||||
fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let doc_id = target.doc_id.clone();
|
||||
let doc =
|
||||
@@ -107,14 +107,14 @@ fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let mut text = fmt_canonical_to_markdown(&doc);
|
||||
let mut truncated = false;
|
||||
@@ -176,14 +176,14 @@ fn fetch_span(
|
||||
) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
// Reject line-incompatible media types (PDF / audio). `SourceType`
|
||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||
|
||||
446
crates/kebab-app/src/ingest_log.rs
Normal file
446
crates/kebab-app/src/ingest_log.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
|
||||
//!
|
||||
//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
|
||||
//! `config.logging.ingest_log_dir`. Records are appended line by line; the
|
||||
//! last record is always `kind="summary"`. `IngestLogWriter::open` returns
|
||||
//! `Ok(None)` when `ingest_log_enabled = false` so callers need not branch.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
pub struct IngestLogWriter {
|
||||
file: BufWriter<File>,
|
||||
path: PathBuf,
|
||||
run_id: String,
|
||||
started_at: SystemTime,
|
||||
}
|
||||
|
||||
impl IngestLogWriter {
|
||||
/// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
|
||||
pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
|
||||
if !cfg.ingest_log_enabled {
|
||||
return Ok(None);
|
||||
}
|
||||
let run_id = generate_run_id();
|
||||
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
// Cleanup before creating the new file (non-critical: warn on error).
|
||||
if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) {
|
||||
tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}");
|
||||
}
|
||||
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
|
||||
let file = BufWriter::new(File::create(&path)?);
|
||||
Ok(Some(Self {
|
||||
file,
|
||||
path,
|
||||
run_id,
|
||||
started_at: SystemTime::now(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, event)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, summary)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
self.file.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_id(&self) -> &str {
|
||||
&self.run_id
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
pub fn started_at(&self) -> SystemTime {
|
||||
self.started_at
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IngestLogWriter {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.file.flush();
|
||||
}
|
||||
}
|
||||
|
||||
/// ISO 8601 compact timestamp + uuid v7 suffix: `20260528T013000Z-abc123de`.
|
||||
/// uuid v7 is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
|
||||
fn generate_run_id() -> String {
|
||||
use time::macros::format_description;
|
||||
let now = time::OffsetDateTime::now_utc();
|
||||
let ts = now
|
||||
.format(format_description!(
|
||||
"[year][month][day]T[hour][minute][second]Z"
|
||||
))
|
||||
.unwrap_or_else(|_| "19700101T000000Z".to_string());
|
||||
let uid = uuid::Uuid::now_v7().simple().to_string();
|
||||
let suffix = &uid[uid.len() - 8..];
|
||||
format!("{ts}-{suffix}")
|
||||
}
|
||||
|
||||
/// Expand `{state_dir}` placeholder → XDG state dir (spec §6 R-3).
|
||||
/// Other tilde/env expansion is delegated to `kebab_config::expand_path`.
|
||||
fn expand_log_dir(path: &Path) -> PathBuf {
|
||||
let path_str = path.to_string_lossy();
|
||||
if path_str.contains("{state_dir}") {
|
||||
let state_dir = kebab_config::Config::xdg_state_dir();
|
||||
PathBuf::from(path_str.replace("{state_dir}", &state_dir.to_string_lossy()))
|
||||
} else {
|
||||
path.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// RFC 3339 UTC timestamp for log records.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn now_ts() -> String {
|
||||
time::OffsetDateTime::now_utc()
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
|
||||
}
|
||||
|
||||
/// Ingest event record (ndjson line). `kind` is the discriminator.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum LogEvent<'a> {
|
||||
Ocr {
|
||||
ts: String,
|
||||
/// v0.20.x r2: additive field — doc_id for dual-write SQLite correlation.
|
||||
/// Round 1 ndjson logs deserialize with doc_id=None (Serde Option default).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
doc_id: Option<&'a str>,
|
||||
doc_path: &'a str,
|
||||
page: u32,
|
||||
image_byte_size: Option<u64>,
|
||||
image_width: Option<u32>,
|
||||
image_height: Option<u32>,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
success: bool,
|
||||
reason: Option<&'a str>,
|
||||
ocr_engine: &'a str,
|
||||
},
|
||||
ParseError {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
Skip {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
detail: Option<&'a str>,
|
||||
},
|
||||
Error {
|
||||
ts: String,
|
||||
code: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
}
|
||||
|
||||
/// Final summary record — always the last line of the log file.
|
||||
/// Explicit `kind` field serializes to `"kind": "summary"`.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct IngestSummary {
|
||||
pub kind: String,
|
||||
pub ts: String,
|
||||
pub run_id: String,
|
||||
pub scanned: u32,
|
||||
pub new: u32,
|
||||
pub errors: u32,
|
||||
pub ocr_pages: u32,
|
||||
pub ocr_failures: u32,
|
||||
pub ocr_p50_ms: Option<u64>,
|
||||
pub ocr_p90_ms: Option<u64>,
|
||||
pub ocr_max_ms: Option<u64>,
|
||||
pub duration_ms: u64,
|
||||
}
|
||||
|
||||
impl IngestSummary {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
ts: String,
|
||||
run_id: String,
|
||||
scanned: u32,
|
||||
new: u32,
|
||||
errors: u32,
|
||||
ocr_pages: u32,
|
||||
ocr_failures: u32,
|
||||
ocr_ms_samples: &[u64],
|
||||
duration_ms: u64,
|
||||
) -> Self {
|
||||
let (p50, p90, _p99, max) = percentiles(ocr_ms_samples);
|
||||
Self {
|
||||
kind: "summary".to_string(),
|
||||
ts,
|
||||
run_id,
|
||||
scanned,
|
||||
new,
|
||||
errors,
|
||||
ocr_pages,
|
||||
ocr_failures,
|
||||
ocr_p50_ms: p50,
|
||||
ocr_p90_ms: p90,
|
||||
ocr_max_ms: max,
|
||||
duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple percentile extraction on a sorted copy of `samples`.
|
||||
/// Returns `(p50, p90, p99, max)`. All `None` when samples is empty.
|
||||
/// p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses p50/p90/max only.
|
||||
pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>, Option<u64>) {
|
||||
if samples.is_empty() {
|
||||
return (None, None, None, None);
|
||||
}
|
||||
let mut sorted = samples.to_vec();
|
||||
sorted.sort_unstable();
|
||||
let n = sorted.len();
|
||||
let p50 = sorted[(n.saturating_sub(1) * 50) / 100];
|
||||
let p90 = sorted[(n.saturating_sub(1) * 90) / 100];
|
||||
let p99 = sorted[(n.saturating_sub(1) * 99) / 100];
|
||||
let max = *sorted.last().unwrap();
|
||||
(Some(p50), Some(p90), Some(p99), Some(max))
|
||||
}
|
||||
|
||||
/// Delete old ingest log files from `log_dir`.
|
||||
///
|
||||
/// **Retention rule (§3.4 OR-on-stale semantics):**
|
||||
/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff).
|
||||
/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition
|
||||
/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent.
|
||||
pub(crate) fn cleanup_old_logs(
|
||||
log_dir: &Path,
|
||||
keep_recent: u32,
|
||||
retention_days: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entries: Vec<_> = std::fs::read_dir(log_dir)?
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
e.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.is_some_and(|s| s.starts_with("ingest-") && s.ends_with(".ndjson"))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort newest-first by mtime (files without mtime go to the end).
|
||||
entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok())));
|
||||
|
||||
let cutoff = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(
|
||||
u64::from(retention_days) * 86400,
|
||||
))
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
|
||||
for (idx, entry) in entries.into_iter().enumerate() {
|
||||
let modified = entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
// Keep iff (idx < keep_recent) AND (modified > cutoff).
|
||||
if (idx as u32) < keep_recent && modified > cutoff {
|
||||
continue;
|
||||
}
|
||||
if let Err(e) = std::fs::remove_file(entry.path()) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"failed to remove old log {}: {e}",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_config::LoggingCfg;
|
||||
use std::time::SystemTime;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
|
||||
let id = generate_run_id();
|
||||
// Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
|
||||
assert_eq!(id.len(), 25, "run_id len should be 25: {id}");
|
||||
let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");
|
||||
assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
|
||||
assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
|
||||
assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");
|
||||
assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");
|
||||
assert!(
|
||||
suffix.chars().all(|c| c.is_ascii_hexdigit()),
|
||||
"suffix should be hex: {suffix}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn expand_log_dir_substitutes_state_dir_placeholder() {
|
||||
let input = PathBuf::from("{state_dir}/logs");
|
||||
let expanded = expand_log_dir(&input);
|
||||
let expected = kebab_config::Config::xdg_state_dir().join("logs");
|
||||
assert_eq!(expanded, expected);
|
||||
assert!(!expanded.to_string_lossy().contains("{state_dir}"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn writer_disabled_returns_none() {
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: false,
|
||||
ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
|
||||
..Default::default()
|
||||
};
|
||||
let result = IngestLogWriter::open(&cfg).expect("open should not error");
|
||||
assert!(result.is_none(), "disabled writer should return None");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn writer_writes_one_event_per_line_with_kind_discriminator() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: tmp.path().to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
|
||||
let path = writer.path().to_path_buf();
|
||||
|
||||
writer
|
||||
.write_event(&LogEvent::Skip {
|
||||
ts: now_ts(),
|
||||
doc_path: "a.zip",
|
||||
reason: "builtin_blacklist",
|
||||
detail: Some(".zip extension"),
|
||||
})
|
||||
.unwrap();
|
||||
writer
|
||||
.write_event(&LogEvent::Error {
|
||||
ts: now_ts(),
|
||||
code: "ingest_fatal",
|
||||
message: "something bad",
|
||||
})
|
||||
.unwrap();
|
||||
writer
|
||||
.write_event(&LogEvent::ParseError {
|
||||
ts: now_ts(),
|
||||
doc_path: "weird.pdf",
|
||||
reason: "lopdf_error",
|
||||
message: "unexpected EOF",
|
||||
})
|
||||
.unwrap();
|
||||
writer.flush().unwrap();
|
||||
|
||||
let contents = std::fs::read_to_string(&path).unwrap();
|
||||
let lines: Vec<&str> = contents.lines().collect();
|
||||
assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
|
||||
for line in &lines {
|
||||
assert!(
|
||||
line.starts_with('{'),
|
||||
"each line should be JSON object: {line}"
|
||||
);
|
||||
assert!(
|
||||
line.contains("\"kind\""),
|
||||
"each line should have 'kind': {line}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn drop_flushes_pending_buffer() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let cfg = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: tmp.path().to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
|
||||
let path = writer.path().to_path_buf();
|
||||
writer
|
||||
.write_event(&LogEvent::Error {
|
||||
ts: now_ts(),
|
||||
code: "test",
|
||||
message: "drop flush test",
|
||||
})
|
||||
.unwrap();
|
||||
// Drop without explicit flush — Drop impl should flush BufWriter.
|
||||
drop(writer);
|
||||
let contents = std::fs::read_to_string(&path).unwrap();
|
||||
assert!(
|
||||
contents.lines().count() >= 1,
|
||||
"file should have at least 1 line after drop"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted.
|
||||
#[test]
|
||||
fn cleanup_keeps_recent_n_drops_old() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let dir = tmp.path();
|
||||
// Create 5 files with mtime spread across 60 days
|
||||
for i in 0..5u64 {
|
||||
let path = dir.join(format!("ingest-file{i}.ndjson"));
|
||||
std::fs::write(&path, b"x").unwrap();
|
||||
// Set mtime: file 0 = newest, file 4 = 60 days old
|
||||
let age_days = i * 15; // 0, 15, 30, 45, 60 days old
|
||||
let mtime = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(age_days * 86400))
|
||||
.unwrap();
|
||||
filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap();
|
||||
}
|
||||
// keep_recent=3, retention_days=90 (no time-based deletion)
|
||||
cleanup_old_logs(dir, 3, 90).unwrap();
|
||||
let remaining: Vec<_> = std::fs::read_dir(dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.collect();
|
||||
assert_eq!(remaining.len(), 3, "expected 3 files after cleanup");
|
||||
}
|
||||
|
||||
/// F5 OR-on-stale: files within keep_recent count but older than retention_days
|
||||
/// must still be deleted.
|
||||
#[test]
|
||||
fn cleanup_drops_stale_even_within_count() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let dir = tmp.path();
|
||||
// 2 files, both 90 days old — well past retention_days=30
|
||||
for i in 0..2u64 {
|
||||
let path = dir.join(format!("ingest-old{i}.ndjson"));
|
||||
std::fs::write(&path, b"x").unwrap();
|
||||
let mtime = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(90 * 86400))
|
||||
.unwrap();
|
||||
filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap();
|
||||
}
|
||||
// keep_recent=10 (both within count) but retention_days=30 → both stale
|
||||
cleanup_old_logs(dir, 10, 30).unwrap();
|
||||
let remaining: Vec<_> = std::fs::read_dir(dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.collect();
|
||||
assert_eq!(
|
||||
remaining.len(),
|
||||
0,
|
||||
"stale files must be deleted even within keep_recent"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -46,10 +46,21 @@ pub struct AggregateCounts {
|
||||
/// Ordering invariant per design §2.4a:
|
||||
///
|
||||
/// ```text
|
||||
/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ScanStarted < ScanCompleted
|
||||
/// < ( AssetStarted
|
||||
/// [< (PdfOcrStarted < PdfOcrFinished)*]
|
||||
/// [< AssetChunked]
|
||||
/// [< AssetTimings]
|
||||
/// < AssetFinished )*
|
||||
/// < (Completed | Aborted)
|
||||
/// ```
|
||||
///
|
||||
/// `[]` = optional. `PdfOcr*` is per-PDF asset only (v0.20.0 sub-item 1).
|
||||
/// `AssetChunked` / `AssetTimings` are the v0.24.0 asset-internal phase
|
||||
/// events: `AssetChunked` fires once right after chunking (markdown /
|
||||
/// image / PDF); `AssetTimings` reports per-phase wall-clock once
|
||||
/// (markdown only).
|
||||
///
|
||||
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
|
||||
/// in §2.4a) are reserved for a future iteration and are not emitted
|
||||
/// by this task; the spec calls them out as "임의 위치" (optional).
|
||||
@@ -79,12 +90,82 @@ pub enum IngestEvent {
|
||||
result: IngestItemKind,
|
||||
chunks: u32,
|
||||
},
|
||||
/// v0.24.0 (additive): emitted right after an asset is chunked, before
|
||||
/// expansion / embed / store. Surfaces "this document is N chunks"
|
||||
/// immediately so a single large document no longer looks frozen at
|
||||
/// `idx/total` while its per-chunk phases churn. `chunks` is the chunk
|
||||
/// count for asset `idx`.
|
||||
AssetChunked { idx: u32, total: u32, chunks: u32 },
|
||||
/// v0.26.1 (additive): emitted when an asset enters a *slow* internal
|
||||
/// phase, so the interactive progress bar can show **which** phase
|
||||
/// (and which model) is currently running instead of looking frozen.
|
||||
/// `phase` ∈ {`"ocr"`, `"caption"`, `"embed"`}; short phases
|
||||
/// (parse / chunk / store) are intentionally *not* emitted to avoid
|
||||
/// noise. `model` is the model performing the phase — the vision LLM
|
||||
/// id for `ocr` / `caption`, the embedder `model_id` for `embed`
|
||||
/// (`None` when the phase runs without a configured model, e.g. embed
|
||||
/// with no embedder wired). Emitted once per (asset, phase); no
|
||||
/// throttle needed (low frequency). Wire v1 consumers that predate
|
||||
/// this variant simply ignore the unknown `asset_phase` kind.
|
||||
AssetPhase {
|
||||
idx: u32,
|
||||
total: u32,
|
||||
phase: String,
|
||||
model: Option<String>,
|
||||
},
|
||||
/// v0.24.0 (additive): per-phase wall-clock (milliseconds) for asset
|
||||
/// `idx`, emitted once the asset's pipeline finishes. Lets a user see
|
||||
/// *where* the time went (parse / chunk / ocr / caption / embed /
|
||||
/// store) without parsing logs. The markdown path leaves `ocr_ms` /
|
||||
/// `caption_ms` at 0 (no image analysis); the image / PDF paths fill
|
||||
/// them so the slowest-asset summary attributes vision-model time
|
||||
/// correctly. `expansion_ms` is retained for wire compatibility but is
|
||||
/// always 0 since doc-side expansion was removed (HOTFIXES 2026-06-03).
|
||||
/// `ocr_ms` / `caption_ms` (v0.26.1) are additive with serde default 0
|
||||
/// so pre-v0.26.1 consumers deserialize cleanly.
|
||||
AssetTimings {
|
||||
idx: u32,
|
||||
total: u32,
|
||||
parse_ms: u64,
|
||||
chunk_ms: u64,
|
||||
expansion_ms: u64,
|
||||
embed_ms: u64,
|
||||
store_ms: u64,
|
||||
#[serde(default)]
|
||||
ocr_ms: u64,
|
||||
#[serde(default)]
|
||||
caption_ms: u64,
|
||||
},
|
||||
/// Run finished normally. `counts` is the final aggregate.
|
||||
Completed { counts: AggregateCounts },
|
||||
/// Run finished by user cancellation. `counts` is the partial
|
||||
/// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
|
||||
/// task never produces `Aborted`.
|
||||
Aborted { counts: AggregateCounts },
|
||||
/// PDF page 별 OCR 시작 시 emit. v0.20.0 sub-item 1.
|
||||
PdfOcrStarted { page: u32 },
|
||||
/// PDF page 별 OCR 종료 시 emit. v0.20.0 sub-item 1.
|
||||
/// `skipped` = `true` 일 시 OCR 미수행 (DCTDecode 부재 또는 engine 실패).
|
||||
/// `chars = 0` 만으로는 "skip" 과 "0-char OCR result" 구분 불가, `skipped` field 가 명시적.
|
||||
PdfOcrFinished {
|
||||
page: u32,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
ocr_engine: String,
|
||||
skipped: bool,
|
||||
/// v0.20.x ingest log: raster image byte size (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_byte_size: Option<u64>,
|
||||
/// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_width: Option<u32>,
|
||||
/// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_height: Option<u32>,
|
||||
/// v0.20.x ingest log: OCR failure reason (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
|
||||
@@ -118,10 +199,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
|
||||
/// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
|
||||
/// is silently absorbed — the ingest hot path must not stall on a slow
|
||||
/// consumer. Logged at `trace` for diagnostics.
|
||||
pub(crate) fn emit(
|
||||
progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
|
||||
event: IngestEvent,
|
||||
) {
|
||||
pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
|
||||
if let Some(tx) = progress {
|
||||
if tx.send(event).is_err() {
|
||||
tracing::trace!(
|
||||
@@ -165,13 +243,131 @@ mod tests {
|
||||
media: "markdown".into(),
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
|
||||
assert_eq!(v.get("idx").and_then(|n| n.as_u64()), Some(1));
|
||||
assert_eq!(v.get("total").and_then(|n| n.as_u64()), Some(10));
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_started")
|
||||
);
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
|
||||
assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
|
||||
assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
|
||||
assert_eq!(v.get("media").and_then(|s| s.as_str()), Some("markdown"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asset_chunked_serializes_with_discriminator() {
|
||||
// v0.24.0 additive variant — `kind` must be snake_case
|
||||
// `asset_chunked` so wire v1 consumers branch on it cleanly.
|
||||
let ev = IngestEvent::AssetChunked {
|
||||
idx: 3,
|
||||
total: 10,
|
||||
chunks: 142,
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_chunked")
|
||||
);
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(3));
|
||||
assert_eq!(
|
||||
v.get("chunks").and_then(serde_json::Value::as_u64),
|
||||
Some(142)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asset_timings_serializes_all_phase_fields() {
|
||||
let ev = IngestEvent::AssetTimings {
|
||||
idx: 2,
|
||||
total: 7,
|
||||
parse_ms: 12,
|
||||
chunk_ms: 3,
|
||||
expansion_ms: 45_000,
|
||||
embed_ms: 800,
|
||||
store_ms: 20,
|
||||
ocr_ms: 1_200,
|
||||
caption_ms: 3_400,
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_timings")
|
||||
);
|
||||
// All phase fields are present (plain u64, always serialized).
|
||||
for (field, want) in [
|
||||
("parse_ms", 12u64),
|
||||
("chunk_ms", 3),
|
||||
("expansion_ms", 45_000),
|
||||
("embed_ms", 800),
|
||||
("store_ms", 20),
|
||||
("ocr_ms", 1_200),
|
||||
("caption_ms", 3_400),
|
||||
] {
|
||||
assert_eq!(
|
||||
v.get(field).and_then(serde_json::Value::as_u64),
|
||||
Some(want),
|
||||
"field {field}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asset_timings_ocr_caption_default_to_zero_for_legacy_wire() {
|
||||
// v0.26.1 additive: a pre-v0.26.1 wire payload omits ocr_ms /
|
||||
// caption_ms; serde `default` must fill 0 so old producers stay
|
||||
// compatible.
|
||||
let legacy = serde_json::json!({
|
||||
"kind": "asset_timings",
|
||||
"idx": 1, "total": 1,
|
||||
"parse_ms": 5, "chunk_ms": 2, "expansion_ms": 0,
|
||||
"embed_ms": 10, "store_ms": 3
|
||||
});
|
||||
let ev: IngestEvent = serde_json::from_value(legacy).unwrap();
|
||||
match ev {
|
||||
IngestEvent::AssetTimings {
|
||||
ocr_ms,
|
||||
caption_ms,
|
||||
embed_ms,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(ocr_ms, 0);
|
||||
assert_eq!(caption_ms, 0);
|
||||
assert_eq!(embed_ms, 10);
|
||||
}
|
||||
other => panic!("unexpected event: {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asset_phase_serializes_with_discriminator() {
|
||||
// v0.26.1 additive variant — `kind` must be snake_case
|
||||
// `asset_phase`, `phase` is the slow-phase label, `model` the
|
||||
// model id (nullable).
|
||||
let ev = IngestEvent::AssetPhase {
|
||||
idx: 4,
|
||||
total: 12,
|
||||
phase: "ocr".into(),
|
||||
model: Some("gemma4:e4b".into()),
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_phase"));
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(4));
|
||||
assert_eq!(v.get("phase").and_then(|s| s.as_str()), Some("ocr"));
|
||||
assert_eq!(v.get("model").and_then(|s| s.as_str()), Some("gemma4:e4b"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asset_phase_model_none_serializes_as_null() {
|
||||
let ev = IngestEvent::AssetPhase {
|
||||
idx: 1,
|
||||
total: 1,
|
||||
phase: "embed".into(),
|
||||
model: None,
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("phase").and_then(|s| s.as_str()), Some("embed"));
|
||||
assert!(v.get("model").is_some_and(serde_json::Value::is_null));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_event_completed_has_counts() {
|
||||
let ev = IngestEvent::Completed {
|
||||
@@ -184,8 +380,14 @@ mod tests {
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
|
||||
let counts = v.get("counts").unwrap();
|
||||
assert_eq!(counts.get("scanned").and_then(|n| n.as_u64()), Some(5));
|
||||
assert_eq!(counts.get("new").and_then(|n| n.as_u64()), Some(2));
|
||||
assert_eq!(
|
||||
counts.get("scanned").and_then(serde_json::Value::as_u64),
|
||||
Some(5)
|
||||
);
|
||||
assert_eq!(
|
||||
counts.get("new").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Default => {
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
}
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,362 @@
|
||||
// crates/kebab-app/src/pdf_ocr_apply.rs
|
||||
//
|
||||
// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가
|
||||
// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠.
|
||||
// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의
|
||||
// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는
|
||||
// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는
|
||||
// 새 Block::Paragraph 를 push (always_on dual-block).
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
|
||||
SourceSpan, TextBlock, id_for_block,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
|
||||
use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Extract width/height from a JPEG (or any image format) byte slice.
|
||||
/// Returns `None` on corrupt / unsupported data — callers fall back to
|
||||
/// `(None, None)` so OCR results remain valid (R-4 mitigation).
|
||||
fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
|
||||
use image::ImageReader;
|
||||
ImageReader::new(std::io::Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.ok()?
|
||||
.into_dimensions()
|
||||
.ok()
|
||||
}
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
|
||||
pub struct PdfOcrOpts {
|
||||
/// Master switch. `false` short-circuits to
|
||||
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
|
||||
pub enabled: bool,
|
||||
/// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push).
|
||||
/// `false` → text-detect block 의 `min_char_count` 또는
|
||||
/// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate).
|
||||
pub always_on: bool,
|
||||
/// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가
|
||||
/// 본 임계 미만이면 OCR fallback. Default `0.5`.
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback.
|
||||
/// empty page (cover, blank separator) 자동 skip. Default `20`.
|
||||
pub min_char_count: u32,
|
||||
/// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`).
|
||||
/// `None` → no hint passed to engine.
|
||||
pub lang_hint: Option<Lang>,
|
||||
/// Optional per-page cancellation handle. checked at start of each page
|
||||
/// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4
|
||||
/// + verifier LOW L-1 resolution + spec §4.8 line 1159 명시.
|
||||
pub cancel: Option<Arc<AtomicBool>>,
|
||||
}
|
||||
|
||||
/// OCR run summary returned by [`apply_ocr_to_pdf_pages`] for the caller's
|
||||
/// `IngestItem.pdf_ocr_pages` + `pdf_ocr_ms_total` wire fields (§4.6.2).
|
||||
#[derive(Debug)]
|
||||
pub struct PdfOcrSummary {
|
||||
/// Number of pages 가 OCR pipeline 을 실제 통과 (skipped page 제외).
|
||||
pub pages_ocrd: u32,
|
||||
/// Cumulative wall-clock duration of successful OCR engine calls (ms).
|
||||
/// `saturating_add` 사용 — 24-day cumulative 까지 overflow-safe.
|
||||
pub ms_total: u64,
|
||||
}
|
||||
|
||||
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
|
||||
/// classifies each page via `text_quality::compute_valid_char_ratio` +
|
||||
/// `min_char_count`, and either:
|
||||
/// - skips (vector PDF + sufficient text + `always_on=false`),
|
||||
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
|
||||
/// (scanned/mojibake page), or
|
||||
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
|
||||
/// vector page).
|
||||
///
|
||||
/// Errors:
|
||||
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
|
||||
/// - lopdf re-parse failure → `Err(...)`.
|
||||
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
|
||||
/// event push + `emit_progress(Finished { skipped: true })` + continue
|
||||
/// (no `Err` propagation).
|
||||
///
|
||||
/// See spec §4.1 + §4.4 for the full pipeline.
|
||||
pub fn apply_ocr_to_pdf_pages<F>(
|
||||
canonical: &mut CanonicalDocument,
|
||||
engine: &dyn OcrEngine,
|
||||
pdf_bytes: &[u8],
|
||||
opts: &PdfOcrOpts,
|
||||
mut emit_progress: F,
|
||||
) -> Result<PdfOcrSummary>
|
||||
where
|
||||
F: FnMut(PdfOcrProgress),
|
||||
{
|
||||
if !opts.enabled {
|
||||
return Ok(PdfOcrSummary {
|
||||
pages_ocrd: 0,
|
||||
ms_total: 0,
|
||||
});
|
||||
}
|
||||
let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
|
||||
.context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
|
||||
let page_count = pdf_doc.get_pages().len() as u32;
|
||||
|
||||
let mut new_events: Vec<ProvenanceEvent> = Vec::new();
|
||||
let mut ocr_blocks: Vec<Block> = Vec::new();
|
||||
let mut pages_ocrd: u32 = 0;
|
||||
let mut ms_total: u64 = 0;
|
||||
|
||||
// canonical.blocks 의 page → block index map (text-detect block 의 in-place
|
||||
// mutate 또는 dual-block push 결정용).
|
||||
// PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를
|
||||
// 생성 (§1.4) — 그 invariant 사용.
|
||||
for page_num in 1..=page_count {
|
||||
if let Some(cancel) = &opts.cancel {
|
||||
if cancel.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
|
||||
}
|
||||
}
|
||||
|
||||
let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
|
||||
let text = match &canonical.blocks[text_block_idx] {
|
||||
Block::Paragraph(tb) => tb.text.clone(),
|
||||
_ => String::new(),
|
||||
};
|
||||
let chars = text.chars().count() as u32;
|
||||
let valid_ratio = compute_valid_char_ratio(&text);
|
||||
let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
|
||||
|
||||
// 결정 matrix:
|
||||
// always_on=true → 모든 page OCR (dual-block).
|
||||
// always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
|
||||
// needs_ocr=false → skip.
|
||||
let do_ocr = opts.always_on || needs_ocr;
|
||||
if !do_ocr {
|
||||
continue;
|
||||
}
|
||||
|
||||
emit_progress(PdfOcrProgress::Started { page: page_num });
|
||||
|
||||
let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
|
||||
b
|
||||
} else {
|
||||
let note = format!(
|
||||
"page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: 0,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: None,
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: None,
|
||||
});
|
||||
continue;
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
// OCR failure: warning event + skip (text-detect block 그대로).
|
||||
let note = format!(
|
||||
"page={} OCR failed engine={} version={} err={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
e
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: start.elapsed().as_millis() as u64,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: Some("ocr_error".to_string()),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed_ms = start.elapsed().as_millis() as u64;
|
||||
let chars_ocr = ocr.joined.chars().count() as u32;
|
||||
|
||||
pages_ocrd = pages_ocrd.saturating_add(1);
|
||||
ms_total = ms_total.saturating_add(elapsed_ms);
|
||||
|
||||
if opts.always_on && !needs_ocr {
|
||||
// dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count.
|
||||
let ocr_ordinal = (page_num - 1) + page_count;
|
||||
let span_ocr = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(0),
|
||||
char_end: Some(chars_ocr),
|
||||
};
|
||||
let block_id =
|
||||
id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span_ocr,
|
||||
};
|
||||
ocr_blocks.push(Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text: ocr.joined.clone(),
|
||||
inlines: if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
},
|
||||
}));
|
||||
} else {
|
||||
// in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체.
|
||||
// block_id / ordinal 보존 — span 의 char_end 만 갱신.
|
||||
if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
|
||||
tb.text = ocr.joined.clone();
|
||||
tb.inlines = if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
};
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(chars_ocr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::OcrApplied,
|
||||
note: Some(format!(
|
||||
"page={} engine={} version={} regions={} ms={} chars={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
ocr.regions.len(),
|
||||
elapsed_ms,
|
||||
chars_ocr
|
||||
)),
|
||||
});
|
||||
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: elapsed_ms,
|
||||
chars: chars_ocr,
|
||||
skipped: false,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: None,
|
||||
});
|
||||
}
|
||||
|
||||
canonical.blocks.extend(ocr_blocks);
|
||||
canonical.provenance.events.extend(new_events);
|
||||
Ok(PdfOcrSummary {
|
||||
pages_ocrd,
|
||||
ms_total,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
|
||||
blocks
|
||||
.iter()
|
||||
.position(|b| match b {
|
||||
Block::Paragraph(tb) => matches!(
|
||||
tb.common.source_span,
|
||||
SourceSpan::Page { page, .. } if page == page_num
|
||||
),
|
||||
_ => false,
|
||||
})
|
||||
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
|
||||
}
|
||||
|
||||
/// Per-page OCR progress event 가 caller 의 `emit_progress` closure 호출 시 emit.
|
||||
/// Step 6 의 ingest_one_pdf_asset 가 IngestEvent::PdfOcrStarted / PdfOcrFinished
|
||||
/// 로 carry (spec §4.6.1 wire schema).
|
||||
pub enum PdfOcrProgress {
|
||||
/// page 별 OCR 시작 시 emit. `engine.recognize` 호출 직전.
|
||||
Started {
|
||||
/// 1-based PDF page number.
|
||||
page: u32,
|
||||
},
|
||||
/// page 별 OCR 종료 시 emit (성공 / skip / failure 모두).
|
||||
Finished {
|
||||
/// 1-based PDF page number.
|
||||
page: u32,
|
||||
/// `engine.recognize` wall-clock duration. skip path 의 의미는 mixed
|
||||
/// (DCTDecode 부재 시 `0`, OCR engine 실패 시 actual latency before bail).
|
||||
ms: u64,
|
||||
/// OCR result text 의 char count. skip 시 `0`.
|
||||
chars: u32,
|
||||
/// `true` = DCTDecode 부재 또는 OCR engine 실패 로 skip.
|
||||
/// `false` = 정상 OCR 완료.
|
||||
skipped: bool,
|
||||
/// v0.20.x ingest log: raster image byte size (additive, optional).
|
||||
image_byte_size: Option<u64>,
|
||||
/// v0.20.x ingest log: raster image width in pixels (additive, optional).
|
||||
image_width: Option<u32>,
|
||||
/// v0.20.x ingest log: raster image height in pixels (additive, optional).
|
||||
image_height: Option<u32>,
|
||||
/// v0.20.x ingest log: failure reason string when OCR failed (additive, optional).
|
||||
/// Values: "timeout" | "ocr_error" | "network_error" | None (success).
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn extract_image_dimensions_valid_jpeg() {
|
||||
let img = image::RgbImage::new(16, 12);
|
||||
let mut bytes = Vec::new();
|
||||
image::DynamicImage::from(img)
|
||||
.write_to(
|
||||
&mut std::io::Cursor::new(&mut bytes),
|
||||
image::ImageFormat::Jpeg,
|
||||
)
|
||||
.expect("encode jpeg");
|
||||
assert_eq!(extract_image_dimensions(&bytes), Some((16, 12)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extract_image_dimensions_corrupt_returns_none() {
|
||||
assert_eq!(extract_image_dimensions(b"not a jpeg"), None);
|
||||
}
|
||||
}
|
||||
@@ -85,8 +85,7 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
|
||||
ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
|
||||
ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
|
||||
ResetScope::VectorOnly => {
|
||||
let vector_dir =
|
||||
expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
vec![vector_dir]
|
||||
}
|
||||
ResetScope::ConfigOnly => vec![cfg_dir],
|
||||
@@ -137,8 +136,8 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
||||
/// the double scan is acceptable for a rare destructive operation.
|
||||
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
use kebab_core::DocumentStore as _;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
use kebab_core::SourceScope;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("enumerate_orphans: open SqliteStore")?;
|
||||
@@ -160,16 +159,13 @@ pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let connector = FsSourceConnector::new(cfg)
|
||||
.context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let connector =
|
||||
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let (assets, _skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("enumerate_orphans: scan workspace")?;
|
||||
|
||||
let scanned: HashSet<WorkspacePath> = assets
|
||||
.into_iter()
|
||||
.map(|a| a.workspace_path)
|
||||
.collect();
|
||||
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
|
||||
|
||||
let mut orphans: Vec<WorkspacePath> = stored
|
||||
.into_iter()
|
||||
@@ -206,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if !p.exists() {
|
||||
continue;
|
||||
}
|
||||
std::fs::remove_dir_all(p)
|
||||
.with_context(|| format!("remove {}", p.display()))?;
|
||||
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
|
||||
removed.push(p.clone());
|
||||
}
|
||||
|
||||
@@ -229,8 +224,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
|
||||
/// current walker scope without touching any filesystem directory.
|
||||
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
|
||||
let orphans = enumerate_orphans(cfg)
|
||||
.context("execute_orphans_only: enumerate orphans")?;
|
||||
let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
|
||||
|
||||
if orphans.is_empty() {
|
||||
return Ok(ResetReport {
|
||||
|
||||
@@ -39,6 +39,14 @@ pub struct Capabilities {
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
||||
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
||||
#[serde(default)]
|
||||
pub active_parsers: Vec<String>,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
||||
/// 빈 corpus → empty Vec.
|
||||
#[serde(default)]
|
||||
pub active_chunkers: Vec<String>,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
@@ -63,14 +71,26 @@ pub struct Stats {
|
||||
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
|
||||
#[serde(default)]
|
||||
pub stale_doc_count: u64,
|
||||
/// p10-1A-1: code language breakdown (chunk counts by canonical lowercase
|
||||
/// language identifier). Empty until 1A-2 produces code chunks.
|
||||
/// p10-1A-1: code language breakdown (**doc** counts by canonical
|
||||
/// lowercase language identifier). Empty until 1A-2 produces code
|
||||
/// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
|
||||
/// previous "chunk counts" wording was a longstanding mis-label —
|
||||
/// implementation has always been `COUNT(*) FROM documents
|
||||
/// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
|
||||
/// chunk-level companion.
|
||||
#[serde(default)]
|
||||
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// p10-1A-1: repo breakdown (chunk counts by `metadata.repo` value).
|
||||
/// Empty until 1A-2 produces code chunks.
|
||||
/// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
|
||||
/// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
|
||||
/// doc-count wording corrected (mirror of code_lang_breakdown).
|
||||
#[serde(default)]
|
||||
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
|
||||
/// chunk counts instead of doc counts. Indexing-pressure metric —
|
||||
/// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
|
||||
/// here in a way `code_lang_breakdown` (doc count) hides.
|
||||
#[serde(default)]
|
||||
pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
}
|
||||
|
||||
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
@@ -88,6 +108,7 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
"doc_summary.v1",
|
||||
"chunk_inspection.v1",
|
||||
"doctor.v1",
|
||||
"config_migration.v1",
|
||||
"ingest_report.v1",
|
||||
"ingest_progress.v1",
|
||||
"reset_report.v1",
|
||||
@@ -96,6 +117,9 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
"error.v1",
|
||||
"bulk_search_item.v1",
|
||||
"bulk_search_response.v1",
|
||||
// v0.20.x r2 Enhancement 3: OCR statistics + failures introspection.
|
||||
"ocr_stats.v1",
|
||||
"ocr_failures.v1",
|
||||
];
|
||||
|
||||
/// Build a [`SchemaV1`] introspection report for the given config.
|
||||
@@ -130,10 +154,10 @@ fn capabilities_snapshot() -> Capabilities {
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
streaming_ask: true,
|
||||
http_daemon: false,
|
||||
mcp_server: true,
|
||||
single_file_ingest: false,
|
||||
single_file_ingest: true,
|
||||
bulk_search: true,
|
||||
}
|
||||
}
|
||||
@@ -148,12 +172,8 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(
|
||||
cfg: &Config,
|
||||
store: &kebab_store_sqlite::SqliteStore,
|
||||
) -> anyhow::Result<Stats> {
|
||||
let counts = store
|
||||
.count_summary_with_threshold(cfg.search.stale_threshold_days as u64)?;
|
||||
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
||||
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
||||
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
||||
@@ -171,16 +191,23 @@ fn collect_stats(
|
||||
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
|
||||
// placeholder — mirror of code_lang_breakdown for the repo field.
|
||||
repo_breakdown: store.repo_breakdown()?,
|
||||
// v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
|
||||
// 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
|
||||
code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
||||
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
||||
Models {
|
||||
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
@@ -210,6 +237,11 @@ mod tests_stats_ext {
|
||||
v.get("repo_breakdown").is_some(),
|
||||
"Stats JSON must include repo_breakdown: {v}"
|
||||
);
|
||||
// v0.17.0 PR-C: chunk-level companion field.
|
||||
assert!(
|
||||
v.get("code_lang_chunk_breakdown").is_some(),
|
||||
"Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
|
||||
);
|
||||
// Empty BTreeMap serializes as `{}` — confirm it's an object, not null.
|
||||
assert!(
|
||||
v["code_lang_breakdown"].is_object(),
|
||||
@@ -219,6 +251,10 @@ mod tests_stats_ext {
|
||||
v["repo_breakdown"].is_object(),
|
||||
"repo_breakdown must be an object: {v}"
|
||||
);
|
||||
assert!(
|
||||
v["code_lang_chunk_breakdown"].is_object(),
|
||||
"code_lang_chunk_breakdown must be an object: {v}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -244,3 +280,27 @@ mod tests_stats_ext {
|
||||
assert_eq!(s.stats.stale_doc_count, 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_capabilities {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn capabilities_streaming_ask_matches_cli_surface() {
|
||||
// Bug #9: kebab ask --stream 가 answer_event.v1 ndjson 191 event 정상 emit →
|
||||
// capabilities.streaming_ask 가 true 여야 함.
|
||||
let caps = capabilities_snapshot();
|
||||
assert!(caps.streaming_ask, "streaming_ask must be true (Bug #9)");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn capabilities_single_file_ingest_matches_cli_surface() {
|
||||
// Bug #9: kebab ingest-file <path> + kebab ingest-stdin --title <T> 양쪽 모두
|
||||
// ingest_report.v1 정상 emit → capabilities.single_file_ingest 가 true 여야 함.
|
||||
let caps = capabilities_snapshot();
|
||||
assert!(
|
||||
caps.single_file_ingest,
|
||||
"single_file_ingest must be true (Bug #9)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,11 +10,7 @@ use kebab_core::SearchHit;
|
||||
///
|
||||
/// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
|
||||
/// rule prevents `kebab-rag → kebab-app`). Update both together.
|
||||
pub fn compute_stale(
|
||||
indexed_at: OffsetDateTime,
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) -> bool {
|
||||
pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
|
||||
if threshold_days == 0 {
|
||||
return false;
|
||||
}
|
||||
@@ -23,11 +19,7 @@ pub fn compute_stale(
|
||||
}
|
||||
|
||||
/// Sets `stale` on each hit in place using `compute_stale`.
|
||||
pub fn mark_stale_in_place(
|
||||
hits: &mut [SearchHit],
|
||||
now: OffsetDateTime,
|
||||
threshold_days: u32,
|
||||
) {
|
||||
pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
|
||||
for h in hits {
|
||||
h.stale = compute_stale(h.indexed_at, now, threshold_days);
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ fn ask_lexical_smoke() {
|
||||
history: Vec::new(),
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
multi_hop: false,
|
||||
};
|
||||
// The fixture workspace contains "ownership" content; the model's
|
||||
// citation behavior depends on its training, so we don't assert on
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
60
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Lang, OcrText};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
|
||||
pub struct MockOcrEngine {
|
||||
expected_texts: Vec<String>,
|
||||
call_index: Mutex<usize>,
|
||||
fail: bool,
|
||||
}
|
||||
|
||||
impl MockOcrEngine {
|
||||
/// Single text (backward-compat ctor for pdf_ocr_apply.rs 10 sites).
|
||||
pub fn single(text: impl Into<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: vec![text.into()],
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page texts (cursor advances per recognize call).
|
||||
pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: texts,
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrEngine for MockOcrEngine {
|
||||
fn engine_name(&self) -> &'static str {
|
||||
"mock-ocr"
|
||||
}
|
||||
|
||||
fn engine_version(&self) -> String {
|
||||
"mock-v1".to_string()
|
||||
}
|
||||
|
||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||
if self.fail {
|
||||
anyhow::bail!("mock failure");
|
||||
}
|
||||
let mut idx = self.call_index.lock().unwrap();
|
||||
let text = self
|
||||
.expected_texts
|
||||
.get(*idx)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
|
||||
*idx += 1;
|
||||
Ok(OcrText {
|
||||
joined: text,
|
||||
regions: vec![],
|
||||
engine: "mock-ocr".to_string(),
|
||||
engine_version: "mock-v1".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -93,8 +93,7 @@ impl TestEnv {
|
||||
/// directly. Caller can invoke this multiple times to simulate
|
||||
/// re-opening the binary after a corpus revision bump.
|
||||
pub fn app(&self) -> kebab_app::App {
|
||||
kebab_app::App::open_with_config(self.config.clone())
|
||||
.expect("App::open_with_config")
|
||||
kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock_ocr;
|
||||
|
||||
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
148
crates/kebab-app/tests/config_invalidation.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
//! v0.26.2: ingest-config invalidation — changing a setting that affects
|
||||
//! ingest output auto-re-indexes the affected assets on the next ingest
|
||||
//! (no `--force-reingest`), while changing an unrelated setting does not.
|
||||
//!
|
||||
//! These end-to-end tests exercise the model-free signals (chunking +
|
||||
//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting
|
||||
//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui
|
||||
//! invariance) is unit-tested in
|
||||
//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles
|
||||
//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is
|
||||
//! verified here via the signature-driven chunking path that shares the same
|
||||
//! `effective_parser_version` plumbing.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts};
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
/// Seed a workspace with a markdown + a rust file so both the markdown and
|
||||
/// the code ingest paths are exercised. Returns the first-ingest report.
|
||||
fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
std::fs::write(
|
||||
env.workspace_root.join("demo.rs"),
|
||||
"/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
|
||||
)
|
||||
.unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest");
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest creates docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}");
|
||||
first
|
||||
}
|
||||
|
||||
fn reingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default())
|
||||
.expect("re-ingest")
|
||||
}
|
||||
|
||||
/// Re-running with the identical config skips every asset (no spurious
|
||||
/// re-index). Regression guard for over-invalidation.
|
||||
#[test]
|
||||
fn identical_config_skips_all_assets() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let first = seed_and_first_ingest(&env);
|
||||
let scanned = first.scanned;
|
||||
|
||||
let second = reingest(&env);
|
||||
assert_eq!(second.scanned, scanned);
|
||||
assert_eq!(second.new, 0, "no new docs: {second:?}");
|
||||
assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
|
||||
assert_eq!(second.unchanged, scanned, "every doc Unchanged: {second:?}");
|
||||
assert_eq!(second.errors, 0);
|
||||
}
|
||||
|
||||
/// Changing a common chunking parameter re-indexes EVERY media type
|
||||
/// (markdown + code here) without `--force-reingest`.
|
||||
#[test]
|
||||
fn chunking_change_reindexes_all_types() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
let first = seed_and_first_ingest(&env);
|
||||
let scanned = first.scanned;
|
||||
|
||||
// Bump target_tokens — folds into every type's signature.
|
||||
env.config.chunking.target_tokens += 100;
|
||||
|
||||
let second = reingest(&env);
|
||||
assert_eq!(second.scanned, scanned);
|
||||
assert_eq!(second.new, 0, "no new docs: {second:?}");
|
||||
assert_eq!(
|
||||
second.unchanged, 0,
|
||||
"chunking change must re-index all: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.updated, scanned,
|
||||
"every doc re-indexed as Updated: {second:?}"
|
||||
);
|
||||
assert_eq!(second.errors, 0);
|
||||
}
|
||||
|
||||
/// Changing an `[ingest.code]` option re-indexes only the code asset; the
|
||||
/// markdown assets stay Unchanged.
|
||||
#[test]
|
||||
fn code_option_change_reindexes_code_only() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
let first = seed_and_first_ingest(&env);
|
||||
let scanned = first.scanned;
|
||||
|
||||
// Raise max_file_lines (keeps the tiny demo.rs in-scope; only the code
|
||||
// signature changes).
|
||||
env.config.ingest.code.max_file_lines += 1000;
|
||||
|
||||
let second = reingest(&env);
|
||||
assert_eq!(second.scanned, scanned);
|
||||
assert_eq!(second.new, 0, "no new docs: {second:?}");
|
||||
assert_eq!(second.errors, 0);
|
||||
assert_eq!(
|
||||
second.updated, 1,
|
||||
"exactly the code asset re-indexed: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.unchanged,
|
||||
scanned - 1,
|
||||
"all markdown assets stay Unchanged: {second:?}"
|
||||
);
|
||||
|
||||
let items = second.items.as_ref().expect("items present");
|
||||
let code = items
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("demo.rs"))
|
||||
.expect("demo.rs item");
|
||||
assert_eq!(
|
||||
code.kind,
|
||||
IngestItemKind::Updated,
|
||||
"demo.rs must be re-indexed: {code:?}"
|
||||
);
|
||||
for i in items.iter().filter(|i| i.doc_path.0.ends_with(".md")) {
|
||||
assert_eq!(
|
||||
i.kind,
|
||||
IngestItemKind::Unchanged,
|
||||
"markdown must be Unchanged: {i:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Regression guard: changing a non-ingest setting (`search.default_k`) does
|
||||
/// NOT re-index anything.
|
||||
#[test]
|
||||
fn search_setting_change_reindexes_nothing() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
let first = seed_and_first_ingest(&env);
|
||||
let scanned = first.scanned;
|
||||
|
||||
env.config.search.default_k += 5;
|
||||
env.config.search.snippet_chars += 50;
|
||||
env.config.rag.score_gate = 0.5;
|
||||
|
||||
let second = reingest(&env);
|
||||
assert_eq!(second.scanned, scanned);
|
||||
assert_eq!(
|
||||
second.unchanged, scanned,
|
||||
"search/rag changes must not re-index: {second:?}"
|
||||
);
|
||||
assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
|
||||
assert_eq!(second.new, 0);
|
||||
assert_eq!(second.errors, 0);
|
||||
}
|
||||
85
crates/kebab-app/tests/config_migrate.rs
Normal file
85
crates/kebab-app/tests/config_migrate.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::fs;
|
||||
|
||||
#[test]
|
||||
fn migrate_writes_backup_and_atomic_with_dry_run_noop() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = dir.path().join("config.toml");
|
||||
fs::write(
|
||||
&cfg,
|
||||
"schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude = [\"*.md\"]\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// dry-run: 파일·백업 미변경.
|
||||
let report = kebab_app::config_migrate_with_config_path(Some(&cfg), true).unwrap();
|
||||
assert!(report.changed);
|
||||
assert!(report.dry_run);
|
||||
assert!(report.backup_path.is_none());
|
||||
assert!(!dir.path().join("config.toml.bak").exists());
|
||||
assert!(
|
||||
fs::read_to_string(&cfg).unwrap().contains("include"),
|
||||
"dry-run modified file"
|
||||
);
|
||||
|
||||
// 실제 적용: 백업 생성 + 파일 갱신.
|
||||
let report = kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
|
||||
assert!(report.changed);
|
||||
assert!(!report.dry_run);
|
||||
assert!(report.backup_path.is_some());
|
||||
assert!(dir.path().join("config.toml.bak").exists());
|
||||
let new = fs::read_to_string(&cfg).unwrap();
|
||||
assert!(!new.contains("include"));
|
||||
assert!(new.contains("[ingest.code]"));
|
||||
|
||||
// 멱등: 재실행 changed=false.
|
||||
let report = kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
|
||||
assert!(!report.changed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn migrate_missing_file_errors() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = dir.path().join("nope.toml");
|
||||
assert!(kebab_app::config_migrate_with_config_path(Some(&cfg), false).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn annotated_default_serialization_contains_section_comments() {
|
||||
let doc = kebab_config::migrate::annotated_default_document();
|
||||
let text = doc.to_string();
|
||||
assert!(
|
||||
text.contains("code ingest skip 정책"),
|
||||
"section comment missing:\n{text}"
|
||||
);
|
||||
assert!(text.contains("[ingest.code]"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn doctor_flags_outdated_config() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = dir.path().join("config.toml");
|
||||
fs::write(
|
||||
&cfg,
|
||||
"schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude=[\"*.md\"]\n",
|
||||
)
|
||||
.unwrap();
|
||||
let report = kebab_app::doctor_with_config_path(Some(&cfg)).unwrap();
|
||||
let check = report
|
||||
.checks
|
||||
.iter()
|
||||
.find(|c| c.name == "config_migration")
|
||||
.unwrap();
|
||||
assert!(!check.ok, "outdated config should fail check");
|
||||
assert!(check.hint.as_deref().unwrap().contains("config migrate"));
|
||||
assert!(!report.ok, "overall doctor should be false");
|
||||
|
||||
// migrate 후엔 통과.
|
||||
kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
|
||||
let report = kebab_app::doctor_with_config_path(Some(&cfg)).unwrap();
|
||||
let check = report
|
||||
.checks
|
||||
.iter()
|
||||
.find(|c| c.name == "config_migration")
|
||||
.unwrap();
|
||||
assert!(check.ok, "after migrate should pass");
|
||||
}
|
||||
@@ -12,7 +12,11 @@ fn open(env: &common::TestEnv) -> App {
|
||||
#[test]
|
||||
fn fetch_chunk_returns_target_only_when_no_context() {
|
||||
let env = common::TestEnv::new();
|
||||
common::ingest_md(&env, "a.md", "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n");
|
||||
common::ingest_md(
|
||||
&env,
|
||||
"a.md",
|
||||
"# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
|
||||
);
|
||||
let app = open(&env);
|
||||
|
||||
// Find a chunk via search to obtain its id.
|
||||
@@ -38,12 +42,17 @@ fn fetch_chunk_returns_target_only_when_no_context() {
|
||||
#[test]
|
||||
fn fetch_chunk_with_context_returns_neighbors() {
|
||||
let env = common::TestEnv::new();
|
||||
let body = "# H1\n\nA1\n\n# H2\n\nA2\n\n# H3\n\nA3\n\n# H4\n\nA4\n\n# H5\n\nA5\n";
|
||||
// v0.17.0 trigram tokenizer: terms must be ≥3 Unicode chars to
|
||||
// match. The earlier fixture used 2-char tokens like `A1`/`A3` for
|
||||
// section bodies — those zero-hit under trigram. Use 5-char unique
|
||||
// words per section so the query can pin one chunk deterministically.
|
||||
let body =
|
||||
"# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
|
||||
common::ingest_md(&env, "multi.md", body);
|
||||
let app = env.app();
|
||||
|
||||
let q = kebab_core::SearchQuery {
|
||||
text: "A3".to_string(),
|
||||
text: "cherry".to_string(),
|
||||
mode: kebab_core::SearchMode::Lexical,
|
||||
k: 1,
|
||||
filters: kebab_core::SearchFilters::default(),
|
||||
@@ -106,7 +115,10 @@ fn fetch_doc_returns_serialized_markdown() {
|
||||
.unwrap();
|
||||
assert_eq!(result.kind, FetchKind::Doc);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.contains("Heading One"), "doc text contains heading: {text:?}");
|
||||
assert!(
|
||||
text.contains("Heading One"),
|
||||
"doc text contains heading: {text:?}"
|
||||
);
|
||||
assert!(text.contains("First paragraph"), "doc text contains body");
|
||||
assert!(!result.truncated);
|
||||
}
|
||||
@@ -151,7 +163,11 @@ fn fetch_doc_with_max_tokens_truncates() {
|
||||
.unwrap();
|
||||
assert!(result.truncated);
|
||||
let text = result.text.expect("doc text");
|
||||
assert!(text.chars().count() <= 100, "trimmed text len {}", text.chars().count());
|
||||
assert!(
|
||||
text.chars().count() <= 100,
|
||||
"trimmed text len {}",
|
||||
text.chars().count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -288,8 +304,7 @@ fn fetch_span_line_start_beyond_total_returns_empty_text() {
|
||||
fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
|
||||
let env = common::TestEnv::new();
|
||||
// Multi-chunk markdown so context ±N has neighbors.
|
||||
let body =
|
||||
"# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
let body = "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n";
|
||||
common::ingest_md(&env, "boundary.md", body);
|
||||
let app = env.app();
|
||||
let q = kebab_core::SearchQuery {
|
||||
|
||||
@@ -16,8 +16,8 @@
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
|
||||
|
||||
/// Helper: open the store via `TestEnv` and run `list_documents`.
|
||||
@@ -125,17 +125,10 @@ fn include_scope_narrowing_does_not_purge() {
|
||||
include: vec!["**/*.rs".to_string()],
|
||||
exclude: env.config.workspace.exclude.clone(),
|
||||
};
|
||||
let first = ingest_with_config_opts(
|
||||
env.config.clone(),
|
||||
wide_scope,
|
||||
false,
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.expect("first ingest (wide) must succeed");
|
||||
assert!(
|
||||
first.new >= 2,
|
||||
"expected at least 2 new docs: {first:?}"
|
||||
);
|
||||
let first =
|
||||
ingest_with_config_opts(env.config.clone(), wide_scope, false, IngestOpts::default())
|
||||
.expect("first ingest (wide) must succeed");
|
||||
assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
|
||||
assert_eq!(
|
||||
first.purged_deleted_files, 0,
|
||||
"no purges on first ingest: {first:?}"
|
||||
|
||||
@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
/// inspectable in stored DB rows.
|
||||
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
|
||||
use image::{ImageBuffer, Rgb};
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let path = root.join(name);
|
||||
img.save(&path).expect("write PNG fixture");
|
||||
path
|
||||
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Counters: scanned should include the PNG; new ≥ 1 (markdown
|
||||
// fixtures from the workspace tree may also count).
|
||||
assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
|
||||
assert!(
|
||||
report.scanned >= 1,
|
||||
"scanned={}, items={:?}",
|
||||
report.scanned,
|
||||
report.items
|
||||
);
|
||||
assert_eq!(report.errors, 0, "no errors on lenient OCR path");
|
||||
|
||||
// Locate the image doc in the report items.
|
||||
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
kebab_core::IngestItemKind::New,
|
||||
"image asset must be classified New on first ingest"
|
||||
);
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
|
||||
assert_eq!(
|
||||
img_item.chunk_count,
|
||||
Some(1),
|
||||
"image emits exactly one chunk"
|
||||
);
|
||||
|
||||
// Inspect the stored chunk text via kb-app's inspect_chunk facade.
|
||||
let doc_id = img_item.doc_id.clone().expect("image doc id");
|
||||
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Sanity: the doc was actually persisted into SQLite (kb-app's
|
||||
// list_docs facade reads the same store the chunker writes to).
|
||||
let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.expect("list_docs");
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
|
||||
assert!(
|
||||
summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
summaries
|
||||
.iter()
|
||||
.any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
"image doc must appear in list_docs"
|
||||
);
|
||||
|
||||
@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("diagram.png"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -267,8 +276,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
let report = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false)
|
||||
.expect("ingest with no OCR/caption")
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
|
||||
})
|
||||
.await
|
||||
.expect("task");
|
||||
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
.find(|i| i.doc_path.0.ends_with("raw.png"))
|
||||
.unwrap();
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
|
||||
let scope1 = scope.clone();
|
||||
let scope2 = scope.clone();
|
||||
|
||||
let r1 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let id1 = r1
|
||||
.items
|
||||
|
||||
@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
// First ingest — populates the DB. Use the legacy entry so the
|
||||
// assertions cover the "previously ingested" set without needing
|
||||
// IngestOpts::default() to behave identically.
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
first.unchanged, 0,
|
||||
"first ingest cannot have unchanged: {first:?}"
|
||||
);
|
||||
|
||||
let scanned = first.scanned;
|
||||
|
||||
@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
|
||||
assert_eq!(
|
||||
second.scanned, scanned,
|
||||
"second scanned matches first: {second:?}"
|
||||
);
|
||||
assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
|
||||
assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"nothing should be marked updated: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.unchanged, scanned,
|
||||
"every doc must be Unchanged: {second:?}"
|
||||
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
fn force_reingest_bypasses_skip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
let scanned = first.scanned;
|
||||
|
||||
let second = ingest_with_config_opts(
|
||||
|
||||
@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
|
||||
// ingest_with_config_progress (no cancel) runs to completion.
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
|
||||
assert!(ext_dir.is_dir());
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1, "exactly one file in _external/");
|
||||
let name = entries[0].file_name().to_string_lossy().into_owned();
|
||||
@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {
|
||||
|
||||
let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
|
||||
assert!(err.to_string().contains("unsupported extension"), "{err}");
|
||||
assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
|
||||
assert!(
|
||||
err.to_string().contains(".docx") || err.to_string().contains("docx"),
|
||||
"{err}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8,8 +8,7 @@ use common::TestEnv;
|
||||
#[test]
|
||||
fn ingest_then_list_inspects_round_trip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
// The fixture has 3 markdown files; first ingest should label them
|
||||
// all as New.
|
||||
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
}
|
||||
|
||||
// list_docs returns the 3 docs.
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3, "docs: {docs:?}");
|
||||
|
||||
// inspect_doc round-trips one of them.
|
||||
let any_doc_id = docs[0].doc_id.clone();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
|
||||
.unwrap();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
|
||||
assert_eq!(canonical.doc_id, any_doc_id);
|
||||
assert!(!canonical.blocks.is_empty(), "blocks empty");
|
||||
}
|
||||
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
fn ingest_idempotent_on_second_run() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let r1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(r1.new, 3);
|
||||
|
||||
let r2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
// Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
|
||||
// path: when checksum + parser/chunker/embedding versions all match,
|
||||
// the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
|
||||
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
|
||||
assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");
|
||||
|
||||
// list_docs still has 3 docs (no duplicates).
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_summary_only_drops_items() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert!(report.items.is_none(), "summary-only should null items");
|
||||
}
|
||||
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
|
||||
// of every run. `summary_only=true` writes `items_json=NULL`; the
|
||||
// counts MUST still be present.
|
||||
let env = TestEnv::lexical_only();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("kebab.sqlite");
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
|
||||
let (scanned, new_c, updated, skipped, errors, items_json): (
|
||||
i64,
|
||||
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
|
||||
// tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
|
||||
// tables under it).
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "lexical-only run must not error");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("lancedb");
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
|
||||
if lance_dir.exists() {
|
||||
// If the dir was created (e.g., by an earlier consumer touching
|
||||
// the path), it MUST contain no `.lance` tables.
|
||||
let mut had_lance_table = false;
|
||||
for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
|
||||
let entry = entry.unwrap();
|
||||
if entry
|
||||
.path()
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
== Some("lance")
|
||||
{
|
||||
if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
|
||||
had_lance_table = true;
|
||||
break;
|
||||
}
|
||||
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
|
||||
tags_any: vec!["rust".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let rust_docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
// intro.md and notes/cargo.md both tag "rust".
|
||||
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
|
||||
}
|
||||
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
|
||||
#[test]
|
||||
fn inspect_doc_not_found_returns_actionable_error() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bogus =
|
||||
kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
|
||||
let bogus = kebab_core::DocumentId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(
|
||||
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
|
||||
let bogus = kebab_core::ChunkId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
|
||||
.unwrap_err();
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("not found"), "got: {msg}");
|
||||
}
|
||||
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
|
||||
#[test]
|
||||
fn ingest_stamps_chunker_version_on_document() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert!(!docs.is_empty(), "no docs after ingest");
|
||||
|
||||
for doc_entry in &docs {
|
||||
let canonical =
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
|
||||
.unwrap();
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
|
||||
assert!(
|
||||
canonical.last_chunker_version.is_some(),
|
||||
"last_chunker_version must be stamped for doc {}: got {:?}",
|
||||
|
||||
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
// crates/kebab-app/tests/ingest_log_smoke.rs
|
||||
//
|
||||
// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config_opts};
|
||||
use kebab_config::{Config, LoggingCfg};
|
||||
use kebab_core::SourceScope;
|
||||
use serde_json::Value;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
|
||||
let data_dir = workspace.parent().unwrap().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = workspace.parent().unwrap().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
cfg
|
||||
}
|
||||
|
||||
/// AC-9: ingest → log file exists + each line valid JSON + last line kind=summary + scanned>0.
|
||||
#[test]
|
||||
fn ingest_log_smoke() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let workspace = tmp.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
let log_dir = tmp.path().join("logs");
|
||||
|
||||
// 1. Minimal corpus: 1 markdown + 1 scanned PDF (OCR disabled — no Ollama needed).
|
||||
std::fs::write(
|
||||
workspace.join("hello.md"),
|
||||
"# Hello\n\nThis is a smoke test.\n",
|
||||
)
|
||||
.unwrap();
|
||||
let pdf_src = PathBuf::from("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
if pdf_src.exists() {
|
||||
std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
|
||||
}
|
||||
|
||||
// 2. Config with logging enabled.
|
||||
let cfg = minimal_config(&workspace, &log_dir);
|
||||
let scope = SourceScope {
|
||||
root: workspace.clone(),
|
||||
exclude: vec![],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// 3. Run ingest.
|
||||
ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
|
||||
.expect("ingest should succeed");
|
||||
|
||||
// 4. Assert log file exists in log_dir.
|
||||
let log_files: Vec<_> = std::fs::read_dir(&log_dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
e.file_name().to_string_lossy().starts_with("ingest-")
|
||||
&& e.file_name().to_string_lossy().ends_with(".ndjson")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
log_files.len(),
|
||||
1,
|
||||
"expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
|
||||
);
|
||||
|
||||
// 5. Parse each line as JSON — assert kind field present and valid.
|
||||
let body = std::fs::read_to_string(log_files[0].path()).unwrap();
|
||||
let lines: Vec<&str> = body.lines().collect();
|
||||
assert!(!lines.is_empty(), "log file should not be empty");
|
||||
|
||||
let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
|
||||
for line in &lines {
|
||||
let v: Value = serde_json::from_str(line)
|
||||
.unwrap_or_else(|e| panic!("line is not valid JSON: {e}\nline: {line}"));
|
||||
let kind = v
|
||||
.get("kind")
|
||||
.and_then(|k| k.as_str())
|
||||
.unwrap_or_else(|| panic!("line missing 'kind' field: {line}"));
|
||||
assert!(
|
||||
valid_kinds.contains(&kind),
|
||||
"unexpected kind '{kind}' in line: {line}"
|
||||
);
|
||||
}
|
||||
|
||||
// 6. Last line must be kind=summary with scanned > 0.
|
||||
let last = lines.last().unwrap();
|
||||
let last_v: Value = serde_json::from_str(last).unwrap();
|
||||
assert_eq!(
|
||||
last_v.get("kind").and_then(|k| k.as_str()),
|
||||
Some("summary"),
|
||||
"last line must be kind=summary, got: {last}"
|
||||
);
|
||||
let scanned = last_v.get("scanned").and_then(Value::as_u64).unwrap_or(0);
|
||||
assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
|
||||
}
|
||||
|
||||
/// AC-6: ingest_log_enabled=false → no log file created.
|
||||
#[test]
|
||||
fn ingest_log_disabled_emits_no_file() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
let workspace = tmp.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
let log_dir = tmp.path().join("logs");
|
||||
|
||||
std::fs::write(
|
||||
workspace.join("hello.md"),
|
||||
"# Hello\n\nDisabled log test.\n",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let data_dir = tmp.path().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = tmp.path().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: false,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scope = SourceScope {
|
||||
root: workspace.clone(),
|
||||
exclude: vec![],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
|
||||
.expect("ingest should succeed");
|
||||
|
||||
// log_dir should either not exist or contain 0 ingest-*.ndjson files.
|
||||
let log_file_count = if log_dir.exists() {
|
||||
std::fs::read_dir(&log_dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
e.file_name().to_string_lossy().starts_with("ingest-")
|
||||
&& e.file_name().to_string_lossy().ends_with(".ndjson")
|
||||
})
|
||||
.count()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
assert_eq!(
|
||||
log_file_count, 0,
|
||||
"no ingest-*.ndjson file should be created when disabled"
|
||||
);
|
||||
}
|
||||
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
|
||||
//!
|
||||
//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
|
||||
//! Manual invoke:
|
||||
//! KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
//! cargo test -p kebab-app --test ingest_pdf_ocr_smoke --ignored -j 4
|
||||
//!
|
||||
//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
|
||||
//! to verify the cancel wiring doesn't panic/deadlock.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
fn ollama_endpoint() -> String {
|
||||
std::env::var("KEBAB_PDF_OCR_ENDPOINT").unwrap_or_else(|_| "http://localhost:11434".to_string())
|
||||
}
|
||||
|
||||
fn make_ocr_env_real() -> TestEnv {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.models.embedding.provider = "none".to_string();
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let dest = env.workspace_root.join("scanned_page1.pdf");
|
||||
std::fs::copy(&src, &dest).expect("copy scanned_page1.pdf to workspace");
|
||||
|
||||
env
|
||||
}
|
||||
|
||||
/// § Acceptance §9 #1 — real Ollama OCR + IngestItem.pdf_ocr_pages = Some(1).
|
||||
#[test]
|
||||
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
|
||||
fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
|
||||
let env = make_ocr_env_real();
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
|
||||
|
||||
assert!(report.new >= 1, "at least one PDF ingested: {report:?}");
|
||||
|
||||
let items = report.items.unwrap_or_default();
|
||||
let pdf_item = items.iter().find(|i| i.doc_path.0.ends_with(".pdf"));
|
||||
assert!(
|
||||
pdf_item.is_some(),
|
||||
"PDF item must appear in ingest report items: {items:?}"
|
||||
);
|
||||
let pdf_item = pdf_item.unwrap();
|
||||
assert!(
|
||||
pdf_item.pdf_ocr_pages.is_some(),
|
||||
"pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.pdf_ocr_pages.unwrap(),
|
||||
1,
|
||||
"scanned_page1.pdf has exactly 1 page"
|
||||
);
|
||||
}
|
||||
|
||||
/// § Acceptance §9 #2 — OCR text indexed and retrievable via lexical search.
|
||||
#[test]
|
||||
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
|
||||
fn ocr_text_indexed_and_searchable() {
|
||||
let env = make_ocr_env_real();
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
|
||||
|
||||
// Search for a Korean morpheme expected to appear in qwen2.5vl:3b OCR
|
||||
// output of the PoC ground-truth page. "다음" is a high-frequency token
|
||||
// in page1.txt truth file.
|
||||
let query = common::lexical_query("다음");
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), query).expect("search");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"OCR-indexed text must surface in lexical search results"
|
||||
);
|
||||
}
|
||||
|
||||
/// Production cancel wiring smoke — pre-set cancel exits before any OCR call.
|
||||
/// Dummy endpoint (port 1 = connection-refused) means OCR HTTP calls would
|
||||
/// fail, but cancel=true prevents the loop from reaching OCR at all.
|
||||
/// Verifies no panic/deadlock regardless of Ok/Err outcome.
|
||||
#[test]
|
||||
fn ingest_with_cancel_aborts_mid_pdf() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some("http://127.0.0.1:1".to_string());
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let dest = env.workspace_root.join("scanned_page1.pdf");
|
||||
std::fs::copy(&src, &dest).expect("copy scanned_page1.pdf to workspace");
|
||||
|
||||
let cancel = Arc::new(AtomicBool::new(true)); // pre-set — abort immediately
|
||||
|
||||
let result = kebab_app::ingest_with_config_cancellable(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
None,
|
||||
Some(cancel),
|
||||
);
|
||||
// Both Ok (pre-cancel exit) and Err (eager OCR engine fail) are acceptable —
|
||||
// key assertion is no panic/deadlock.
|
||||
let _ = result;
|
||||
}
|
||||
@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
|
||||
fn run_with_progress() -> Vec<IngestEvent> {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -73,40 +69,74 @@ fn progress_event_sequence_matches_design_section_2_4a() {
|
||||
other => panic!("expected Completed last, got {other:?}"),
|
||||
}
|
||||
|
||||
// Middle: 3 AssetStarted/AssetFinished pairs in monotonic idx order.
|
||||
let asset_events: Vec<&IngestEvent> = events[2..events.len() - 1].iter().collect();
|
||||
assert_eq!(
|
||||
asset_events.len(),
|
||||
6,
|
||||
"expected 3 (Started + Finished) pairs, got {asset_events:?}"
|
||||
);
|
||||
for (chunk_idx, pair) in asset_events.chunks(2).enumerate() {
|
||||
let expected_idx = chunk_idx as u32 + 1;
|
||||
match (pair[0], pair[1]) {
|
||||
(
|
||||
IngestEvent::AssetStarted {
|
||||
idx: si,
|
||||
total: st,
|
||||
media,
|
||||
..
|
||||
},
|
||||
IngestEvent::AssetFinished {
|
||||
idx: fi,
|
||||
total: ft,
|
||||
result,
|
||||
chunks,
|
||||
},
|
||||
) => {
|
||||
assert_eq!(*si, expected_idx, "Started idx mismatch: {pair:?}");
|
||||
assert_eq!(*fi, expected_idx, "Finished idx mismatch: {pair:?}");
|
||||
assert_eq!(*st, 3, "Started total mismatch");
|
||||
assert_eq!(*ft, 3, "Finished total mismatch");
|
||||
assert_eq!(media, "markdown", "fixture is markdown only");
|
||||
assert_eq!(*result, IngestItemKind::New, "first ingest → New");
|
||||
assert!(*chunks >= 1, "chunks: {pair:?}");
|
||||
// Middle (v0.24.0 ordering invariant §2.4a): per asset the stream is
|
||||
// AssetStarted < AssetChunked < [ExpansionProgress*] < AssetTimings
|
||||
// < AssetFinished
|
||||
// Expansion is disabled in the lexical fixture, so no ExpansionProgress
|
||||
// frames appear here — but AssetChunked + AssetTimings are emitted for
|
||||
// every markdown asset.
|
||||
let middle = &events[2..events.len() - 1];
|
||||
|
||||
// 3 AssetStarted events, monotonic idx 1..=3, all markdown, total = 3.
|
||||
let started: Vec<u32> = middle
|
||||
.iter()
|
||||
.filter_map(|e| match e {
|
||||
IngestEvent::AssetStarted {
|
||||
idx, total, media, ..
|
||||
} => {
|
||||
assert_eq!(*total, 3, "Started total mismatch: {e:?}");
|
||||
assert_eq!(media, "markdown", "fixture is markdown only: {e:?}");
|
||||
Some(*idx)
|
||||
}
|
||||
other => panic!("expected Started+Finished pair, got {other:?}"),
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(started, vec![1, 2, 3], "AssetStarted idx order: {middle:?}");
|
||||
|
||||
// 3 AssetFinished events, monotonic idx 1..=3, each New with ≥1 chunk.
|
||||
let finished: Vec<u32> = middle
|
||||
.iter()
|
||||
.filter_map(|e| match e {
|
||||
IngestEvent::AssetFinished {
|
||||
idx,
|
||||
total,
|
||||
result,
|
||||
chunks,
|
||||
} => {
|
||||
assert_eq!(*total, 3, "Finished total mismatch: {e:?}");
|
||||
assert_eq!(*result, IngestItemKind::New, "first ingest → New: {e:?}");
|
||||
assert!(*chunks >= 1, "chunks: {e:?}");
|
||||
Some(*idx)
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(finished, vec![1, 2, 3], "AssetFinished idx order: {middle:?}");
|
||||
|
||||
// v0.24.0 additive events: exactly one AssetChunked + one AssetTimings
|
||||
// per asset, each strictly bracketed by that asset's Started / Finished.
|
||||
for target in 1u32..=3 {
|
||||
let started_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetStarted { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetStarted for idx {target}: {middle:?}"));
|
||||
let finished_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetFinished { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetFinished for idx {target}: {middle:?}"));
|
||||
let chunked_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetChunked { idx, chunks, .. } if *idx == target && *chunks >= 1))
|
||||
.unwrap_or_else(|| panic!("missing AssetChunked for idx {target}: {middle:?}"));
|
||||
let timings_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetTimings { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetTimings for idx {target}: {middle:?}"));
|
||||
assert!(
|
||||
started_at < chunked_at && chunked_at < timings_at && timings_at < finished_at,
|
||||
"idx {target} ordering: started={started_at} chunked={chunked_at} \
|
||||
timings={timings_at} finished={finished_at}: {middle:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,13 +146,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
|
||||
// `ingest_with_config_progress(..., None)` must produce identical
|
||||
// reports modulo wall-clock duration.
|
||||
let env = TestEnv::lexical_only();
|
||||
let r_none = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
let r_none =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
|
||||
.unwrap();
|
||||
assert_eq!(r_none.scanned, 3);
|
||||
assert_eq!(r_none.new, 3);
|
||||
}
|
||||
@@ -134,12 +160,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
drop(rx);
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
}
|
||||
|
||||
/// v0.20.0 sub-item 1: pdf_ocr_started + pdf_ocr_finished events 가 PDF asset 의
|
||||
/// OCR-enabled ingest 시 emit 됨을 검증. real Ollama 의존 — `#[ignore]` default.
|
||||
///
|
||||
/// Manual invoke:
|
||||
/// ```
|
||||
/// KEBAB_PDF_OCR_ENABLED=true \
|
||||
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
/// cargo test -p kebab-app --test ingest_progress \
|
||||
/// --ignored pdf_ocr_progress_emits_started_finished_events
|
||||
/// ```
|
||||
#[test]
|
||||
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
|
||||
fn pdf_ocr_progress_emits_started_finished_events() {
|
||||
// F1 fixture (DCTDecode JPEG passthrough) 을 tmpdir 의 workspace 로 copy.
|
||||
let tmpdir = tempfile::tempdir().expect("create tmpdir");
|
||||
let workspace = tmpdir.path().join("workspace");
|
||||
std::fs::create_dir_all(&workspace).expect("create workspace dir");
|
||||
let f1_src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let f1 = std::fs::read(&f1_src).expect("F1 fixture present");
|
||||
std::fs::write(workspace.join("page1.pdf"), &f1).expect("copy F1");
|
||||
|
||||
let data_dir = tmpdir.path().join("data");
|
||||
std::fs::create_dir_all(&data_dir).expect("create data dir");
|
||||
|
||||
let mut config = kebab_config::Config::defaults();
|
||||
config.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
config.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
config.pdf.ocr.enabled = true;
|
||||
if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
|
||||
config.pdf.ocr.endpoint = Some(endpoint);
|
||||
}
|
||||
|
||||
let scope = kebab_core::SourceScope {
|
||||
root: workspace.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
|
||||
.expect("ingest_with_config_progress");
|
||||
|
||||
let events: Vec<_> = rx.iter().collect();
|
||||
|
||||
let started_count = events
|
||||
.iter()
|
||||
.filter(|e| matches!(e, IngestEvent::PdfOcrStarted { .. }))
|
||||
.count();
|
||||
let finished_count = events
|
||||
.iter()
|
||||
.filter(|e| matches!(e, IngestEvent::PdfOcrFinished { .. }))
|
||||
.count();
|
||||
|
||||
assert!(
|
||||
started_count >= 1,
|
||||
"PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
|
||||
);
|
||||
assert!(
|
||||
finished_count >= 1,
|
||||
"PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
|
||||
);
|
||||
assert_eq!(
|
||||
started_count, finished_count,
|
||||
"Started 와 Finished 의 count 일치"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -29,13 +29,15 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
|
||||
"## Body content\n\nMore.",
|
||||
"Article X",
|
||||
Some("https://example.com/x"),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(report.new, 1, "{report:?}");
|
||||
|
||||
// _external/ contains exactly one .md file with frontmatter.
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1);
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
@@ -50,17 +52,14 @@ fn ingest_stdin_without_source_uri() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = fresh_cfg(dir.path());
|
||||
|
||||
let report = kebab_app::ingest_stdin_with_config(
|
||||
cfg.clone(),
|
||||
"## Body",
|
||||
"Title",
|
||||
None,
|
||||
).unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
|
||||
assert_eq!(report.new, 1);
|
||||
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
assert!(content.contains("title: \"Title\""));
|
||||
|
||||
@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
|
||||
}
|
||||
kebab_app::init_workspace(true).expect("init_workspace");
|
||||
let cfg_path = kebab_config::Config::xdg_config_path();
|
||||
let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
|
||||
panic!("read config at {}: {e}", cfg_path.display())
|
||||
});
|
||||
let body = std::fs::read_to_string(&cfg_path)
|
||||
.unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
|
||||
assert!(
|
||||
body.contains("처리 가능한 형식"),
|
||||
"header lists supported types section: body=\n{body}"
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
|
||||
//! v0.20.0 sub-item 1 bugfix.
|
||||
//!
|
||||
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
|
||||
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
|
||||
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
|
||||
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
|
||||
|
||||
mod common;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_chunk::PdfPageV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
|
||||
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
|
||||
let fake_hash: String = hash_char.to_string().repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type: MediaType::Pdf,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_and_ocr(
|
||||
bytes: &[u8],
|
||||
path: &str,
|
||||
hash_char: char,
|
||||
engine: &dyn OcrEngine,
|
||||
) -> kebab_core::CanonicalDocument {
|
||||
let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
|
||||
canonical
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
|
||||
let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
||||
.expect("F1 fixture missing");
|
||||
let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
|
||||
.expect("F2 fixture missing");
|
||||
|
||||
// Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
|
||||
// byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
|
||||
// overlap_bytes = min(240, 750) = 240 / chars=80 → second chunk's actual_start
|
||||
// collapses to prev_min=0 without the fix → same #c0 suffix → chunk_id collision.
|
||||
let trigger_text = format!("{}. {}", "가".repeat(10), "나".repeat(500));
|
||||
|
||||
let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
|
||||
let f2_engine = MockOcrEngine::single(&trigger_text, false);
|
||||
|
||||
let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
|
||||
let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);
|
||||
|
||||
let chunk_policy = ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: PdfPageV1Chunker.chunker_version(),
|
||||
};
|
||||
|
||||
let f1_chunks = PdfPageV1Chunker
|
||||
.chunk(&f1_canonical, &chunk_policy)
|
||||
.unwrap();
|
||||
let f2_chunks = PdfPageV1Chunker
|
||||
.chunk(&f2_canonical, &chunk_policy)
|
||||
.unwrap();
|
||||
|
||||
assert!(
|
||||
f2_chunks.len() >= 2,
|
||||
"F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
|
||||
f2_chunks.len()
|
||||
);
|
||||
|
||||
let all_ids: Vec<&str> = f1_chunks
|
||||
.iter()
|
||||
.chain(f2_chunks.iter())
|
||||
.map(|c| c.chunk_id.0.as_str())
|
||||
.collect();
|
||||
let total = all_ids.len();
|
||||
let unique: HashSet<&str> = all_ids.iter().copied().collect();
|
||||
assert_eq!(
|
||||
unique.len(),
|
||||
total,
|
||||
"all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
|
||||
unique.len(),
|
||||
total,
|
||||
);
|
||||
}
|
||||
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
//! Integration smoke tests for `kebab inspect ocr-stats / ocr-failures`.
|
||||
//! AC-4, AC-5, AC-6, AC-11 (ocr_inspect_smoke binary), AC-13.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::App;
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// Insert synthetic pdf_ocr_events rows directly so the test runs without
|
||||
/// a live Ollama endpoint.
|
||||
fn seed_ocr_events(env: &TestEnv, store: &SqliteStore) {
|
||||
// Success rows
|
||||
for i in 0..3u32 {
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-aaa",
|
||||
&format!("2026-05-28T0{i}:00:00Z"),
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
i + 1,
|
||||
Some(50_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
100 + u64::from(i) * 20,
|
||||
42,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed success row");
|
||||
}
|
||||
// Failure row
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-bbb",
|
||||
"2026-05-28T10:00:00Z",
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
4,
|
||||
Some(30_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
9999,
|
||||
0,
|
||||
false,
|
||||
Some("ocr_error"),
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed failure row");
|
||||
// Row for different doc
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-ccc",
|
||||
"2026-05-28T11:00:00Z",
|
||||
Some("doc-xyz"),
|
||||
"path/other.pdf",
|
||||
1,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
200,
|
||||
10,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed doc-xyz row");
|
||||
// Trigger migration (no-op if already done via App::open_with_config)
|
||||
let _ = env;
|
||||
}
|
||||
|
||||
fn open_app_with_seeded_events(env: &TestEnv) -> App {
|
||||
let app = env.app();
|
||||
let store = SqliteStore::open(&env.config).expect("open store for seed");
|
||||
store.run_migrations().expect("run migrations for seed");
|
||||
seed_ocr_events(env, &store);
|
||||
app
|
||||
}
|
||||
|
||||
/// AC-4: `inspect_ocr_stats` returns `schema_version = "ocr_stats.v1"`,
|
||||
/// `total_events >= 1`, `0 ≤ success_rate ≤ 1`.
|
||||
#[test]
|
||||
fn ocr_stats_after_seeded_events() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let app = open_app_with_seeded_events(&env);
|
||||
|
||||
let stats = app.inspect_ocr_stats().expect("inspect_ocr_stats");
|
||||
|
||||
assert_eq!(stats.schema_version, "ocr_stats.v1");
|
||||
assert!(stats.total_events >= 1, "total_events should be >= 1");
|
||||
assert!(
|
||||
(0.0..=1.0).contains(&stats.success_rate),
|
||||
"success_rate must be in [0, 1]: {}",
|
||||
stats.success_rate
|
||||
);
|
||||
assert!(stats.total_runs >= 1, "total_runs should be >= 1");
|
||||
// by_engine should have at least one entry
|
||||
assert!(!stats.by_engine.is_empty(), "by_engine must be non-empty");
|
||||
}
|
||||
|
||||
/// AC-6: `inspect_ocr_failures` (no doc_id, corpus-wide) returns failures list.
|
||||
#[test]
|
||||
fn ocr_failures_corpus_wide() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let app = open_app_with_seeded_events(&env);
|
||||
|
||||
let result = app
|
||||
.inspect_ocr_failures(None, 10)
|
||||
.expect("inspect_ocr_failures");
|
||||
|
||||
assert_eq!(result.schema_version, "ocr_failures.v1");
|
||||
assert!(result.failure_count >= 1, "expected at least 1 failure");
|
||||
assert!(
|
||||
!result.failures.is_empty(),
|
||||
"failures list must be non-empty"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC-5: `inspect_ocr_failures` with doc_id filter returns matching rows.
|
||||
#[test]
|
||||
fn ocr_failures_filter_by_doc_id() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let app = open_app_with_seeded_events(&env);
|
||||
|
||||
let result = app
|
||||
.inspect_ocr_failures(Some("doc-abc"), 10)
|
||||
.expect("inspect_ocr_failures by doc_id");
|
||||
|
||||
assert_eq!(result.schema_version, "ocr_failures.v1");
|
||||
assert_eq!(
|
||||
result.doc_id.as_deref(),
|
||||
Some("doc-abc"),
|
||||
"doc_id must be echoed back"
|
||||
);
|
||||
// All rows must belong to doc-abc (no cross-doc leak)
|
||||
for row in &result.failures {
|
||||
// rows are failure rows for doc-abc only (reason = ocr_error)
|
||||
assert_eq!(row.reason, "ocr_error");
|
||||
}
|
||||
}
|
||||
|
||||
/// AC-13: SKILL.md lists both new wire schemas.
|
||||
#[test]
|
||||
fn skill_md_lists_new_schemas() {
|
||||
let skill_md = std::fs::read_to_string("../../integrations/claude-code/kebab/SKILL.md")
|
||||
.expect("read SKILL.md");
|
||||
assert!(
|
||||
skill_md.contains("ocr_stats.v1"),
|
||||
"SKILL.md must mention ocr_stats.v1"
|
||||
);
|
||||
assert!(
|
||||
skill_md.contains("ocr_failures.v1"),
|
||||
"SKILL.md must mention ocr_failures.v1"
|
||||
);
|
||||
}
|
||||
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
//! Tests for `App::open_with_config`'s NLI verifier construction path.
|
||||
//!
|
||||
//! Coverage:
|
||||
//! 1. `open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive` —
|
||||
//! when `rag.nli_threshold > 0` and `storage.model_dir` is unwritable,
|
||||
//! `open_with_config` returns `Err` with "OnnxNliVerifier" in the
|
||||
//! error chain.
|
||||
//! 2. `open_with_config_nli_skipped_when_threshold_zero` —
|
||||
//! same bad `model_dir`, but `rag.nli_threshold = 0.0` (gate disabled),
|
||||
//! so `OnnxNliVerifier::new` is never called and `open_with_config`
|
||||
//! succeeds.
|
||||
//!
|
||||
//! `/proc/1/root` is the init process's filesystem root; on Linux it is
|
||||
//! owned by root and not traversable by unprivileged users, making
|
||||
//! `create_dir_all` fail with `EACCES` — a reliable "unwritable path"
|
||||
//! that requires no test setup beyond the path literal.
|
||||
|
||||
use kebab_config::Config;
|
||||
|
||||
/// Return a `Config` whose `data_dir` lives in a fresh `TempDir`
|
||||
/// (so `SqliteStore::open` succeeds) and whose `model_dir` is set to
|
||||
/// `/proc/1/root` (unwritable by non-root processes on Linux).
|
||||
///
|
||||
/// The `TempDir` is returned alongside the config so the caller keeps
|
||||
/// it alive until the test completes — dropping it early would delete
|
||||
/// the data directory before any assertions run.
|
||||
fn config_with_unwritable_model_dir() -> (tempfile::TempDir, Config) {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let mut cfg = Config::defaults();
|
||||
// Valid data_dir → SqliteStore::open + run_migrations succeed.
|
||||
cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();
|
||||
// /proc/1/root is only accessible to root; create_dir_all will
|
||||
// return EACCES for any unprivileged user, which is exactly the
|
||||
// failure mode we want to exercise.
|
||||
cfg.storage.model_dir = "/proc/1/root".to_string();
|
||||
(tmp, cfg)
|
||||
}
|
||||
|
||||
// ── 1. Failure path: threshold > 0 + unwritable model_dir ─────────────────
|
||||
|
||||
#[test]
|
||||
fn open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive() {
|
||||
let (_tmp, mut cfg) = config_with_unwritable_model_dir();
|
||||
cfg.rag.nli_threshold = 0.5; // gate enabled → OnnxNliVerifier::new runs
|
||||
|
||||
let result = kebab_app::App::open_with_config(cfg);
|
||||
|
||||
let Err(err) = result else {
|
||||
panic!(
|
||||
"App::open_with_config must fail when model_dir is unwritable and nli_threshold > 0"
|
||||
);
|
||||
};
|
||||
// The error chain must identify the OnnxNliVerifier as the source so
|
||||
// an operator reading logs can trace the failure to the NLI config.
|
||||
let err_chain = format!("{err:?}");
|
||||
assert!(
|
||||
err_chain.contains("OnnxNliVerifier"),
|
||||
"error chain must mention OnnxNliVerifier; full chain: {err_chain}"
|
||||
);
|
||||
}
|
||||
|
||||
// ── 2. Success path: threshold = 0.0 → NLI verifier never constructed ──────
|
||||
|
||||
#[test]
|
||||
fn open_with_config_nli_skipped_when_threshold_zero() {
|
||||
let (_tmp, cfg) = config_with_unwritable_model_dir();
|
||||
// Default nli_threshold is 0.0 — gate disabled, verifier skipped.
|
||||
assert!(
|
||||
(cfg.rag.nli_threshold - 0.0).abs() < f32::EPSILON,
|
||||
"precondition: default nli_threshold must be 0.0 (gate disabled)"
|
||||
);
|
||||
|
||||
// A bad model_dir must NOT cause a failure when the NLI gate is off.
|
||||
let result = kebab_app::App::open_with_config(cfg);
|
||||
assert!(
|
||||
result.is_ok(),
|
||||
"App::open_with_config must succeed when nli_threshold = 0.0 \
|
||||
(OnnxNliVerifier is never constructed); err: {:?}",
|
||||
result.err()
|
||||
);
|
||||
}
|
||||
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,358 @@
|
||||
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_core::{
|
||||
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
|
||||
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── Fixture helpers ───────────────────────────────────────────────────────
|
||||
|
||||
fn f1_pdf_bytes() -> Vec<u8> {
|
||||
std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
||||
.expect("F1 fixture missing")
|
||||
}
|
||||
|
||||
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
|
||||
let fake_hash = "0".repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash.clone()),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
|
||||
/// F1 (scanned) returns an empty-text Block::Paragraph per page.
|
||||
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
|
||||
let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
|
||||
}
|
||||
|
||||
/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
|
||||
fn canonical_with_empty_block() -> CanonicalDocument {
|
||||
extract_canonical_from_bytes(&f1_pdf_bytes())
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
|
||||
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let char_count = text.chars().count() as u32;
|
||||
tb.text = text.to_string();
|
||||
tb.inlines = vec![Inline::Text {
|
||||
text: text.to_string(),
|
||||
}];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
|
||||
fn canonical_with_mojibake_block() -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
|
||||
let char_count = pua.chars().count() as u32;
|
||||
tb.text = pua.clone();
|
||||
tb.inlines = vec![Inline::Text { text: pua }];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
fn default_opts(enabled: bool) -> PdfOcrOpts {
|
||||
PdfOcrOpts {
|
||||
enabled,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Test 1: F1 + enabled=true → in-place mutate
|
||||
#[test]
|
||||
fn f1_input_with_ocr_enabled_replaces_empty_block() {
|
||||
let bytes = f1_pdf_bytes();
|
||||
let mut canonical = canonical_with_empty_block();
|
||||
let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: Some(Lang("kor".into())),
|
||||
cancel: None,
|
||||
};
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 1);
|
||||
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
||||
Block::Paragraph(tb) => Some(tb),
|
||||
_ => None,
|
||||
});
|
||||
assert!(first_para.is_some());
|
||||
assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
|
||||
}
|
||||
|
||||
// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false)
|
||||
#[test]
|
||||
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
|
||||
let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
|
||||
let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
|
||||
let mut canonical = canonical_with_filled_block(text);
|
||||
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||
let opts = default_opts(true);
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");
|
||||
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
||||
Block::Paragraph(tb) => Some(tb),
|
||||
_ => None,
|
||||
});
|
||||
if let Some(tb) = first_para {
|
||||
assert!(tb.text.starts_with("충분한"), "원본 text 보존");
|
||||
}
|
||||
}
|
||||
|
||||
// Test 3: F1 + enabled=false → no-op
|
||||
#[test]
|
||||
fn f1_input_with_ocr_disabled_keeps_empty_block() {
|
||||
let bytes = f1_pdf_bytes();
|
||||
let mut canonical = canonical_with_empty_block();
|
||||
let engine = MockOcrEngine::single("IGNORED", false);
|
||||
let opts = default_opts(false);
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 0);
|
||||
assert_eq!(summary.ms_total, 0);
|
||||
}
|
||||
|
||||
// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate
|
||||
#[test]
|
||||
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
|
||||
let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
|
||||
let mut canonical = canonical_with_mojibake_block();
|
||||
let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");
|
||||
let first_para = canonical.blocks.iter().find_map(|b| match b {
|
||||
Block::Paragraph(tb) => Some(tb),
|
||||
_ => None,
|
||||
});
|
||||
if let Some(tb) = first_para {
|
||||
assert_eq!(tb.text, "OCR_MOJIBAKE_REPLACEMENT");
|
||||
}
|
||||
}
|
||||
|
||||
// Test 5: filled canonical + always_on=true → dual-block (+1 OCR block)
|
||||
#[test]
|
||||
fn f3_input_with_always_on_pushes_dual_blocks() {
|
||||
let bytes = f1_pdf_bytes();
|
||||
let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
|
||||
let mut canonical = canonical_with_filled_block(text);
|
||||
let original_block_count = canonical.blocks.len();
|
||||
let engine = MockOcrEngine::single("OCR_DUAL", false);
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: true,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 1);
|
||||
assert_eq!(
|
||||
canonical.blocks.len(),
|
||||
original_block_count + 1,
|
||||
"always_on 시 새 Block::Paragraph push"
|
||||
);
|
||||
let texts: Vec<&str> = canonical
|
||||
.blocks
|
||||
.iter()
|
||||
.filter_map(|b| match b {
|
||||
Block::Paragraph(tb) => Some(tb.text.as_str()),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
assert!(texts.contains(&"OCR_DUAL"), "OCR block 포함");
|
||||
assert!(
|
||||
texts.iter().any(|t| t.starts_with("vector")),
|
||||
"원본 text-detect block 보존"
|
||||
);
|
||||
}
|
||||
|
||||
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → skip + warning
|
||||
#[test]
|
||||
fn f6_flatedecode_skipped_with_warning() {
|
||||
let bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
|
||||
.expect("F6 fixture missing");
|
||||
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
||||
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||
let opts = default_opts(true);
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
summary.pages_ocrd, 0,
|
||||
"FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
|
||||
);
|
||||
let warning_count = canonical
|
||||
.provenance
|
||||
.events
|
||||
.iter()
|
||||
.filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
|
||||
.count();
|
||||
assert!(warning_count >= 1, "FlateDecode skip 시 Warning event 발행");
|
||||
}
|
||||
|
||||
// Test 7: F7 CCITTFax → skip + warning (verifier M-4 split)
|
||||
#[test]
|
||||
fn f7_ccittfax_skipped_with_warning() {
|
||||
let bytes =
|
||||
std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
|
||||
let mut canonical = canonical_with_empty_block(); // page-1 block from F1
|
||||
let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
|
||||
let opts = default_opts(true);
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");
|
||||
let warning_count = canonical
|
||||
.provenance
|
||||
.events
|
||||
.iter()
|
||||
.filter(|e| e.kind == kebab_core::ProvenanceKind::Warning)
|
||||
.count();
|
||||
assert!(warning_count >= 1, "CCITTFax skip 시 Warning event 발행");
|
||||
}
|
||||
|
||||
// Test 8: OCR engine failure → warning event + skip
|
||||
#[test]
|
||||
fn ocr_engine_failure_surfaces_as_warning() {
|
||||
let bytes = f1_pdf_bytes();
|
||||
let mut canonical = canonical_with_empty_block();
|
||||
let engine = MockOcrEngine::single("", true);
|
||||
let opts = default_opts(true);
|
||||
|
||||
let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");
|
||||
let warning_with_failure = canonical.provenance.events.iter().any(|e| {
|
||||
e.kind == kebab_core::ProvenanceKind::Warning
|
||||
&& e.note.as_deref().unwrap_or("").contains("mock failure")
|
||||
});
|
||||
assert!(
|
||||
warning_with_failure,
|
||||
"OCR failure 의 error message 가 warning event 의 note 안"
|
||||
);
|
||||
}
|
||||
|
||||
// Test 9: dual-block ordinals are deterministic and unique
|
||||
#[test]
|
||||
fn dual_block_ordinals_are_deterministic_and_unique() {
|
||||
let bytes = f1_pdf_bytes(); // 1-page PDF → page_count=1
|
||||
let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
|
||||
let mut canonical = canonical_with_filled_block(text);
|
||||
let engine = MockOcrEngine::single("DUAL", false);
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: true,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
|
||||
apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();
|
||||
|
||||
// page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
|
||||
let para_count = canonical
|
||||
.blocks
|
||||
.iter()
|
||||
.filter(|b| matches!(b, Block::Paragraph(_)))
|
||||
.count();
|
||||
assert_eq!(para_count, 2, "dual-block: text-detect + OCR");
|
||||
|
||||
let all_page_1 = canonical
|
||||
.blocks
|
||||
.iter()
|
||||
.filter_map(|b| match b {
|
||||
Block::Paragraph(tb) => Some(&tb.common.source_span),
|
||||
_ => None,
|
||||
})
|
||||
.all(|s| matches!(s, SourceSpan::Page { page: 1, .. }));
|
||||
assert!(all_page_1, "두 block 모두 page=1");
|
||||
}
|
||||
|
||||
// Test 10: cancel handle aborts mid-PDF
|
||||
#[test]
|
||||
fn cancel_handle_aborts_mid_pdf() {
|
||||
let bytes = f1_pdf_bytes();
|
||||
let mut canonical = canonical_with_empty_block();
|
||||
let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
|
||||
let engine = MockOcrEngine::single("IGNORED", false);
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: Some(cancel.clone()),
|
||||
};
|
||||
|
||||
let result = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {});
|
||||
let err = result.expect_err("cancel=true 시 error 반환");
|
||||
assert!(
|
||||
format!("{err}").contains("cancelled mid-PDF"),
|
||||
"error message 가 'cancelled mid-PDF' 포함: {err}"
|
||||
);
|
||||
}
|
||||
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
//! Integration smoke test: dual-write (ndjson + SQLite) for PDF OCR events.
|
||||
//! AC-3: SQLite row count and doc_id matches ndjson LogEvent::Ocr.
|
||||
//!
|
||||
//! Uses wiremock to stub the Ollama `/api/generate` endpoint so the test
|
||||
//! runs without a live Ollama instance.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_config::LoggingCfg;
|
||||
use serde_json::Value;
|
||||
use tokio::task::spawn_blocking;
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
fn scanned_pdf_src() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
||||
}
|
||||
|
||||
/// AC-3: ndjson OCR line count == pdf_ocr_events row count, and doc_id matches.
|
||||
#[tokio::test]
|
||||
async fn ingest_dual_write_doc_id_matches_ndjson() {
|
||||
let src = scanned_pdf_src();
|
||||
if !src.exists() {
|
||||
eprintln!("skipping test: scanned_page1.pdf fixture not found");
|
||||
return;
|
||||
}
|
||||
|
||||
let server = MockServer::start().await;
|
||||
// Stub Ollama /api/generate to return a minimal OCR response.
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
|
||||
"model": "qwen2.5vl:3b",
|
||||
"response": "test ocr output",
|
||||
"done": true,
|
||||
"done_reason": "stop"
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let mock_url = server.uri();
|
||||
|
||||
let result = spawn_blocking(move || {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
// Enable PDF OCR + set up mock endpoint
|
||||
env.config.pdf.ocr.enabled = true;
|
||||
env.config.pdf.ocr.endpoint = Some(mock_url.clone());
|
||||
env.config.pdf.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
// Enable ingest log
|
||||
let log_dir = env.temp.path().join("logs");
|
||||
std::fs::create_dir_all(&log_dir).unwrap();
|
||||
env.config.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Copy scanned PDF into workspace
|
||||
let dest = env.workspace_root.join("scanned.pdf");
|
||||
std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF");
|
||||
|
||||
// Run ingest
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
|
||||
|
||||
// Read ndjson log
|
||||
let log_files: Vec<_> = std::fs::read_dir(&log_dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
let name = e.file_name().to_string_lossy().to_string();
|
||||
name.starts_with("ingest-") && name.ends_with(".ndjson")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(log_files.len(), 1, "expected 1 ndjson log file");
|
||||
|
||||
let body = std::fs::read_to_string(log_files[0].path()).unwrap();
|
||||
let ocr_lines: Vec<Value> = body
|
||||
.lines()
|
||||
.filter_map(|l| serde_json::from_str(l).ok())
|
||||
.filter(|v: &Value| v.get("kind").and_then(Value::as_str) == Some("ocr"))
|
||||
.collect();
|
||||
|
||||
// Read pdf_ocr_events from SQLite
|
||||
let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open db");
|
||||
let rows: Vec<(Option<String>, String)> = {
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT doc_id, doc_path FROM pdf_ocr_events ORDER BY id")
|
||||
.expect("prepare");
|
||||
stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?)))
|
||||
.expect("query")
|
||||
.map(|r| r.expect("row"))
|
||||
.collect()
|
||||
};
|
||||
|
||||
(ocr_lines, rows)
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
|
||||
let (ocr_lines, rows) = result;
|
||||
|
||||
// At least one OCR event must be produced
|
||||
assert!(!ocr_lines.is_empty(), "expected ≥1 ndjson ocr line");
|
||||
assert!(!rows.is_empty(), "expected ≥1 pdf_ocr_events row");
|
||||
|
||||
// Row counts must match
|
||||
assert_eq!(
|
||||
ocr_lines.len(),
|
||||
rows.len(),
|
||||
"ndjson ocr lines ({}) must equal pdf_ocr_events rows ({})",
|
||||
ocr_lines.len(),
|
||||
rows.len()
|
||||
);
|
||||
|
||||
// doc_id in both sources must be non-null and consistent
|
||||
for (line, (sql_doc_id, _sql_doc_path)) in ocr_lines.iter().zip(rows.iter()) {
|
||||
let json_doc_id = line.get("doc_id").and_then(Value::as_str);
|
||||
assert!(
|
||||
json_doc_id.is_some(),
|
||||
"ndjson ocr line should have doc_id: {line}"
|
||||
);
|
||||
assert!(
|
||||
sql_doc_id.is_some(),
|
||||
"pdf_ocr_events row should have doc_id"
|
||||
);
|
||||
assert_eq!(
|
||||
json_doc_id,
|
||||
sql_doc_id.as_deref(),
|
||||
"ndjson doc_id must equal SQLite doc_id"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
operations: vec![
|
||||
Operation::new("BT", vec![]),
|
||||
Operation::new("Tf", vec!["F1".into(), 24.into()]),
|
||||
Operation::new(
|
||||
"Td",
|
||||
vec![Object::Integer(100), Object::Integer(700)],
|
||||
),
|
||||
Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
|
||||
Operation::new("Tj", vec![Object::string_literal(*text)]),
|
||||
Operation::new("ET", vec![]),
|
||||
],
|
||||
};
|
||||
let stream_data = content.encode().expect("content encode");
|
||||
let content_id =
|
||||
doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
page_dict.set("Contents", content_id);
|
||||
}
|
||||
let page_id = doc.add_object(page_dict);
|
||||
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
Object::Integer(842),
|
||||
],
|
||||
};
|
||||
doc.objects
|
||||
.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
write_pdf(&env.workspace_root, "three.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
|
||||
assert_eq!(report.errors, 0);
|
||||
let items = report.items.as_ref().expect("items present");
|
||||
@@ -157,23 +151,30 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
.find(|i| i.doc_path.0.ends_with("three.pdf"))
|
||||
.expect("PDF item present");
|
||||
assert_eq!(pdf_item.kind, IngestItemKind::New);
|
||||
assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
|
||||
assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
|
||||
assert_eq!(
|
||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
pdf_item.block_count,
|
||||
Some(3),
|
||||
"one Block::Paragraph per page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(3),
|
||||
"one chunk per non-empty page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("pdf-text-v1")
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
Some("pdf-page-v1")
|
||||
Some("pdf-page-v1.1")
|
||||
);
|
||||
|
||||
// Inspect the stored doc to confirm SourceSpan::Page round-trip.
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for (i, block) in doc.blocks.iter().enumerate() {
|
||||
let want_page = (i as u32) + 1;
|
||||
@@ -202,8 +203,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
write_pdf(&env.workspace_root, "stable.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -214,8 +214,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
.unwrap();
|
||||
assert_eq!(item1.kind, IngestItemKind::New);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item2 = report2
|
||||
.items
|
||||
.unwrap()
|
||||
@@ -239,8 +238,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
std::fs::write(&path, &bytes_v1).unwrap();
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report_v1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let id_v1 = report_v1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -252,12 +250,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
.clone()
|
||||
.unwrap();
|
||||
|
||||
let bytes_v2 =
|
||||
build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
std::fs::write(&path, &bytes_v2).unwrap();
|
||||
|
||||
let report_v2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item_v2 = report_v2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -282,9 +278,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
|
||||
write_pdf(&env.workspace_root, "secret.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"encrypted PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -310,9 +308,11 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"corrupt PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -322,11 +322,8 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
|
||||
// Confirm the doc was NOT stored — list_docs returns nothing for
|
||||
// this path.
|
||||
let summaries = kebab_app::list_docs_with_config(
|
||||
cfg,
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(
|
||||
!summaries
|
||||
.iter()
|
||||
@@ -341,14 +338,15 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
#[test]
|
||||
fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 0,
|
||||
"scanned candidate is a Warning, not Error"
|
||||
);
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -365,14 +363,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(2),
|
||||
"pdf-page-v1 emits 0 chunks for the empty page; total = 2"
|
||||
"pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
|
||||
);
|
||||
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let warnings: Vec<_> = doc
|
||||
.provenance
|
||||
.events
|
||||
@@ -419,8 +413,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
|
||||
write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let total = report.new + report.updated + report.skipped + report.errors;
|
||||
assert_eq!(
|
||||
report.scanned, total,
|
||||
@@ -441,14 +434,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
let pages: Vec<String> = (1..=50)
|
||||
.map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
|
||||
.collect();
|
||||
let page_refs: Vec<Option<&str>> =
|
||||
pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let bytes = build_text_pdf(&page_refs);
|
||||
write_pdf(&env.workspace_root, "long.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0);
|
||||
let pdf_item = report
|
||||
.items
|
||||
@@ -466,8 +457,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
|
||||
// Round-trip: list_docs sees the long PDF.
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
|
||||
}
|
||||
|
||||
@@ -476,13 +466,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
#[test]
|
||||
fn inspect_doc_surfaces_page_spans() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -490,19 +478,15 @@ fn inspect_doc_surfaces_page_spans() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
// v0.26.2: stored parser_version is now `pdf-text-v1|<ingest-config-sig>`
|
||||
// (the signature folds chunking / pdf.ocr settings for skip detection).
|
||||
// Assert the base identity by taking the prefix before the first '|'.
|
||||
assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for block in &doc.blocks {
|
||||
match block {
|
||||
Block::Paragraph(p) => assert!(matches!(
|
||||
p.common.source_span,
|
||||
SourceSpan::Page { .. }
|
||||
)),
|
||||
Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
|
||||
other => panic!("expected Paragraph, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,19 +78,15 @@ fn reset_orphans_only_purges_out_of_scope_docs() {
|
||||
narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];
|
||||
|
||||
// Run orphans-only reset.
|
||||
let report = execute(ResetScope::OrphansOnly, &narrow_cfg)
|
||||
.expect("orphans-only reset must succeed");
|
||||
let report =
|
||||
execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");
|
||||
|
||||
assert_eq!(
|
||||
report.orphans_purged, 2,
|
||||
"expected 2 orphans purged (b.rs + c.rs): {report:?}"
|
||||
);
|
||||
|
||||
let mut purged: Vec<String> = report
|
||||
.purged_paths
|
||||
.iter()
|
||||
.map(|p| p.0.clone())
|
||||
.collect();
|
||||
let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
|
||||
purged.sort();
|
||||
assert_eq!(
|
||||
purged,
|
||||
|
||||
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
|
||||
|
||||
use kebab_app::schema_with_config;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.chunking.target_tokens = 80;
|
||||
cfg.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
|
||||
SourceScope {
|
||||
root: workspace_root.to_path_buf(),
|
||||
include: vec![],
|
||||
exclude: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_models_active_arrays_empty_on_empty_corpus() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
s.models.active_parsers.is_empty(),
|
||||
"empty corpus → no parsers"
|
||||
);
|
||||
assert!(
|
||||
s.models.active_chunkers.is_empty(),
|
||||
"empty corpus → no chunkers"
|
||||
);
|
||||
// backward compat: 기존 단일 field 는 markdown default 보존.
|
||||
assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
let scope = minimal_scope(&workspace);
|
||||
|
||||
kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
!s.models.active_parsers.is_empty(),
|
||||
"active_parsers populated after ingest"
|
||||
);
|
||||
assert!(
|
||||
!s.models.active_chunkers.is_empty(),
|
||||
"active_chunkers populated after ingest"
|
||||
);
|
||||
// active arrays must be sorted (ORDER BY in SQL).
|
||||
let mut sorted = s.models.active_parsers.clone();
|
||||
sorted.sort();
|
||||
assert_eq!(
|
||||
s.models.active_parsers, sorted,
|
||||
"active_parsers must be sorted"
|
||||
);
|
||||
}
|
||||
@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
|
||||
schema.wire.schemas
|
||||
);
|
||||
assert!(schema.capabilities.json_mode);
|
||||
assert!(!schema.capabilities.streaming_ask);
|
||||
assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
|
||||
assert!(
|
||||
schema.capabilities.mcp_server,
|
||||
"mcp_server should be true after fb-30",
|
||||
|
||||
@@ -27,7 +27,10 @@ fn search_with_opts_no_budget_matches_search() {
|
||||
|
||||
assert_eq!(resp.hits.len(), baseline.len());
|
||||
assert!(!resp.truncated);
|
||||
assert!(resp.next_cursor.is_none(), "k=5 against 1 doc → no next page");
|
||||
assert!(
|
||||
resp.next_cursor.is_none(),
|
||||
"k=5 against 1 doc → no next page"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -62,7 +65,11 @@ fn budget_truncates_snippets_when_below_threshold() {
|
||||
fn cursor_paginates_to_next_page() {
|
||||
let env = common::TestEnv::new();
|
||||
for i in 0..6 {
|
||||
common::ingest_md(&env, &format!("d{i}.md"), &format!("# T{i}\n\nrust topic {i}\n"));
|
||||
common::ingest_md(
|
||||
&env,
|
||||
&format!("d{i}.md"),
|
||||
&format!("# T{i}\n\nrust topic {i}\n"),
|
||||
);
|
||||
}
|
||||
let app = env.app();
|
||||
|
||||
@@ -88,7 +95,10 @@ fn cursor_paginates_to_next_page() {
|
||||
page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
let p2_ids: std::collections::HashSet<_> =
|
||||
page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
assert!(p1_ids.is_disjoint(&p2_ids), "page 2 must not repeat page 1 hits");
|
||||
assert!(
|
||||
p1_ids.is_disjoint(&p2_ids),
|
||||
"page 2 must not repeat page 1 hits"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -46,3 +46,152 @@ fn korean_lexical_query_returns_korean_document() {
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — multi-token Korean query (`해시 충돌`) must hit when
|
||||
/// the lexical builder routes it through a whole-phrase MATCH candidate.
|
||||
///
|
||||
/// Expected: FAIL until A5 (`build_match_string` redesign) lands — the
|
||||
/// current builder emits `"해시" "충돌"` AND, but FTS5 trigram tokenizer
|
||||
/// has no 2-char terms so each side is 0-hit. A5 introduces a whole-
|
||||
/// phrase candidate (`"해시 충돌"`) OR'd with the token AND, restoring
|
||||
/// hits for the dominant Korean usage pattern.
|
||||
#[test]
|
||||
fn lexical_multi_token_korean_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Copy the synthetic Korean fixture (introduced in A4 Step 0) into
|
||||
// the test workspace. The fixture contains the exact phrase
|
||||
// "해시 충돌" multiple times.
|
||||
let dest = env.workspace_root.join("hash-table.md");
|
||||
let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("search")
|
||||
.join("korean")
|
||||
.join("hash-table.md");
|
||||
std::fs::copy(&src, &dest).expect("copy korean fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"multi-token Korean query '해시 충돌' must hit the hash-table fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_hash_table = hits.iter().any(|h| h.doc_path.0.contains("hash-table"));
|
||||
assert!(
|
||||
any_hash_table,
|
||||
"expected at least one hit on the hash-table fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — mixed Korean+English multi-token query (`Rust 충돌은`).
|
||||
/// Both tokens are ≥3 chars, so the redesigned builder (A5) emits
|
||||
/// `("Rust 충돌은") OR ("Rust" AND "충돌은")`. With trigram tokenizer
|
||||
/// each side has substring coverage in the document, so the AND branch
|
||||
/// alone is enough. Expected: FAIL pre-A5, PASS post-A5.
|
||||
#[test]
|
||||
fn lexical_mixed_korean_english_multi_token_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-hash.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 해시 테이블\n\nRust 의 std::collections::HashMap 에서 \
|
||||
해시 충돌은 SipHash 로 완화한다.\n",
|
||||
)
|
||||
.expect("write rust-hash fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"mixed Korean+English multi-token query 'Rust 충돌은' must hit the rust-hash fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_rust_hash = hits.iter().any(|h| h.doc_path.0.contains("rust-hash"));
|
||||
assert!(
|
||||
any_rust_hash,
|
||||
"expected at least one hit on the rust-hash fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한국어 2자 query 가 end-to-end
|
||||
/// lexical 경로에서 hit. lindera ko-dic 이 '한국어를' → '한국어' 형태소로
|
||||
/// 분해, '서울은' → '서울' 로 분해하여 tokenized_korean_text column 에
|
||||
/// 기록 → FTS5 매칭.
|
||||
#[test]
|
||||
fn korean_morphological_2char_query_lexical_mode() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("korean-wiki.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n",
|
||||
)
|
||||
.expect("write korean-wiki fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
|
||||
.expect("search 한국");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
|
||||
.expect("search 서울");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'서울' 2-char Korean query must return at least one hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한-영 혼합 query lexical hit.
|
||||
/// 'Rust' (English whole-token) + '최적화' (Korean morpheme) 각각 hit.
|
||||
#[test]
|
||||
fn korean_morphological_mixed_english_korean_query() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-optimization.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n",
|
||||
)
|
||||
.expect("write rust-optimization fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
|
||||
.expect("search Rust");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'Rust' English whole-token must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
.expect("search 최적화");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'최적화' Korean morpheme must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
|
||||
fn lexical_search_empty_query_returns_empty() {
|
||||
let env = TestEnv::lexical_only();
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" "))
|
||||
.unwrap();
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap();
|
||||
assert!(hits.is_empty(), "blank query must short-circuit empty");
|
||||
}
|
||||
|
||||
@@ -107,20 +107,25 @@ fn search_uncached_returns_same_hits_as_cached() {
|
||||
#[test]
|
||||
fn first_ingest_bumps_corpus_revision() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let store_before =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store_before.run_migrations().unwrap();
|
||||
assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");
|
||||
// V004 seeds 0; V009 + V010 + V011 migrations each bump by 1 to
|
||||
// invalidate stale LRU caches (spec §5.2). Baseline before ingest = 3.
|
||||
// (V012 derivation_cache + V013 drop-chunk-aliases are structural/additive
|
||||
// — neither bumps corpus_revision.)
|
||||
let baseline = store_before.corpus_revision();
|
||||
assert_eq!(baseline, 3, "fresh store post-V011 baseline = 3");
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
|
||||
|
||||
let store_after =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() >= 1,
|
||||
"ingest commit must bump corpus_revision (got {})",
|
||||
report.new + report.updated > 0,
|
||||
"first ingest must commit ≥1 doc"
|
||||
);
|
||||
|
||||
let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() > baseline,
|
||||
"ingest commit must bump corpus_revision past baseline {baseline} (got {})",
|
||||
store_after.corpus_revision(),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -29,7 +29,9 @@ fn fresh_doc_is_not_stale_with_default_threshold() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"freshly-ingested doc must not be stale at default 30d threshold: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -50,7 +52,9 @@ fn threshold_zero_disables_staleness() {
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"threshold=0 disables staleness even for year-old docs: {:?}",
|
||||
hits.iter().map(|h| (h.doc_path.0.clone(), h.stale)).collect::<Vec<_>>()
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,12 +14,11 @@ use common::TestEnv;
|
||||
fn require_avx_or_panic() {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if !std::is_x86_feature_detected!("avx") {
|
||||
panic!(
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
std::is_x86_feature_detected!("avx"),
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -57,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
|
||||
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
|
||||
|
||||
let report = kebab_app::ingest_with_config(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
let items = report.items.as_ref().expect("items array populated");
|
||||
let docx_item = items
|
||||
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
vec!["unsupported media type: <no-ext>".to_string()],
|
||||
);
|
||||
assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
|
||||
assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
|
||||
assert_eq!(
|
||||
report.skipped_by_extension.get("<no-ext>").copied(),
|
||||
Some(1)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -44,8 +44,8 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
std::fs::write(dir_b.join("note.md"), content).unwrap();
|
||||
|
||||
// Ingest all files (fixture workspace + our two new twins).
|
||||
let report = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("ingest must succeed");
|
||||
let report =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
|
||||
assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");
|
||||
|
||||
// Both twin paths must appear as New in the report.
|
||||
@@ -53,8 +53,7 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("src_a/note.md")
|
||||
|| i.doc_path.0.ends_with("src_b/note.md")
|
||||
i.doc_path.0.ends_with("src_a/note.md") || i.doc_path.0.ends_with("src_b/note.md")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(
|
||||
@@ -149,7 +148,10 @@ fn twin_files_fetch_span_uses_correct_asset() {
|
||||
// at either twin, making one twin's span fetch behave incorrectly.
|
||||
let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(report2.errors, 0, "no ingest errors on second run; report={report2:?}");
|
||||
assert_eq!(
|
||||
report2.errors, 0,
|
||||
"no ingest errors on second run; report={report2:?}"
|
||||
);
|
||||
|
||||
// Re-open app after second ingest and verify span still works on both.
|
||||
let app2 = env.app();
|
||||
|
||||
@@ -43,9 +43,7 @@ fn twin_files_second_ingest_is_unchanged() {
|
||||
let items = first.items.as_ref().expect("items must be present");
|
||||
let twin_items: Vec<_> = items
|
||||
.iter()
|
||||
.filter(|i| {
|
||||
i.doc_path.0.ends_with("__init__.py")
|
||||
})
|
||||
.filter(|i| i.doc_path.0.ends_with("__init__.py"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
twin_items.len(),
|
||||
@@ -63,8 +61,14 @@ fn twin_files_second_ingest_is_unchanged() {
|
||||
// Second ingest — same files, same content → both must be Unchanged.
|
||||
let second = ingest_with_config(env.config.clone(), env.scope(), false)
|
||||
.expect("second ingest must succeed");
|
||||
assert_eq!(second.errors, 0, "second ingest: no errors; report={second:?}");
|
||||
assert_eq!(second.new, 0, "second ingest: no new docs; report={second:?}");
|
||||
assert_eq!(
|
||||
second.errors, 0,
|
||||
"second ingest: no errors; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.new, 0,
|
||||
"second ingest: no new docs; report={second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
|
||||
|
||||
@@ -13,14 +13,21 @@ serde_json_canonicalizer = "0.3"
|
||||
blake3 = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
serde_yaml = { workspace = true }
|
||||
lindera = { workspace = true, features = ["embed-ko-dic"] }
|
||||
lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] }
|
||||
|
||||
[dev-dependencies]
|
||||
# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration
|
||||
# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as
|
||||
# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core
|
||||
# only); `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# kb-parse-md / kb-parse-code are dev-only — used by the snapshot integration
|
||||
# tests to build a CanonicalDocument from fixture files. kb-parse-md absorbed
|
||||
# kb-normalize in v0.19.0 (HOTFIXES.md 2026-05-26). Forbidden as regular deps
|
||||
# per design §8 (chunker consumes CanonicalDocument from kb-core only);
|
||||
# `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# confirms this.
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-code = { path = "../kebab-parse-code" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
375
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
375
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
@@ -0,0 +1,375 @@
|
||||
//! `code-c-ast-v1` — maps a tree-sitter-derived C AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-c-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeCAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-c-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/a.c".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-c-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("c".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("c".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_c_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeCAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-c-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "int parse() {\n\t// x\n}"),
|
||||
("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"),
|
||||
]);
|
||||
let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-c-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} = {i};\n"))
|
||||
.collect::<String>();
|
||||
let code = format!("int big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeCAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
|
||||
let base: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(
|
||||
CodeCAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
377
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
377
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
@@ -0,0 +1,377 @@
|
||||
//! `code-cpp-ast-v1` — maps a tree-sitter-derived C++ AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-cpp-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeCppAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCppAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-cpp-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/a.cpp".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_cpp_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeCppAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-cpp-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "int parse() {\n\t// x\n}"),
|
||||
("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"),
|
||||
]);
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} = {i};\n"))
|
||||
.collect::<String>();
|
||||
let code = format!("int big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeCppAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
|
||||
let base: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(
|
||||
CodeCppAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
383
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-go-ast-v1` — maps a tree-sitter-derived Go AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-go-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeGoAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeGoAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeGoAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-go-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/a.go".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-go-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("go".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("go".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_go_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeGoAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-go-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "func parse() {\n\t// x\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"func double() int {\n\t//\n\treturn 0\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-go-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tx{i} := {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("func big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "func parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeGoAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
|
||||
let base: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(
|
||||
CodeGoAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
378
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
378
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! `code-java-ast-v1` — maps a tree-sitter-derived Java AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-java-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeJavaAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeJavaAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-java-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/Main.java".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-java-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("java".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_java_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeJavaAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-java-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "void parse() {\n\t// x\n}"),
|
||||
("Foo.double", 5, 7, "int double() {\n\t//\n\treturn 0;\n}"),
|
||||
]);
|
||||
let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-java-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tint x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("void big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "void parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeJavaAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
|
||||
let base: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(
|
||||
CodeJavaAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeJsAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeJsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -140,6 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -183,9 +191,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +214,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("javascript".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_js_ast_v1() {
|
||||
assert_eq!(CodeJsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-js-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeJsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-js-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "function parse() {\n // x\n}"),
|
||||
("Foo.double", 5, 7, "function double() {\n //\n return 0;\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"function double() {\n //\n return 0;\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +290,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-js-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +305,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" const x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("function big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +345,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeJsAstV1Chunker"));
|
||||
@@ -304,11 +355,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
|
||||
let base: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeJsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeJsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeJsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +375,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeJsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeJsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
383
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-kotlin-ast-v1` — maps a tree-sitter-derived Kotlin AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-kotlin-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeKotlinAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeKotlinAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-kotlin-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/Main.kt".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-kotlin-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("kotlin".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("kotlin".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_kotlin_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeKotlinAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-kotlin-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "fun parse() {\n\t// x\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"fun double(): Int {\n\t//\n\treturn 0\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
for c in &chunks {
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.chunker_version.0, "code-kotlin-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500)
|
||||
.map(|i| format!("\tval x{i} = {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("fun big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_code_doc_errors() {
|
||||
use kebab_core::TextBlock;
|
||||
let mut doc = code_doc(&[("parse", 1, 1, "fun parse() {}")]);
|
||||
doc.blocks = vec![Block::Paragraph(TextBlock {
|
||||
common: CommonBlock {
|
||||
block_id: kebab_core::BlockId("b".into()),
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeKotlinAstV1Chunker"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
|
||||
let base: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(
|
||||
CodeKotlinAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodePythonAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -140,6 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -183,9 +191,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +214,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("python".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("python".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_python_ast_v1() {
|
||||
assert_eq!(CodePythonAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-python-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodePythonAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-python-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +285,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-python-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +300,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" x{i} = {i}")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" x{i} = {i}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("def big():\n{body}\n");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +340,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodePythonAstV1Chunker"));
|
||||
@@ -304,11 +350,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
|
||||
let base: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodePythonAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodePythonAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodePythonAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +370,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodePythonAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodePythonAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,11 +39,7 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeRustAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -140,6 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -183,9 +191,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,39 +214,60 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("rust".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("rust".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_rust_ast_v1() {
|
||||
assert_eq!(CodeRustAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-rust-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeRustAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-rust-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -256,7 +285,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-rust-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +300,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" let x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" let x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("pub fn big() {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +340,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeRustAstV1Chunker"));
|
||||
@@ -304,11 +350,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
|
||||
let base: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeRustAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeRustAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeRustAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +370,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeRustAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeRustAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,170 @@
|
||||
//! p10-3: Tier 3 paragraph + line-window fallback chunker.
|
||||
//!
|
||||
//! Splits code/text files on blank-line paragraph boundaries. Paragraphs
|
||||
//! with more than 80 lines are further split into 80-line windows with a
|
||||
//! 20-line overlap (stride 60) — the same oversize pattern used by Tier 1/2
|
||||
//! chunkers but without AST structure, hence no symbol.
|
||||
//!
|
||||
//! Per spec §9.3: all emitted chunks carry `symbol: None`.
|
||||
|
||||
use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "code-text-paragraph-v1";
|
||||
|
||||
/// Lines-per-window for the oversize fallback (Tier 3).
|
||||
const FALLBACK_LINES_PER_CHUNK: usize = 80;
|
||||
/// Overlap between consecutive windows.
|
||||
const FALLBACK_LINES_OVERLAP: usize = 20;
|
||||
// stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP = 60.
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeTextParagraphV1Chunker;
|
||||
|
||||
impl Chunker for CodeTextParagraphV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full source text.
|
||||
let (text, lang_str) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let mut chunks = Vec::new();
|
||||
for para in split_paragraphs(text) {
|
||||
push_paragraph(&mut chunks, doc, policy, ¶, lang_str)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"code-text-paragraph-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
/// A contiguous run of non-blank lines from the source text.
|
||||
struct Paragraph {
|
||||
/// Lines joined with `\n` (no trailing newline).
|
||||
text: String,
|
||||
/// 1-indexed line number of the first line in the source file.
|
||||
line_start: u32,
|
||||
/// 1-indexed line number of the last line in the source file.
|
||||
line_end: u32,
|
||||
}
|
||||
|
||||
/// Split `text` into `Paragraph`s separated by blank (all-whitespace) lines.
|
||||
///
|
||||
/// Blank lines are treated as boundaries and are NOT included in any
|
||||
/// paragraph's line range. Paragraphs that would consist entirely of blank
|
||||
/// lines are skipped.
|
||||
fn split_paragraphs(text: &str) -> Vec<Paragraph> {
|
||||
let mut paragraphs = Vec::new();
|
||||
let mut current: Vec<&str> = Vec::new();
|
||||
let mut current_start: Option<u32> = None;
|
||||
|
||||
for (idx, line) in text.lines().enumerate() {
|
||||
let line_no = (idx + 1) as u32;
|
||||
let is_blank = line.trim().is_empty();
|
||||
if is_blank {
|
||||
if let Some(start) = current_start.take() {
|
||||
let end = start + current.len() as u32 - 1;
|
||||
paragraphs.push(Paragraph {
|
||||
text: current.join("\n"),
|
||||
line_start: start,
|
||||
line_end: end,
|
||||
});
|
||||
current.clear();
|
||||
}
|
||||
} else {
|
||||
if current_start.is_none() {
|
||||
current_start = Some(line_no);
|
||||
}
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
// Flush any trailing paragraph not terminated by a blank line.
|
||||
if let Some(start) = current_start {
|
||||
let end = start + current.len() as u32 - 1;
|
||||
paragraphs.push(Paragraph {
|
||||
text: current.join("\n"),
|
||||
line_start: start,
|
||||
line_end: end,
|
||||
});
|
||||
}
|
||||
paragraphs
|
||||
}
|
||||
|
||||
/// Emit one or more chunks for a single paragraph.
|
||||
///
|
||||
/// Paragraphs with ≤ `FALLBACK_LINES_PER_CHUNK` lines become a single chunk.
|
||||
/// Larger paragraphs are split into overlapping windows of
|
||||
/// `FALLBACK_LINES_PER_CHUNK` lines with stride `FALLBACK_LINES_PER_CHUNK -
|
||||
/// FALLBACK_LINES_OVERLAP`. The last window may be shorter. Window starts
|
||||
/// are passed as `split_key` so `id_for_chunk` can produce distinct ids
|
||||
/// across windows.
|
||||
fn push_paragraph(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
para: &Paragraph,
|
||||
lang: &str,
|
||||
) -> Result<()> {
|
||||
let n_lines = (para.line_end - para.line_start + 1) as usize;
|
||||
|
||||
if n_lines <= FALLBACK_LINES_PER_CHUNK {
|
||||
// Use line_start as split_key so each paragraph gets a distinct
|
||||
// chunk_id even when block_ids is empty (no symbol, no AST structure).
|
||||
// Without this, all short paragraphs from the same doc share the same
|
||||
// base_policy_hash and therefore the same id_for_chunk result.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
¶.text,
|
||||
para.line_start,
|
||||
para.line_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(para.line_start),
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Oversize: line-window split with overlap.
|
||||
let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP;
|
||||
let lines: Vec<&str> = para.text.lines().collect();
|
||||
let mut i = 0usize;
|
||||
loop {
|
||||
let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len());
|
||||
let window_text = lines[i..end].join("\n");
|
||||
let window_start = para.line_start + i as u32;
|
||||
let window_end = para.line_start + (end as u32) - 1;
|
||||
// Use window_start as split_key so chunk_ids are unique across windows.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(window_start),
|
||||
));
|
||||
if end == lines.len() {
|
||||
break;
|
||||
}
|
||||
i += stride;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -39,17 +39,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeTsAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
_ => {
|
||||
anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
@@ -68,9 +64,12 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code { line_start, line_end, symbol, lang } => {
|
||||
(*line_start, *line_end, symbol.clone(), lang.clone())
|
||||
}
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
@@ -84,8 +83,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
None, span, cb.code.clone(),
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
@@ -93,9 +97,7 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol
|
||||
.as_ref()
|
||||
.map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
@@ -103,8 +105,13 @@ impl Chunker for CodeTsAstV1Chunker {
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc, &chunker_version, &block_ids, &base_policy_hash,
|
||||
Some(part_ls), span, text,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -140,6 +147,7 @@ fn make_chunk(
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -183,9 +191,9 @@ fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance,
|
||||
SourceType, TrustLevel, WorkspacePath,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -206,46 +214,72 @@ mod tests {
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span },
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("typescript".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(),
|
||||
lang: Lang("und".into()), blocks,
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![], tags: vec![],
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note, trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None, user: Default::default(),
|
||||
repo: Some("kebab".into()), git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv, schema_version: 1, doc_version: 1,
|
||||
last_chunker_version: None, last_embedding_version: None,
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy { target_tokens: 500, overlap_tokens: 80,
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()) }
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_ts_ast_v1() {
|
||||
assert_eq!(CodeTsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-ts-ast-v1".into()));
|
||||
assert_eq!(
|
||||
CodeTsAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-ts-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn one_chunk_per_unit_preserves_code_span() {
|
||||
let doc = code_doc(&[
|
||||
("parse", 1, 3, "function parse(): void {\n // x\n}"),
|
||||
("Foo.double", 5, 7, "function double(): number {\n //\n return 0;\n}"),
|
||||
(
|
||||
"Foo.double",
|
||||
5,
|
||||
7,
|
||||
"function double(): number {\n //\n return 0;\n}",
|
||||
),
|
||||
]);
|
||||
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert_eq!(chunks.len(), 2);
|
||||
@@ -256,7 +290,12 @@ mod tests {
|
||||
assert_eq!(c.chunker_version.0, "code-ts-ast-v1");
|
||||
}
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { symbol, line_start, line_end, .. } => {
|
||||
SourceSpan::Code {
|
||||
symbol,
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => {
|
||||
assert_eq!(symbol.as_deref(), Some("parse"));
|
||||
assert_eq!((*line_start, *line_end), (1, 3));
|
||||
}
|
||||
@@ -266,22 +305,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn oversize_unit_splits_into_parts_with_unique_ids() {
|
||||
let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::<Vec<_>>().join("\n");
|
||||
let body = (0..500)
|
||||
.map(|i| format!(" const x{i} = {i};"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
let code = format!("function big(): void {{\n{body}\n}}");
|
||||
let doc = code_doc(&[("big", 1, 502, &code)]);
|
||||
let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
|
||||
assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len());
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"oversize unit must split, got {}",
|
||||
chunks.len()
|
||||
);
|
||||
for c in &chunks {
|
||||
match &c.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}");
|
||||
assert!(
|
||||
symbol.as_deref().unwrap().starts_with("big [part "),
|
||||
"part-numbered symbol, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
let n = ids.len(); ids.sort(); ids.dedup();
|
||||
let n = ids.len();
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
|
||||
}
|
||||
|
||||
@@ -295,7 +345,8 @@ mod tests {
|
||||
heading_path: vec![],
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
},
|
||||
text: "x".into(), inlines: vec![],
|
||||
text: "x".into(),
|
||||
inlines: vec![],
|
||||
})];
|
||||
let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
|
||||
assert!(err.to_string().contains("CodeTsAstV1Chunker"));
|
||||
@@ -304,11 +355,19 @@ mod tests {
|
||||
#[test]
|
||||
fn deterministic_chunk_ids_1000() {
|
||||
let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
|
||||
let base: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let base: Vec<String> = CodeTsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..1000 {
|
||||
let again: Vec<String> = CodeTsAstV1Chunker.chunk(&doc, &policy())
|
||||
.unwrap().into_iter().map(|c| c.chunk_id.0).collect();
|
||||
let again: Vec<String> = CodeTsAstV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, base);
|
||||
}
|
||||
}
|
||||
@@ -316,7 +375,9 @@ mod tests {
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1() {
|
||||
let p = policy();
|
||||
assert_eq!(CodeTsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p));
|
||||
assert_eq!(
|
||||
CodeTsAstV1Chunker.policy_hash(&p),
|
||||
crate::MdHeadingV1Chunker.policy_hash(&p)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
//! p10-2: dockerfile whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire Dockerfile content and emits a single Chunk with symbol
|
||||
//! "<dockerfile>", code_lang "dockerfile", line range 1..EOF.
|
||||
//! Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "dockerfile-file-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct DockerfileFileV1Chunker;
|
||||
|
||||
impl Chunker for DockerfileFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full Dockerfile text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<dockerfile>",
|
||||
"dockerfile",
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"dockerfile-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
@@ -0,0 +1,162 @@
|
||||
//! p10-2: k8s manifest resource-aware chunker.
|
||||
//!
|
||||
//! Splits a multi-document YAML file on `^---\s*$` boundaries, recognises
|
||||
//! documents that have both `apiVersion` and `kind` string fields as k8s
|
||||
//! resources, and emits one `Chunk` per resource (with oversize >200-line
|
||||
//! fallback). Non-k8s documents are skipped; invalid YAML yields 0 chunks
|
||||
//! for the entire file.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct K8sManifestResourceV1Chunker;
|
||||
|
||||
impl Chunker for K8sManifestResourceV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full YAML text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let slices = split_yaml_documents(text);
|
||||
let mut chunks: Vec<Chunk> = Vec::new();
|
||||
|
||||
for slice in slices {
|
||||
// Invalid YAML in any document → return 0 chunks for the file.
|
||||
let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) {
|
||||
Ok(v) => v,
|
||||
Err(_) => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let Some(mapping) = value.as_mapping() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let api = mapping
|
||||
.get("apiVersion")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||
|
||||
// Skip non-k8s documents.
|
||||
if api.is_empty() || kind.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
|
||||
let name = metadata
|
||||
.and_then(|m| m.get("name"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("<unnamed>");
|
||||
let namespace = metadata
|
||||
.and_then(|m| m.get("namespace"))
|
||||
.and_then(|v| v.as_str());
|
||||
|
||||
let symbol = match namespace {
|
||||
Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"),
|
||||
_ => format!("{kind}/{name}"),
|
||||
};
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
slice.text,
|
||||
slice.line_start,
|
||||
slice.line_end,
|
||||
&symbol,
|
||||
"yaml",
|
||||
VERSION_LABEL,
|
||||
Some(slice.line_start),
|
||||
)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"k8s-manifest-resource-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
struct YamlSlice<'a> {
|
||||
text: &'a str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
}
|
||||
|
||||
/// Split raw YAML text into per-document slices on `---` separator lines.
|
||||
/// Line numbers are 1-indexed.
|
||||
fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
|
||||
// Collect indices of separator lines (0-based), then append a sentinel at
|
||||
// the end so the last slice is always terminated.
|
||||
let mut separators: Vec<usize> = lines
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, l)| {
|
||||
let trimmed = l.trim_end();
|
||||
if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") {
|
||||
Some(i)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
separators.push(lines.len());
|
||||
|
||||
let mut slices: Vec<YamlSlice<'_>> = Vec::new();
|
||||
let mut doc_start_line: usize = 0; // 0-based index of current doc start
|
||||
|
||||
for sep_line in separators {
|
||||
if sep_line > doc_start_line {
|
||||
let start_byte = byte_offset_of_line(text, doc_start_line);
|
||||
let end_byte = byte_offset_of_line(text, sep_line);
|
||||
let slice_text = &text[start_byte..end_byte];
|
||||
if !slice_text.trim().is_empty() {
|
||||
slices.push(YamlSlice {
|
||||
text: slice_text,
|
||||
line_start: (doc_start_line + 1) as u32,
|
||||
line_end: sep_line as u32,
|
||||
});
|
||||
}
|
||||
}
|
||||
doc_start_line = sep_line + 1;
|
||||
}
|
||||
|
||||
slices
|
||||
}
|
||||
|
||||
/// Return the byte offset of the start of `line_idx` (0-based line index).
|
||||
fn byte_offset_of_line(text: &str, line_idx: usize) -> usize {
|
||||
if line_idx == 0 {
|
||||
return 0;
|
||||
}
|
||||
let mut count = 0usize;
|
||||
for (i, c) in text.char_indices() {
|
||||
if c == '\n' {
|
||||
count += 1;
|
||||
if count == line_idx {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
text.len()
|
||||
}
|
||||
@@ -15,16 +15,107 @@
|
||||
//! embedder, the retriever, the LLM, the RAG layer, or the UI layers.
|
||||
//! It consumes `CanonicalDocument` purely through `kb-core` types.
|
||||
|
||||
mod code_c_ast_v1;
|
||||
mod code_cpp_ast_v1;
|
||||
mod code_go_ast_v1;
|
||||
mod code_java_ast_v1;
|
||||
mod code_js_ast_v1;
|
||||
mod code_kotlin_ast_v1;
|
||||
mod code_python_ast_v1;
|
||||
mod code_rust_ast_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
mod code_ts_ast_v1;
|
||||
pub mod dockerfile_file_v1;
|
||||
pub mod k8s_manifest_resource_v1;
|
||||
pub mod manifest_file_v1;
|
||||
mod md_heading_v1;
|
||||
mod pdf_page_v1;
|
||||
mod tier2_shared;
|
||||
|
||||
pub use code_c_ast_v1::CodeCAstV1Chunker;
|
||||
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
|
||||
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
||||
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
||||
pub use code_js_ast_v1::CodeJsAstV1Chunker;
|
||||
pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
|
||||
pub use code_python_ast_v1::CodePythonAstV1Chunker;
|
||||
pub use code_rust_ast_v1::CodeRustAstV1Chunker;
|
||||
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
|
||||
pub use code_ts_ast_v1::CodeTsAstV1Chunker;
|
||||
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||
pub use manifest_file_v1::ManifestFileV1Chunker;
|
||||
pub use md_heading_v1::MdHeadingV1Chunker;
|
||||
pub use pdf_page_v1::PdfPageV1Chunker;
|
||||
|
||||
// ── Korean morphological tokenizer ───────────────────────────────────────────
|
||||
|
||||
use lindera::dictionary::{DictionaryKind, load_embedded_dictionary};
|
||||
use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
use lindera::tokenizer::Tokenizer;
|
||||
|
||||
static KOREAN_TOKENIZER: std::sync::OnceLock<Option<Tokenizer>> = std::sync::OnceLock::new();
|
||||
|
||||
/// 한 codepoint 가 한글 음절 또는 자모인지 판정 — N-gram supplement 의 emit 대상 필터링.
|
||||
fn is_hangul(c: char) -> bool {
|
||||
matches!(
|
||||
c,
|
||||
'\u{AC00}'..='\u{D7A3}' // 한글 음절 (precomposed)
|
||||
| '\u{1100}'..='\u{11FF}' // 한글 자모
|
||||
| '\u{3130}'..='\u{318F}' // 한글 호환 자모
|
||||
)
|
||||
}
|
||||
|
||||
/// 한국어 chunk text 를 lindera ko-dic 으로 형태소 분해해 공백 join 한 결과를 반환.
|
||||
/// chunker 들이 `Chunk.tokenized_korean_text` pre-fill 에 사용.
|
||||
/// 분석 실패 시 None — 호출자는 NULL fallback 처리.
|
||||
/// Tokenizer 는 OnceLock 으로 1회 초기화; dict load 실패 시 영구 None.
|
||||
///
|
||||
/// v0.21.0 — N-gram supplement (Option β, post-v0.20.1 enhancement).
|
||||
/// ko-dic 가 compound noun (`한국정부`, `서울특별시` 등) 을 단일 token 으로
|
||||
/// 저장하는 정책 의 한계 해소 — morpheme 길이 ≥ 3 인 한글 token 에 대해
|
||||
/// 2-char sliding window n-gram 도 추가 emit. `'한국정부'` morpheme →
|
||||
/// `[한국정부, 한국, 국정, 정부]` 의 4 token 으로 expand. 사용자 의 2-char
|
||||
/// query (`'한국'`) 가 compound chunk 에서도 hit. 영어/숫자 token 은 영향
|
||||
/// 없음 (is_hangul filter). DB size + ingest latency 의 trade-off 는
|
||||
/// HOTFIXES 2026-05-28 의 "N-gram supplement (Option β)" 보강 entry.
|
||||
pub fn tokenize_korean_morphological(text: &str) -> Option<String> {
|
||||
if text.trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
let tokenizer = KOREAN_TOKENIZER.get_or_init(|| {
|
||||
let dict = match load_embedded_dictionary(DictionaryKind::KoDic) {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: dict load failed: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let segmenter = Segmenter::new(Mode::Normal, dict, None);
|
||||
Some(Tokenizer::new(segmenter))
|
||||
});
|
||||
let tokenizer = tokenizer.as_ref()?;
|
||||
let tokens = tokenizer.tokenize(text).ok()?;
|
||||
|
||||
let mut out_tokens: Vec<String> = Vec::with_capacity(tokens.len() * 2);
|
||||
for tok in tokens.iter() {
|
||||
let surface = tok.surface.as_ref();
|
||||
out_tokens.push(surface.to_string());
|
||||
|
||||
// N-gram supplement: 한글 morpheme 의 2-char sliding window.
|
||||
let chars: Vec<char> = surface.chars().collect();
|
||||
if chars.len() >= 3 && chars.iter().all(|c| is_hangul(*c)) {
|
||||
for window in chars.windows(2) {
|
||||
out_tokens.push(window.iter().collect());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let joined = out_tokens.join(" ");
|
||||
if joined.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(joined)
|
||||
}
|
||||
}
|
||||
|
||||
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
//! p10-2: manifest whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire manifest file (Cargo.toml / package.json / pom.xml / go.mod /
|
||||
//! build.gradle / pyproject.toml / tsconfig.json) and emits a single Chunk
|
||||
//! with symbol "<manifest>", code_lang read from Block::Code.lang, line range
|
||||
//! 1..EOF. Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
pub const VERSION_LABEL: &str = "manifest-file-v1";
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct ManifestFileV1Chunker;
|
||||
|
||||
impl Chunker for ManifestFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full manifest text.
|
||||
let (text, lang) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<manifest>",
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"manifest-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
//! `md-heading-v1` — heading-aware Markdown chunker.
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
|
||||
ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
|
||||
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
let policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
// `collect_overlap_seed` keeps seed ≤ target/2, so
|
||||
// a flush here never produces a chunk smaller than
|
||||
// the seed budget.
|
||||
let would_exceed = acc.text_tokens + next_tokens
|
||||
> policy.target_tokens
|
||||
let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
|
||||
&& acc.has_non_heading_content();
|
||||
if would_exceed {
|
||||
let overlap_seed = collect_overlap_seed(
|
||||
&acc,
|
||||
policy.overlap_tokens,
|
||||
policy.target_tokens,
|
||||
);
|
||||
flush(
|
||||
&mut acc,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&policy_hash,
|
||||
&mut out,
|
||||
);
|
||||
let overlap_seed =
|
||||
collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
|
||||
flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
|
||||
// Seed next accumulator with the prior chunk's
|
||||
// tail blocks (paragraph-level overlap). The
|
||||
// heading is *not* re-included here — it lives
|
||||
@@ -292,10 +278,11 @@ fn build_chunk(
|
||||
) -> Chunk {
|
||||
debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
|
||||
|
||||
let block_ids: Vec<BlockId> =
|
||||
blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> =
|
||||
blocks.iter().map(|b| common(b).source_span.clone()).collect();
|
||||
let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> = blocks
|
||||
.iter()
|
||||
.map(|b| common(b).source_span.clone())
|
||||
.collect();
|
||||
|
||||
// heading_path: pick the first non-Heading block's heading_path
|
||||
// (which already includes every parent heading per kb-normalize).
|
||||
@@ -339,17 +326,13 @@ fn build_chunk(
|
||||
text.len().div_ceil(BYTES_PER_TOKEN)
|
||||
};
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&doc.doc_id,
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
policy_hash,
|
||||
);
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path,
|
||||
source_spans,
|
||||
@@ -387,9 +370,7 @@ fn render_block_text(b: &Block) -> String {
|
||||
// alt keeps lexical search hits on filenames working even when
|
||||
// P6-1's filename auto-fill is bypassed.
|
||||
Block::ImageRef(i) => {
|
||||
let alt = if !i.alt.is_empty() {
|
||||
i.alt.clone()
|
||||
} else {
|
||||
let alt = if i.alt.is_empty() {
|
||||
// P6-1 falls back to filename so this branch is
|
||||
// defensive — keep it lest a future test fixture or
|
||||
// synthetic block path skip the auto-fill.
|
||||
@@ -399,17 +380,11 @@ fn render_block_text(b: &Block) -> String {
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or("[image]")
|
||||
.to_string()
|
||||
} else {
|
||||
i.alt.clone()
|
||||
};
|
||||
let ocr = i
|
||||
.ocr
|
||||
.as_ref()
|
||||
.map(|o| o.joined.as_str())
|
||||
.unwrap_or("");
|
||||
let cap = i
|
||||
.caption
|
||||
.as_ref()
|
||||
.map(|c| c.text.as_str())
|
||||
.unwrap_or("");
|
||||
let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
|
||||
let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
|
||||
[alt.as_str(), ocr, cap]
|
||||
.iter()
|
||||
.filter(|s| !s.is_empty())
|
||||
@@ -449,9 +424,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
|
||||
Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
|
||||
WorkspacePath, id_for_block,
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
|
||||
SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -494,12 +468,7 @@ mod tests {
|
||||
SourceSpan::Line { start, end }
|
||||
}
|
||||
|
||||
fn common_for(
|
||||
kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> CommonBlock {
|
||||
fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
|
||||
CommonBlock {
|
||||
block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
|
||||
heading_path: heading_path.to_vec(),
|
||||
@@ -534,12 +503,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Paragraph(TextBlock {
|
||||
common: common_for("paragraph", &hp, ordinal, span(line, line)),
|
||||
@@ -548,12 +512,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn code_block(
|
||||
code: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> Block {
|
||||
fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Code(CodeBlock {
|
||||
common: common_for("code", &hp, ordinal, s),
|
||||
@@ -580,12 +539,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn image_ref(
|
||||
alt: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::ImageRef(ImageRefBlock {
|
||||
common: common_for("imageref", &hp, ordinal, span(line, line)),
|
||||
|
||||
@@ -53,18 +53,21 @@
|
||||
//! one chunk per atomic block. PdfPageV1 cannot.
|
||||
//!
|
||||
//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
|
||||
//! variant `format!("{base_policy_hash}#c{char_start}")` into the
|
||||
//! recipe's `policy_hash` slot (so distinct chunks distinguish via
|
||||
//! different policy_hash inputs), while storing the unmodified
|
||||
//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
|
||||
//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
|
||||
//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
|
||||
//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
|
||||
//! segment boundary, strictly increasing across the returned chunks
|
||||
//! even when the overlap walk collapses `actual_start` to a previous
|
||||
//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in
|
||||
//! `Chunk.policy_hash` so the field still answers "what policy was
|
||||
//! active". v1.1 second-iteration patch — logged in
|
||||
//! `tasks/HOTFIXES.md` (2026-05-27).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "pdf-page-v1";
|
||||
const VERSION_LABEL: &str = "pdf-page-v1.1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
// Validate up front — every block must be a Paragraph carrying
|
||||
// SourceSpan::Page. A mixed document signals a routing bug in
|
||||
// the caller (e.g. running this chunker on Markdown) and is
|
||||
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
|
||||
),
|
||||
};
|
||||
if !matches!(common.source_span, SourceSpan::Page { .. }) {
|
||||
anyhow::bail!(
|
||||
"PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
|
||||
);
|
||||
anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let target_bytes = policy
|
||||
.target_tokens
|
||||
.saturating_mul(BYTES_PER_TOKEN)
|
||||
.max(1);
|
||||
let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
|
||||
// Clamp the overlap to half the target. Without this, a policy
|
||||
// with `overlap_tokens >= target_tokens` would make every chunk
|
||||
// fully re-emit the previous chunk's text — mirrors
|
||||
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (char_start, char_end, slice) in
|
||||
for (segment_start, char_start, char_end, slice) in
|
||||
chunk_page(&p.text, target_bytes, overlap_bytes)
|
||||
{
|
||||
// PDF chars-per-page comfortably fits in u32 (a single
|
||||
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
|
||||
// typography); silent `as u32` truncation would only
|
||||
// surface on corrupted input, where an explicit panic
|
||||
// is preferable to an off-by-2^32 span.
|
||||
let char_start_u32 = u32::try_from(char_start)
|
||||
.expect("page chars fit in u32");
|
||||
let char_end_u32 =
|
||||
u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
|
||||
let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let span = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(char_start_u32),
|
||||
char_end: Some(char_end_u32),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
|
||||
// Per-chunk policy_hash variant prevents chunk_id
|
||||
// collision when a page produces multiple chunks. See
|
||||
// module docs for rationale.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
|
||||
// v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
|
||||
// variant uses `segment_start` (pre-overlap boundary,
|
||||
// strictly increasing) instead of `char_start` (post-
|
||||
// overlap, may collapse to prev_min). See module docs +
|
||||
// spec §4.1 root cause + HOTFIXES.md 2026-05-27.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
|
||||
let chunk_id =
|
||||
id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
|
||||
let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
|
||||
@@ -176,6 +170,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&slice),
|
||||
text: slice,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -198,18 +193,28 @@ impl Chunker for PdfPageV1Chunker {
|
||||
}
|
||||
|
||||
/// Split a single page's text into ordered chunks, each represented as
|
||||
/// `(char_start, char_end, text_slice)`. Char positions are within the
|
||||
/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
|
||||
/// `(segment_start, actual_start, chunk_end, text_slice)`.
|
||||
///
|
||||
/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
|
||||
/// across the returned vec. Use this for chunk_id uniqueness suffixes.
|
||||
/// - `actual_start` = post-overlap start char index. May collapse to a
|
||||
/// previous chunk's `actual_start` under aggressive overlap policy.
|
||||
/// Use this for `SourceSpan::Page::char_start`.
|
||||
/// - `chunk_end` = chunk's end char index (exclusive).
|
||||
///
|
||||
/// Returns an empty vector when `text` is empty or whitespace-only.
|
||||
fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
|
||||
fn chunk_page(
|
||||
text: &str,
|
||||
target_bytes: usize,
|
||||
overlap_bytes: usize,
|
||||
) -> Vec<(usize, usize, usize, String)> {
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
let n = chars.len();
|
||||
if n == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
if text.len() <= target_bytes {
|
||||
return vec![(0, n, text.to_string())];
|
||||
return vec![(0, 0, n, text.to_string())];
|
||||
}
|
||||
|
||||
// Build candidate boundary positions (char indices where a chunk
|
||||
@@ -222,8 +227,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
let c = chars[k];
|
||||
let nx = chars[k + 1];
|
||||
let is_paragraph_break = c == '\n' && nx == '\n';
|
||||
let is_sentence_end =
|
||||
matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
|
||||
bounds.push(k + 2);
|
||||
}
|
||||
@@ -235,11 +239,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
bounds.dedup();
|
||||
|
||||
// UTF-8 byte length of the slice between two char indices.
|
||||
let byte_len = |a: usize, b: usize| -> usize {
|
||||
chars[a..b].iter().map(|c| c.len_utf8()).sum()
|
||||
};
|
||||
let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };
|
||||
|
||||
let mut chunks: Vec<(usize, usize, String)> = Vec::new();
|
||||
let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
|
||||
let mut seg_idx: usize = 0;
|
||||
while seg_idx + 1 < bounds.len() {
|
||||
let start = bounds[seg_idx];
|
||||
@@ -264,7 +266,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
// have absorbed up to `overlap_bytes` of bytes, but never past
|
||||
// the previous chunk's start (no full re-emission).
|
||||
let actual_start = if let Some(prev) = chunks.last() {
|
||||
let prev_min = prev.0;
|
||||
// prev tuple shape = (segment_start, actual_start, chunk_end, slice).
|
||||
// overlap walk floor = previous chunk's actual_start (prev.1).
|
||||
let prev_min = prev.1;
|
||||
let mut a = start;
|
||||
let mut acc_o: usize = 0;
|
||||
while a > prev_min {
|
||||
@@ -281,7 +285,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
};
|
||||
|
||||
let slice: String = chars[actual_start..chunk_end].iter().collect();
|
||||
chunks.push((actual_start, chunk_end, slice));
|
||||
chunks.push((start, actual_start, chunk_end, slice));
|
||||
seg_idx = end_idx;
|
||||
}
|
||||
|
||||
@@ -390,7 +394,11 @@ mod tests {
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
match c.source_spans[0] {
|
||||
SourceSpan::Page { page, char_start, char_end } => {
|
||||
SourceSpan::Page {
|
||||
page,
|
||||
char_start,
|
||||
char_end,
|
||||
} => {
|
||||
assert_eq!(page, (i as u32) + 1);
|
||||
assert_eq!(char_start, Some(0));
|
||||
assert!(char_end.unwrap() > 0);
|
||||
@@ -435,11 +443,16 @@ mod tests {
|
||||
// N-1's char_end).
|
||||
for w in chunks.windows(2) {
|
||||
let prev_end = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_end: Some(e), .. } => e,
|
||||
SourceSpan::Page {
|
||||
char_end: Some(e), ..
|
||||
} => e,
|
||||
_ => panic!("missing char_end"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -450,7 +463,7 @@ mod tests {
|
||||
// chunk_ids stay distinct despite identical block_ids — the
|
||||
// per-chunk policy_hash variant is doing its job.
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "all chunk_ids must be unique");
|
||||
@@ -653,11 +666,17 @@ mod tests {
|
||||
// overlap) is the failure mode.
|
||||
for w in chunks.windows(2) {
|
||||
let prev_start = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -668,12 +687,49 @@ mod tests {
|
||||
// chunk_ids stay distinct (the per-chunk hash variant keys off
|
||||
// char_start which is now strictly increasing).
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "chunk_ids must remain unique");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
|
||||
// 한국어 OCR text 의 trigger shape: 10 char "가" + ". " + 500 char "나".
|
||||
// → first segment [0, 12), second segment [12, n).
|
||||
// page_text byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500
|
||||
// → multi-chunk. overlap_bytes = min(240, 750) = 240 chars=80
|
||||
// → second chunk 의 actual_start 가 prev_min=0 collapse → same `#c0`.
|
||||
//
|
||||
// default_policy(500, 80) — target_tokens=500 → target_bytes=500*3=1500
|
||||
// (한국어 3byte/char 환산), overlap_tokens=80 → overlap_bytes=min(240, 750)=240.
|
||||
// verifier round 1 L-3 보강.
|
||||
let early_seg = "가".repeat(10);
|
||||
let tail = "나".repeat(500);
|
||||
let page_text = format!("{early_seg}. {tail}");
|
||||
|
||||
let doc = make_pdf_doc(&[&page_text]);
|
||||
let policy = default_policy(500, 80); // target=1500 byte, overlap=240 byte
|
||||
let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();
|
||||
|
||||
assert!(
|
||||
chunks.len() >= 2,
|
||||
"expected ≥2 chunks for {} byte page; got {}",
|
||||
page_text.len(),
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(
|
||||
ids.len(),
|
||||
total,
|
||||
"all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1_for_identical_policy() {
|
||||
// Cross-chunker policy fingerprint identity — important so a
|
||||
|
||||
200
crates/kebab-chunk/src/tier2_shared.rs
Normal file
200
crates/kebab-chunk/src/tier2_shared.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build).
|
||||
//!
|
||||
//! Mirrors `code_rust_ast_v1`'s Chunk-construction pattern exactly so that
|
||||
//! id / hashes / token-count / ChunkPolicy semantics stay identical across
|
||||
//! Tier 1 (AST) and Tier 2 (resource-aware) chunkers.
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{
|
||||
BlockId, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, DocumentId, SourceSpan,
|
||||
id_for_chunk,
|
||||
};
|
||||
|
||||
pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
/// Compute the policy hash the same way `code_rust_ast_v1` does.
|
||||
pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting
|
||||
/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize.
|
||||
/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl.
|
||||
///
|
||||
/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk
|
||||
/// case. Callers that emit multiple chunks from the same document (e.g.
|
||||
/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass
|
||||
/// `Some(line_start)` so that each call produces a distinct `chunk_id`.
|
||||
/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to
|
||||
/// keep chunk_ids stable (no sibling can collide when there's only one chunk).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn push_chunks_with_oversize(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
base_split_key: Option<u32>,
|
||||
) -> Result<()> {
|
||||
let n_lines = (line_end - line_start + 1).max(1);
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
|
||||
if n_lines <= AST_CHUNK_MAX_LINES {
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
text,
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
base_split_key,
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
let total = lines.len();
|
||||
let mut window_start = line_start;
|
||||
let mut i = 0usize;
|
||||
while i < total {
|
||||
let take = (AST_CHUNK_MAX_LINES as usize).min(total - i);
|
||||
let window_text = lines[i..i + take].join("\n");
|
||||
let window_end = window_start + take as u32 - 1;
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
symbol,
|
||||
lang,
|
||||
Some(window_start),
|
||||
));
|
||||
i += take;
|
||||
window_start = window_end + 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a single `Chunk`, mirroring `make_chunk` in `code_rust_ast_v1.rs`
|
||||
/// exactly (same id recipe, same token estimate, same field set).
|
||||
///
|
||||
/// `split_key` is `Some(line_start_of_window)` for oversize splits, `None`
|
||||
/// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None`
|
||||
/// split_key pattern in 1A-2.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: Some(symbol.to_string()),
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(
|
||||
doc,
|
||||
chunker_version,
|
||||
base_policy_hash,
|
||||
text,
|
||||
span,
|
||||
split_key,
|
||||
)
|
||||
}
|
||||
|
||||
/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).
|
||||
///
|
||||
/// Accepts `policy: &ChunkPolicy` and `chunker_version: &str` (string slice)
|
||||
/// so callers don't need to pre-compute the hash and version wrapper.
|
||||
/// `split_key` is `Some(window_start)` for oversize line-window splits.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk_no_symbol(
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: None,
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(doc, &cv, &base_policy_hash, text, span, split_key)
|
||||
}
|
||||
|
||||
/// Core chunk-building logic shared by `build_chunk` and `build_chunk_no_symbol`.
|
||||
///
|
||||
/// Takes a pre-built `SourceSpan` so the only difference between the two
|
||||
/// public helpers is whether `symbol` is `Some` or `None`. All id/hash/
|
||||
/// token mechanics are identical.
|
||||
fn build_chunk_from_span(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
span: SourceSpan,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
// id_hash mirrors code_rust_ast_v1's make_chunk logic:
|
||||
// split_key Some(k) => "{base_policy_hash}#L{k}"
|
||||
// split_key None => base_policy_hash
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
|
||||
// block_ids: Tier 2/3 chunkers have no per-block structure (the whole file
|
||||
// is one Block::Code), so we pass an empty slice — same as using the doc-
|
||||
// level slice without explicit block granularity.
|
||||
let block_ids: Vec<BlockId> = vec![];
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&DocumentId(doc.doc_id.0.clone()),
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
&id_hash,
|
||||
);
|
||||
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(text),
|
||||
text: text.to_string(),
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_go_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.c".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-c-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units:
|
||||
// 0. imports + defines (lines 1–4, ≤200)
|
||||
// 1. status_t enum typedef (lines 6–9, ≤200)
|
||||
// 2. record_t struct typedef (lines 11–16, ≤200)
|
||||
// 3. static counter decl glue (line 18, ≤200)
|
||||
// 4. parse_record fn (lines 20–23, ≤200)
|
||||
// 5. print_record fn (lines 25–27, ≤200)
|
||||
// 6. main fn (lines 29–33, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
18,
|
||||
"#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;".to_string(),
|
||||
),
|
||||
(
|
||||
"parse_record",
|
||||
20,
|
||||
23,
|
||||
"int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"print_record",
|
||||
25,
|
||||
27,
|
||||
"void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
29,
|
||||
33,
|
||||
"int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("c".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("c".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.c".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-c-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_c_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeCAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.c.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-c-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_c_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeCAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
@@ -0,0 +1,354 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C++ code `CanonicalDocument`.
|
||||
//!
|
||||
//! Two complementary tests:
|
||||
//! 1. `code_cpp_ast_chunks_snapshot` — hand-built `fixed_doc()` validates the
|
||||
//! chunker's 1:1 mapping (design §6.3 / §8 boundary: no parse-code dep needed).
|
||||
//! 2. `code_cpp_ast_extractor_snapshot` — invokes `CppAstExtractor` against the
|
||||
//! real `tests/fixtures/sample.cpp` fixture, validating the extractor → chunker
|
||||
//! end-to-end pipeline. `kebab-parse-code` is a dev-dep (same pattern as
|
||||
//! `kebab-parse-md` in Markdown snapshot tests).
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCppAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use kebab_parse_code::CppAstExtractor;
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.cpp".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units (C++ specific):
|
||||
// 0. includes + namespace opening (lines 1–4, ≤200)
|
||||
// 1. class definition (lines 6–20, ≤200)
|
||||
// 2. template function (lines 22–25, ≤200)
|
||||
// 3. namespace closing + free fn (lines 27–29, ≤200)
|
||||
// 4. main fn (lines 31–34, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
4,
|
||||
"#include <string>\n#include <vector>\n\nnamespace kebab {".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::chunk::MdHeadingV1Chunker",
|
||||
6,
|
||||
20,
|
||||
"class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::identity",
|
||||
22,
|
||||
25,
|
||||
"template <typename T>\nT identity(T value) {\n return value;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::global_helper",
|
||||
27,
|
||||
29,
|
||||
"void global_helper() {\n // free function in kebab namespace\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
31,
|
||||
34,
|
||||
"int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.cpp".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-cpp-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper: run the real CppAstExtractor against tests/fixtures/sample.cpp
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn extract_cpp_fixture() -> CanonicalDocument {
|
||||
use kebab_core::{
|
||||
AssetId, AssetStorage, Checksum, ExtractConfig, ExtractContext, Extractor, RawAsset,
|
||||
SourceUri, WorkspacePath,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
|
||||
let bytes = std::fs::read(fixtures_dir().join("sample.cpp")).expect("read sample.cpp fixture");
|
||||
let src = String::from_utf8(bytes).expect("fixture is valid UTF-8");
|
||||
let wp = WorkspacePath("tests/fixtures/sample.cpp".to_string());
|
||||
let asset = RawAsset {
|
||||
asset_id: AssetId("e".repeat(64)),
|
||||
source_uri: SourceUri::File(PathBuf::from("tests/fixtures/sample.cpp")),
|
||||
workspace_path: wp,
|
||||
media_type: kebab_core::MediaType::Code("cpp".to_string()),
|
||||
byte_len: src.len() as u64,
|
||||
checksum: Checksum("f".repeat(64)),
|
||||
discovered_at: time::OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("tests/fixtures/sample.cpp"),
|
||||
sha: Checksum("f".repeat(64)),
|
||||
},
|
||||
};
|
||||
let cfg = ExtractConfig::default();
|
||||
let root = PathBuf::from("/tmp");
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root: &root,
|
||||
config: &cfg,
|
||||
};
|
||||
CppAstExtractor::new()
|
||||
.extract(&ctx, src.as_bytes())
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1 (hand-built): chunker-only 1:1 mapping validation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn code_cpp_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-cpp-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_cpp_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeCppAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2 (real extractor): end-to-end extractor → chunker pipeline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Validates that the real `CppAstExtractor` processes `sample.cpp` and
|
||||
/// emits the expected set of symbols through the full chunker pipeline.
|
||||
///
|
||||
/// `sample.cpp` contains:
|
||||
/// - `#include` directives + nested namespace `kebab::chunk` → glue + struct unit
|
||||
/// - `class MdHeadingV1Chunker` with methods (ctor, dtor, chunk_doc, operator())
|
||||
/// - `template <typename T> T identity(T value)` (template fn)
|
||||
/// - `void kebab::global_helper()` (free fn in namespace)
|
||||
/// - `int main()` (global free fn)
|
||||
#[test]
|
||||
fn code_cpp_ast_extractor_snapshot() {
|
||||
let doc = extract_cpp_fixture();
|
||||
|
||||
// Verify the extractor emits all expected named units.
|
||||
let block_syms: Vec<Option<String>> = doc
|
||||
.blocks
|
||||
.iter()
|
||||
.filter_map(|b| match b {
|
||||
Block::Code(c) => match &c.common.source_span {
|
||||
SourceSpan::Code { symbol, .. } => Some(symbol.clone()),
|
||||
_ => None,
|
||||
},
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Must include namespace-qualified class and its methods
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker")),
|
||||
"class unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker")),
|
||||
"ctor unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker")),
|
||||
"dtor unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::chunk_doc")),
|
||||
"chunk_doc unit missing: {block_syms:?}"
|
||||
);
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::MdHeadingV1Chunker::operator()")),
|
||||
"operator() unit missing: {block_syms:?}"
|
||||
);
|
||||
// Template function (inside kebab::chunk namespace in the fixture)
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::chunk::identity")),
|
||||
"identity template fn unit missing: {block_syms:?}"
|
||||
);
|
||||
// Free function in outer namespace
|
||||
assert!(
|
||||
block_syms
|
||||
.iter()
|
||||
.any(|s| s.as_deref() == Some("kebab::global_helper")),
|
||||
"global_helper unit missing: {block_syms:?}"
|
||||
);
|
||||
// Global main
|
||||
assert!(
|
||||
block_syms.iter().any(|s| s.as_deref() == Some("main")),
|
||||
"main unit missing: {block_syms:?}"
|
||||
);
|
||||
}
|
||||
|
||||
/// End-to-end chunker output from real extractor is deterministic.
|
||||
#[test]
|
||||
fn code_cpp_ast_extractor_chunks_deterministic() {
|
||||
let doc1 = extract_cpp_fixture();
|
||||
let doc2 = extract_cpp_fixture();
|
||||
assert_eq!(
|
||||
doc1.blocks, doc2.blocks,
|
||||
"extractor output non-deterministic"
|
||||
);
|
||||
|
||||
let policy = fixed_policy();
|
||||
let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap();
|
||||
let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap();
|
||||
assert_eq!(
|
||||
chunks1
|
||||
.iter()
|
||||
.map(|c| c.chunk_id.0.clone())
|
||||
.collect::<Vec<_>>(),
|
||||
chunks2
|
||||
.iter()
|
||||
.map(|c| c.chunk_id.0.clone())
|
||||
.collect::<Vec<_>>(),
|
||||
"chunker output non-deterministic"
|
||||
);
|
||||
}
|
||||
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Go code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeGoAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("kebab_eval/metrics.go".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-go-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "func BigCompute(data []int) int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!("\tv{i} := 0\n\tif {i} < len(data) {{\n\t\tv{i} = data[{i}]\n\t}}\n"))
|
||||
.collect();
|
||||
let footer = "\treturn len(data)\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `ComputeMRR` (lines 7–12, ≤200)
|
||||
// 2. struct `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. struct `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `Run` (lines 32–38, ≤200)
|
||||
// 5. method `Report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"ComputeMRR",
|
||||
7,
|
||||
12,
|
||||
"func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Run",
|
||||
32,
|
||||
38,
|
||||
"func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Report",
|
||||
40,
|
||||
46,
|
||||
"func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("go".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("go".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "metrics.go".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-go-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_go_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.go.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-go-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_go_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeGoAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Java code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJavaAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/java/com/example/Metrics.java".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-java-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line method body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "public class BigCompute {\n public int compute(int[] data) {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" int v{i} = {i} < data.length ? data[{i}] : 0;\n"))
|
||||
.collect();
|
||||
let footer = " return data.length;\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free method `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import java.util.List;\nimport java.util.Map;\nimport java.util.ArrayList;\nimport java.util.HashMap;\nimport java.util.stream.Collectors;".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"public static double computeMRR(List<Double> scores) {\n if (scores.isEmpty()) {\n return 0.0;\n }\n return 1.0 / scores.size();\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"public class MetricsCollector {\n private List<Double> scores;\n private List<String> labels;\n private Map<String, Integer> counts;\n private Map<String, Double> totals;\n private List<String> tags;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"public class BaseEvaluator {\n private String name;\n\n public BaseEvaluator(String name) {\n this.name = name;\n }\n\n public void evaluate(List<String> data) throws Exception {\n String joined = String.join(\",\", data);\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"public void run(List<Double> inputs) {\n for (Double inp : inputs) {\n scores.add(\n inp\n );\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"public Map<String, Object> report() {\n Map<String, Object> result = new HashMap<>();\n result.put(\"mean\", 0.0);\n result.put(\"count\", scores.size());\n result.put(\"tags\", tags);\n return result;\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("java".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.java".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-java-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_java_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.java.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-java-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_java_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeJavaAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Kotlin code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeKotlinAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/kotlin/com/example/Metrics.kt".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-kotlin-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "class BigCompute {\n fun compute(data: IntArray): Int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" val v{i} = if ({i} < data.size) data[{i}] else 0\n"))
|
||||
.collect();
|
||||
let footer = " return data.size\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. top-level fn `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. data class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import kotlin.collections.List\nimport kotlin.collections.Map\nimport kotlin.collections.MutableList\nimport kotlin.collections.MutableMap\nimport kotlin.collections.mutableListOf".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"fun computeMRR(scores: List<Double>): Double {\n if (scores.isEmpty()) {\n return 0.0\n }\n return 1.0 / scores.size\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"data class MetricsCollector(\n val scores: MutableList<Double> = mutableListOf(),\n val labels: MutableList<String> = mutableListOf(),\n val counts: MutableMap<String, Int> = mutableMapOf(),\n val totals: MutableMap<String, Double> = mutableMapOf(),\n val tags: MutableList<String> = mutableListOf(),\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"open class BaseEvaluator(val name: String) {\n\n fun evaluate(data: List<String>) {\n val joined = data.joinToString(\",\")\n println(joined)\n }\n\n open fun describe(): String = name\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"fun MetricsCollector.run(inputs: List<Double>) {\n for (inp in inputs) {\n scores.add(\n inp\n )\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"fun MetricsCollector.report(): Map<String, Any> {\n return mapOf(\n \"mean\" to 0.0,\n \"count\" to scores.size,\n \"tags\" to tags,\n )\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("kotlin".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("kotlin".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.kt".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-kotlin-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn code_kotlin_ast_chunks_snapshot() {
|
||||
let doc = fixed_doc();
|
||||
let policy = fixed_policy();
|
||||
|
||||
let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy).expect("chunk");
|
||||
let actual = serde_json::to_value(&chunks).unwrap();
|
||||
|
||||
let dir = fixtures_dir();
|
||||
let baseline_path = dir.join("code-sample.kt.chunks.snapshot.json");
|
||||
let baseline_text = match std::fs::read_to_string(&baseline_path) {
|
||||
Ok(s) => s,
|
||||
Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => {
|
||||
std::fs::create_dir_all(&dir).unwrap();
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
return;
|
||||
}
|
||||
Err(e) => panic!(
|
||||
"missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
|
||||
baseline_path.display()
|
||||
),
|
||||
};
|
||||
let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
if std::env::var("UPDATE_SNAPSHOTS").is_ok() {
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
|
||||
eprintln!("updated baseline {}", baseline_path.display());
|
||||
return;
|
||||
}
|
||||
let pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"code-kotlin-ast-v1 chunks snapshot drift\n\
|
||||
--- expected ({}) ---\n{baseline_text}\n\
|
||||
--- actual ---\n{pretty}\n\
|
||||
If intentional, re-run with UPDATE_SNAPSHOTS=1.",
|
||||
baseline_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determinism cross-check: re-running the same pipeline yields the same
|
||||
/// chunk_ids byte-for-byte.
|
||||
#[test]
|
||||
fn code_kotlin_ast_chunks_are_deterministic() {
|
||||
let policy = fixed_policy();
|
||||
let baseline: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
for _ in 0..5 {
|
||||
let again: Vec<String> = CodeKotlinAstV1Chunker
|
||||
.chunk(&fixed_doc(), &policy)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|c| c.chunk_id.0)
|
||||
.collect();
|
||||
assert_eq!(again, baseline);
|
||||
}
|
||||
}
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodePythonAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeRustAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
//! Behavioural tests for `CodeTextParagraphV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing raw text into a single `Block::Code`, mirroring the pattern used
|
||||
//! in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTextParagraphV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `text`
|
||||
/// and the supplied `lang` label.
|
||||
fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("scripts/sample.sh".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-text-paragraph-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some(lang.into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some(lang.into()),
|
||||
code: text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "sample.sh".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some(lang.into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-text-paragraph-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// `sample_shell.sh` has 4 paragraphs separated by 3 blank lines:
|
||||
/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail)
|
||||
/// - paragraph 2: lines 4-7 (env setup block)
|
||||
/// - paragraph 3: lines 9-11 (ingest block)
|
||||
/// - paragraph 4: lines 13-15 (report block)
|
||||
///
|
||||
/// We assert:
|
||||
/// - exactly 4 chunks (one per paragraph)
|
||||
/// - all symbols are None (Tier 3 spec §9.3)
|
||||
/// - all langs are "shell"
|
||||
/// - line ranges are strictly ascending and do NOT include the blank lines
|
||||
/// (lines 3, 8, 12 must not appear in any range)
|
||||
#[test]
|
||||
fn shell_multi_paragraph_splits_on_blank_lines() {
|
||||
let fixture_path = fixtures_dir().join("sample_shell.sh");
|
||||
let text = std::fs::read_to_string(&fixture_path)
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = text_doc("shell", &text);
|
||||
let chunks = CodeTextParagraphV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
4,
|
||||
"expected 4 chunks (one per paragraph), got {}: {chunks:#?}",
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
// All symbols must be None (Tier 3 requirement).
|
||||
for (i, chunk) in chunks.iter().enumerate() {
|
||||
match &chunk.source_spans[0] {
|
||||
SourceSpan::Code { symbol, .. } => {
|
||||
assert!(
|
||||
symbol.is_none(),
|
||||
"chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
other => panic!("chunk[{i}]: expected Code span, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
// All langs must be "shell".
|
||||
for (i, chunk) in chunks.iter().enumerate() {
|
||||
match &chunk.source_spans[0] {
|
||||
SourceSpan::Code { lang, .. } => {
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("shell"),
|
||||
"chunk[{i}] lang must be 'shell', got {lang:?}"
|
||||
);
|
||||
}
|
||||
other => panic!("chunk[{i}]: expected Code span, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
// Line ranges must be strictly ascending with no overlap,
|
||||
// and blank lines (3, 8, 12) must not be included in any range.
|
||||
let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)];
|
||||
let actual_ranges: Vec<(u32, u32)> = chunks
|
||||
.iter()
|
||||
.map(|c| match &c.source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => (*line_start, *line_end),
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
actual_ranges, expected_ranges,
|
||||
"line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
|
||||
);
|
||||
}
|
||||
|
||||
/// `sample_long_paragraph.txt` has exactly 200 non-blank lines and no blank
|
||||
/// lines, so the entire file is one paragraph. 200 > 80 (FALLBACK_LINES_PER_CHUNK),
|
||||
/// so the oversize window split fires with stride 60:
|
||||
/// - window 1: lines 1-80
|
||||
/// - window 2: lines 61-140
|
||||
/// - window 3: lines 121-200
|
||||
///
|
||||
/// All chunk_ids must be distinct (the #L{window_start} split_key suffix).
|
||||
#[test]
|
||||
fn single_long_paragraph_line_window_split() {
|
||||
let fixture_path = fixtures_dir().join("sample_long_paragraph.txt");
|
||||
let text = std::fs::read_to_string(&fixture_path)
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
assert_eq!(
|
||||
text.lines().count(),
|
||||
200,
|
||||
"fixture must have exactly 200 lines"
|
||||
);
|
||||
|
||||
let doc = text_doc("shell", &text);
|
||||
let chunks = CodeTextParagraphV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
3,
|
||||
"expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}",
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)];
|
||||
let actual_ranges: Vec<(u32, u32)> = chunks
|
||||
.iter()
|
||||
.map(|c| match &c.source_spans[0] {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
..
|
||||
} => (*line_start, *line_end),
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
actual_ranges, expected_ranges,
|
||||
"window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
|
||||
);
|
||||
|
||||
// All chunk_ids must be distinct (#L{window_start} suffix differentiates them).
|
||||
let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect();
|
||||
assert_eq!(
|
||||
ids.len(),
|
||||
chunks.len(),
|
||||
"oversize window chunks must have distinct chunk_ids"
|
||||
);
|
||||
}
|
||||
|
||||
/// An empty source file (no non-blank lines) must yield zero chunks.
|
||||
#[test]
|
||||
fn empty_file_emits_zero_chunks() {
|
||||
let doc = text_doc("shell", "");
|
||||
let chunks = CodeTextParagraphV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
0,
|
||||
"empty file must yield 0 chunks, got {}: {chunks:#?}",
|
||||
chunks.len()
|
||||
);
|
||||
}
|
||||
|
||||
/// The `lang` field on each emitted chunk must match the `lang` passed to
|
||||
/// `text_doc`, regardless of content. `symbol` must be `None` (Tier 3 spec).
|
||||
#[test]
|
||||
fn lang_field_preserved_from_input_doc() {
|
||||
let doc = text_doc("yaml", "key1: value1\nkey2: value2\n");
|
||||
let chunks = CodeTextParagraphV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
|
||||
assert!(!chunks.is_empty(), "expected at least one chunk");
|
||||
|
||||
match &chunks[0].source_spans[0] {
|
||||
SourceSpan::Code { lang, symbol, .. } => {
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("yaml"),
|
||||
"lang must be 'yaml', got {lang:?}"
|
||||
);
|
||||
assert!(
|
||||
symbol.is_none(),
|
||||
"symbol must be None for Tier 3 chunker, got {symbol:?}"
|
||||
);
|
||||
}
|
||||
other => panic!("expected Code span, got {other:?}"),
|
||||
}
|
||||
}
|
||||
@@ -13,9 +13,9 @@ use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock,
|
||||
Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath,
|
||||
id_for_block, id_for_doc,
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,138 @@
|
||||
//! Behavioural tests for `DockerfileFileV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the
|
||||
//! pattern used in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::DockerfileFileV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`.
|
||||
fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("build/Dockerfile".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-dockerfile-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = dockerfile_text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some("dockerfile".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("dockerfile".into()),
|
||||
code: dockerfile_text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Dockerfile".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("dockerfile".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("dockerfile-file-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// A simple 5-line Dockerfile fixture must emit exactly 1 chunk with the
|
||||
/// correct symbol, lang, and line range.
|
||||
#[test]
|
||||
fn dockerfile_emits_single_chunk() {
|
||||
let fixture_path = fixtures_dir().join("sample.dockerfile");
|
||||
let text = std::fs::read_to_string(&fixture_path)
|
||||
.unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display()));
|
||||
|
||||
let doc = dockerfile_doc(&text);
|
||||
let chunks = DockerfileFileV1Chunker
|
||||
.chunk(&doc, &policy())
|
||||
.expect("chunk");
|
||||
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
1,
|
||||
"expected 1 chunk, got {}: {chunks:#?}",
|
||||
chunks.len()
|
||||
);
|
||||
|
||||
// Inspect the Chunk's source_spans for symbol / lang / line range.
|
||||
let span = chunks[0].source_spans.first().expect("at least one span");
|
||||
match span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => {
|
||||
assert_eq!(*line_start, 1, "line_start must be 1");
|
||||
assert_eq!(*line_end, 5, "line_end must be 5 (5-line fixture)");
|
||||
assert_eq!(
|
||||
symbol.as_deref(),
|
||||
Some("<dockerfile>"),
|
||||
"symbol must be '<dockerfile>'"
|
||||
);
|
||||
assert_eq!(
|
||||
lang.as_deref(),
|
||||
Some("dockerfile"),
|
||||
"lang must be 'dockerfile'"
|
||||
);
|
||||
}
|
||||
other => panic!("expected SourceSpan::Code, got {other:?}"),
|
||||
}
|
||||
|
||||
// Verify chunker_version label.
|
||||
assert_eq!(chunks[0].chunker_version.0, "dockerfile-file-v1");
|
||||
}
|
||||
90
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
90
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,90 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"8149e12ca002489acb4a0f74c97a061a"
|
||||
],
|
||||
"chunk_id": "ec3cf06ae56c8e9796bbc9196438b7c5",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 18,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;",
|
||||
"token_estimate": 78,
|
||||
"tokenized_korean_text": "# include < stdio . h > # include < stdlib . h > # define MAX _ BUF 4096 typedef enum { OK = 0 , ERR _ PARSE , ERR _ IO , } status _ t ; typedef struct { int id ; char name [ 64 ]; status _ t status ; } record _ t ; static int counter = 0 ;"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"1baaa89f21a47b2f32d6396a24a85454"
|
||||
],
|
||||
"chunk_id": "c2d7a81c898106733ef2e703774a6a4a",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 23,
|
||||
"line_start": 20,
|
||||
"symbol": "parse_record"
|
||||
}
|
||||
],
|
||||
"text": "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}",
|
||||
"token_estimate": 41,
|
||||
"tokenized_korean_text": "int parse _ record ( const char * line , record _ t * out ) { if ( line == NULL || out == NULL ) return ERR _ PARSE ; return OK ; }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"8d0e14cbcc6d1e92d7878ab796ea68b8"
|
||||
],
|
||||
"chunk_id": "0e4d7b131ab64eba03b51903b5d8f96d",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 27,
|
||||
"line_start": 25,
|
||||
"symbol": "print_record"
|
||||
}
|
||||
],
|
||||
"text": "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}",
|
||||
"token_estimate": 35,
|
||||
"tokenized_korean_text": "void print _ record ( const record _ t * r ) { printf (\"[% d ] % s ( status =% d )\\ n \", r -> id , r -> name , r -> status ); }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"9c2ede84423871b615d48c38fefb1853"
|
||||
],
|
||||
"chunk_id": "e076f8edb2ff141d7e99b4106bb95157",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 33,
|
||||
"line_start": 29,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}",
|
||||
"token_estimate": 38,
|
||||
"tokenized_korean_text": "int main ( void ) { record _ t r = { . id = 1 , . name = \" foo \", . status = OK }; print _ record (& r ); return 0 ; }"
|
||||
}
|
||||
]
|
||||
File diff suppressed because one or more lines are too long
112
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
112
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"53292605459065d170cd36c118e20546"
|
||||
],
|
||||
"chunk_id": "50a5b324300d9082eac4ce2a422810e1",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 4,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <string>\n#include <vector>\n\nnamespace kebab {",
|
||||
"token_estimate": 18,
|
||||
"tokenized_korean_text": "# include < string > # include < vector > namespace kebab {"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"f349acad94c9fa4cf9ad1c0a93e83610"
|
||||
],
|
||||
"chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 20,
|
||||
"line_start": 6,
|
||||
"symbol": "kebab::chunk::MdHeadingV1Chunker"
|
||||
}
|
||||
],
|
||||
"text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};",
|
||||
"token_estimate": 95,
|
||||
"tokenized_korean_text": "class MdHeadingV 1 Chunker { public : MdHeadingV 1 Chunker ( ) = default ; ~ MdHeadingV 1 Chunker ( ) = default ; std : : string chunk _ doc ( const std : : string & doc ) { return doc ; } int operator ( ) ( int x ) const { return x * 2 ; } private : int counter _ = 0 ; };"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"8b9811387717d0bd4abf84abcc35b8b1"
|
||||
],
|
||||
"chunk_id": "d9326d252905b665b2adb9a416c20451",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 25,
|
||||
"line_start": 22,
|
||||
"symbol": "kebab::identity"
|
||||
}
|
||||
],
|
||||
"text": "template <typename T>\nT identity(T value) {\n return value;\n}",
|
||||
"token_estimate": 21,
|
||||
"tokenized_korean_text": "template < typename T > T identity ( T value ) { return value ; }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"1754cb6b971f6a4cb292f144a4f0570b"
|
||||
],
|
||||
"chunk_id": "56ee5f991de4a413c016da8dc4acfc35",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 29,
|
||||
"line_start": 27,
|
||||
"symbol": "kebab::global_helper"
|
||||
}
|
||||
],
|
||||
"text": "void global_helper() {\n // free function in kebab namespace\n}",
|
||||
"token_estimate": 22,
|
||||
"tokenized_korean_text": "void global _ helper ( ) { / / free function in kebab namespace }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"14b5f3393d6d25f822f5b70763d24acd"
|
||||
],
|
||||
"chunk_id": "c0d7c043cdd575c530db3909b54cc906",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 34,
|
||||
"line_start": 31,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}",
|
||||
"token_estimate": 23,
|
||||
"tokenized_korean_text": "int main ( ) { kebab : : chunk : : MdHeadingV 1 Chunker c ; return 0 ; }"
|
||||
}
|
||||
]
|
||||
244
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
244
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,244 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"c182bf37e32c7fc1b868bd617f8eaf66"
|
||||
],
|
||||
"chunk_id": "43de518d946dc18ec040ae20d74e0cff",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 5,
|
||||
"line_start": 1,
|
||||
"symbol": "imports"
|
||||
}
|
||||
],
|
||||
"text": "import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)",
|
||||
"token_estimate": 12,
|
||||
"tokenized_korean_text": "import ( \" fmt \" \" os \" \" strings \" )"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"c9992cdcfdf3c2a7700a4abc4782a8a4"
|
||||
],
|
||||
"chunk_id": "af4c382a83f1e8cdea495d8b33c11abc",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 12,
|
||||
"line_start": 7,
|
||||
"symbol": "ComputeMRR"
|
||||
}
|
||||
],
|
||||
"text": "func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}",
|
||||
"token_estimate": 50,
|
||||
"tokenized_korean_text": "func ComputeMRR ( scores [ ] float 64 ) float 64 { if len ( scores ) == 0 { return 0 . 0 } _ = fmt . Sprintf (\"% v \", scores ) return 1 . 0 / float 64 ( len ( scores ) ) }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5f18dc3e79fe946ba05d32c3bfc00684"
|
||||
],
|
||||
"chunk_id": "4be6d8f180bc19b8651877e5264852ac",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 20,
|
||||
"line_start": 14,
|
||||
"symbol": "MetricsCollector"
|
||||
}
|
||||
],
|
||||
"text": "type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}",
|
||||
"token_estimate": 45,
|
||||
"tokenized_korean_text": "type MetricsCollector struct { Scores [ ] float 64 Labels [ ] string Counts map [ string ] int Totals map [ string ] float 64 Tags [ ] string }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"3009cc022ca832c323393e4f9bcdb388"
|
||||
],
|
||||
"chunk_id": "3ae182f4c6d304ee7f0aaf447142f948",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 30,
|
||||
"line_start": 22,
|
||||
"symbol": "BaseEvaluator"
|
||||
}
|
||||
],
|
||||
"text": "type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "type BaseEvaluator struct { Name string } func ( e * BaseEvaluator ) Evaluate ( data [ ] string ) error { _ = os . Stderr _ = strings . Join ( data , \",\") return nil }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"e0e83d1d7f9327a1902ae9a8f67c1f1c"
|
||||
],
|
||||
"chunk_id": "b962f14980e756bb8ba514e2282756cd",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 38,
|
||||
"line_start": 32,
|
||||
"symbol": "MetricsCollector.Run"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}",
|
||||
"token_estimate": 44,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Run ( inputs [ ] float 64 ) { for _, inp := range inputs { m . Scores = append ( m . Scores , inp , ) } }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"0e6a572bc3fe2bd6d173fe614bd1b763"
|
||||
],
|
||||
"chunk_id": "441c695e990e7f49188068433e313e87",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 46,
|
||||
"line_start": 40,
|
||||
"symbol": "MetricsCollector.Report"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Report ( ) map [ string ] interface {} { return map [ string ] interface {}{ \" mean \": 0 . 0 , \" count \": len ( m . Scores ) , \" tags \": m . Tags , } }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "7a942d871c588ec69426290561f05179",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 247,
|
||||
"line_start": 48,
|
||||
"symbol": "BigCompute [part 1/5]"
|
||||
}
|
||||
],
|
||||
"text": "func BigCompute(data []int) int {\n\tv0 := 0\n\tif 0 < len(data) {\n\t\tv0 = data[0]\n\t}\n\tv1 := 0\n\tif 1 < len(data) {\n\t\tv1 = data[1]\n\t}\n\tv2 := 0\n\tif 2 < len(data) {\n\t\tv2 = data[2]\n\t}\n\tv3 := 0\n\tif 3 < len(data) {\n\t\tv3 = data[3]\n\t}\n\tv4 := 0\n\tif 4 < len(data) {\n\t\tv4 = data[4]\n\t}\n\tv5 := 0\n\tif 5 < len(data) {\n\t\tv5 = data[5]\n\t}\n\tv6 := 0\n\tif 6 < len(data) {\n\t\tv6 = data[6]\n\t}\n\tv7 := 0\n\tif 7 < len(data) {\n\t\tv7 = data[7]\n\t}\n\tv8 := 0\n\tif 8 < len(data) {\n\t\tv8 = data[8]\n\t}\n\tv9 := 0\n\tif 9 < len(data) {\n\t\tv9 = data[9]\n\t}\n\tv10 := 0\n\tif 10 < len(data) {\n\t\tv10 = data[10]\n\t}\n\tv11 := 0\n\tif 11 < len(data) {\n\t\tv11 = data[11]\n\t}\n\tv12 := 0\n\tif 12 < len(data) {\n\t\tv12 = data[12]\n\t}\n\tv13 := 0\n\tif 13 < len(data) {\n\t\tv13 = data[13]\n\t}\n\tv14 := 0\n\tif 14 < len(data) {\n\t\tv14 = data[14]\n\t}\n\tv15 := 0\n\tif 15 < len(data) {\n\t\tv15 = data[15]\n\t}\n\tv16 := 0\n\tif 16 < len(data) {\n\t\tv16 = data[16]\n\t}\n\tv17 := 0\n\tif 17 < len(data) {\n\t\tv17 = data[17]\n\t}\n\tv18 := 0\n\tif 18 < len(data) {\n\t\tv18 = data[18]\n\t}\n\tv19 := 0\n\tif 19 < len(data) {\n\t\tv19 = data[19]\n\t}\n\tv20 := 0\n\tif 20 < len(data) {\n\t\tv20 = data[20]\n\t}\n\tv21 := 0\n\tif 21 < len(data) {\n\t\tv21 = data[21]\n\t}\n\tv22 := 0\n\tif 22 < len(data) {\n\t\tv22 = data[22]\n\t}\n\tv23 := 0\n\tif 23 < len(data) {\n\t\tv23 = data[23]\n\t}\n\tv24 := 0\n\tif 24 < len(data) {\n\t\tv24 = data[24]\n\t}\n\tv25 := 0\n\tif 25 < len(data) {\n\t\tv25 = data[25]\n\t}\n\tv26 := 0\n\tif 26 < len(data) {\n\t\tv26 = data[26]\n\t}\n\tv27 := 0\n\tif 27 < len(data) {\n\t\tv27 = data[27]\n\t}\n\tv28 := 0\n\tif 28 < len(data) {\n\t\tv28 = data[28]\n\t}\n\tv29 := 0\n\tif 29 < len(data) {\n\t\tv29 = data[29]\n\t}\n\tv30 := 0\n\tif 30 < len(data) {\n\t\tv30 = data[30]\n\t}\n\tv31 := 0\n\tif 31 < len(data) {\n\t\tv31 = data[31]\n\t}\n\tv32 := 0\n\tif 32 < len(data) {\n\t\tv32 = data[32]\n\t}\n\tv33 := 0\n\tif 33 < len(data) {\n\t\tv33 = data[33]\n\t}\n\tv34 := 0\n\tif 34 < len(data) {\n\t\tv34 = data[34]\n\t}\n\tv35 := 0\n\tif 35 < len(data) {\n\t\tv35 = data[35]\n\t}\n\tv36 := 0\n\tif 36 < len(data) {\n\t\tv36 = data[36]\n\t}\n\tv37 := 0\n\tif 37 < len(data) {\n\t\tv37 = data[37]\n\t}\n\tv38 := 0\n\tif 38 < len(data) {\n\t\tv38 = data[38]\n\t}\n\tv39 := 0\n\tif 39 < len(data) {\n\t\tv39 = data[39]\n\t}\n\tv40 := 0\n\tif 40 < len(data) {\n\t\tv40 = data[40]\n\t}\n\tv41 := 0\n\tif 41 < len(data) {\n\t\tv41 = data[41]\n\t}\n\tv42 := 0\n\tif 42 < len(data) {\n\t\tv42 = data[42]\n\t}\n\tv43 := 0\n\tif 43 < len(data) {\n\t\tv43 = data[43]\n\t}\n\tv44 := 0\n\tif 44 < len(data) {\n\t\tv44 = data[44]\n\t}\n\tv45 := 0\n\tif 45 < len(data) {\n\t\tv45 = data[45]\n\t}\n\tv46 := 0\n\tif 46 < len(data) {\n\t\tv46 = data[46]\n\t}\n\tv47 := 0\n\tif 47 < len(data) {\n\t\tv47 = data[47]\n\t}\n\tv48 := 0\n\tif 48 < len(data) {\n\t\tv48 = data[48]\n\t}\n\tv49 := 0\n\tif 49 < len(data) {\n\t\tv49 = data[49]",
|
||||
"token_estimate": 847,
|
||||
"tokenized_korean_text": "func BigCompute ( data [ ] int ) int { v 0 := 0 if 0 < len ( data ) { v 0 = data [ 0 ] } v 1 := 0 if 1 < len ( data ) { v 1 = data [ 1 ] } v 2 := 0 if 2 < len ( data ) { v 2 = data [ 2 ] } v 3 := 0 if 3 < len ( data ) { v 3 = data [ 3 ] } v 4 := 0 if 4 < len ( data ) { v 4 = data [ 4 ] } v 5 := 0 if 5 < len ( data ) { v 5 = data [ 5 ] } v 6 := 0 if 6 < len ( data ) { v 6 = data [ 6 ] } v 7 := 0 if 7 < len ( data ) { v 7 = data [ 7 ] } v 8 := 0 if 8 < len ( data ) { v 8 = data [ 8 ] } v 9 := 0 if 9 < len ( data ) { v 9 = data [ 9 ] } v 10 := 0 if 10 < len ( data ) { v 10 = data [ 10 ] } v 11 := 0 if 11 < len ( data ) { v 11 = data [ 11 ] } v 12 := 0 if 12 < len ( data ) { v 12 = data [ 12 ] } v 13 := 0 if 13 < len ( data ) { v 13 = data [ 13 ] } v 14 := 0 if 14 < len ( data ) { v 14 = data [ 14 ] } v 15 := 0 if 15 < len ( data ) { v 15 = data [ 15 ] } v 16 := 0 if 16 < len ( data ) { v 16 = data [ 16 ] } v 17 := 0 if 17 < len ( data ) { v 17 = data [ 17 ] } v 18 := 0 if 18 < len ( data ) { v 18 = data [ 18 ] } v 19 := 0 if 19 < len ( data ) { v 19 = data [ 19 ] } v 20 := 0 if 20 < len ( data ) { v 20 = data [ 20 ] } v 21 := 0 if 21 < len ( data ) { v 21 = data [ 21 ] } v 22 := 0 if 22 < len ( data ) { v 22 = data [ 22 ] } v 23 := 0 if 23 < len ( data ) { v 23 = data [ 23 ] } v 24 := 0 if 24 < len ( data ) { v 24 = data [ 24 ] } v 25 := 0 if 25 < len ( data ) { v 25 = data [ 25 ] } v 26 := 0 if 26 < len ( data ) { v 26 = data [ 26 ] } v 27 := 0 if 27 < len ( data ) { v 27 = data [ 27 ] } v 28 := 0 if 28 < len ( data ) { v 28 = data [ 28 ] } v 29 := 0 if 29 < len ( data ) { v 29 = data [ 29 ] } v 30 := 0 if 30 < len ( data ) { v 30 = data [ 30 ] } v 31 := 0 if 31 < len ( data ) { v 31 = data [ 31 ] } v 32 := 0 if 32 < len ( data ) { v 32 = data [ 32 ] } v 33 := 0 if 33 < len ( data ) { v 33 = data [ 33 ] } v 34 := 0 if 34 < len ( data ) { v 34 = data [ 34 ] } v 35 := 0 if 35 < len ( data ) { v 35 = data [ 35 ] } v 36 := 0 if 36 < len ( data ) { v 36 = data [ 36 ] } v 37 := 0 if 37 < len ( data ) { v 37 = data [ 37 ] } v 38 := 0 if 38 < len ( data ) { v 38 = data [ 38 ] } v 39 := 0 if 39 < len ( data ) { v 39 = data [ 39 ] } v 40 := 0 if 40 < len ( data ) { v 40 = data [ 40 ] } v 41 := 0 if 41 < len ( data ) { v 41 = data [ 41 ] } v 42 := 0 if 42 < len ( data ) { v 42 = data [ 42 ] } v 43 := 0 if 43 < len ( data ) { v 43 = data [ 43 ] } v 44 := 0 if 44 < len ( data ) { v 44 = data [ 44 ] } v 45 := 0 if 45 < len ( data ) { v 45 = data [ 45 ] } v 46 := 0 if 46 < len ( data ) { v 46 = data [ 46 ] } v 47 := 0 if 47 < len ( data ) { v 47 = data [ 47 ] } v 48 := 0 if 48 < len ( data ) { v 48 = data [ 48 ] } v 49 := 0 if 49 < len ( data ) { v 49 = data [ 49 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "3f44ba43c9415652e2705bb667776e76",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 447,
|
||||
"line_start": 248,
|
||||
"symbol": "BigCompute [part 2/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv50 := 0\n\tif 50 < len(data) {\n\t\tv50 = data[50]\n\t}\n\tv51 := 0\n\tif 51 < len(data) {\n\t\tv51 = data[51]\n\t}\n\tv52 := 0\n\tif 52 < len(data) {\n\t\tv52 = data[52]\n\t}\n\tv53 := 0\n\tif 53 < len(data) {\n\t\tv53 = data[53]\n\t}\n\tv54 := 0\n\tif 54 < len(data) {\n\t\tv54 = data[54]\n\t}\n\tv55 := 0\n\tif 55 < len(data) {\n\t\tv55 = data[55]\n\t}\n\tv56 := 0\n\tif 56 < len(data) {\n\t\tv56 = data[56]\n\t}\n\tv57 := 0\n\tif 57 < len(data) {\n\t\tv57 = data[57]\n\t}\n\tv58 := 0\n\tif 58 < len(data) {\n\t\tv58 = data[58]\n\t}\n\tv59 := 0\n\tif 59 < len(data) {\n\t\tv59 = data[59]\n\t}\n\tv60 := 0\n\tif 60 < len(data) {\n\t\tv60 = data[60]\n\t}\n\tv61 := 0\n\tif 61 < len(data) {\n\t\tv61 = data[61]\n\t}\n\tv62 := 0\n\tif 62 < len(data) {\n\t\tv62 = data[62]\n\t}\n\tv63 := 0\n\tif 63 < len(data) {\n\t\tv63 = data[63]\n\t}\n\tv64 := 0\n\tif 64 < len(data) {\n\t\tv64 = data[64]\n\t}\n\tv65 := 0\n\tif 65 < len(data) {\n\t\tv65 = data[65]\n\t}\n\tv66 := 0\n\tif 66 < len(data) {\n\t\tv66 = data[66]\n\t}\n\tv67 := 0\n\tif 67 < len(data) {\n\t\tv67 = data[67]\n\t}\n\tv68 := 0\n\tif 68 < len(data) {\n\t\tv68 = data[68]\n\t}\n\tv69 := 0\n\tif 69 < len(data) {\n\t\tv69 = data[69]\n\t}\n\tv70 := 0\n\tif 70 < len(data) {\n\t\tv70 = data[70]\n\t}\n\tv71 := 0\n\tif 71 < len(data) {\n\t\tv71 = data[71]\n\t}\n\tv72 := 0\n\tif 72 < len(data) {\n\t\tv72 = data[72]\n\t}\n\tv73 := 0\n\tif 73 < len(data) {\n\t\tv73 = data[73]\n\t}\n\tv74 := 0\n\tif 74 < len(data) {\n\t\tv74 = data[74]\n\t}\n\tv75 := 0\n\tif 75 < len(data) {\n\t\tv75 = data[75]\n\t}\n\tv76 := 0\n\tif 76 < len(data) {\n\t\tv76 = data[76]\n\t}\n\tv77 := 0\n\tif 77 < len(data) {\n\t\tv77 = data[77]\n\t}\n\tv78 := 0\n\tif 78 < len(data) {\n\t\tv78 = data[78]\n\t}\n\tv79 := 0\n\tif 79 < len(data) {\n\t\tv79 = data[79]\n\t}\n\tv80 := 0\n\tif 80 < len(data) {\n\t\tv80 = data[80]\n\t}\n\tv81 := 0\n\tif 81 < len(data) {\n\t\tv81 = data[81]\n\t}\n\tv82 := 0\n\tif 82 < len(data) {\n\t\tv82 = data[82]\n\t}\n\tv83 := 0\n\tif 83 < len(data) {\n\t\tv83 = data[83]\n\t}\n\tv84 := 0\n\tif 84 < len(data) {\n\t\tv84 = data[84]\n\t}\n\tv85 := 0\n\tif 85 < len(data) {\n\t\tv85 = data[85]\n\t}\n\tv86 := 0\n\tif 86 < len(data) {\n\t\tv86 = data[86]\n\t}\n\tv87 := 0\n\tif 87 < len(data) {\n\t\tv87 = data[87]\n\t}\n\tv88 := 0\n\tif 88 < len(data) {\n\t\tv88 = data[88]\n\t}\n\tv89 := 0\n\tif 89 < len(data) {\n\t\tv89 = data[89]\n\t}\n\tv90 := 0\n\tif 90 < len(data) {\n\t\tv90 = data[90]\n\t}\n\tv91 := 0\n\tif 91 < len(data) {\n\t\tv91 = data[91]\n\t}\n\tv92 := 0\n\tif 92 < len(data) {\n\t\tv92 = data[92]\n\t}\n\tv93 := 0\n\tif 93 < len(data) {\n\t\tv93 = data[93]\n\t}\n\tv94 := 0\n\tif 94 < len(data) {\n\t\tv94 = data[94]\n\t}\n\tv95 := 0\n\tif 95 < len(data) {\n\t\tv95 = data[95]\n\t}\n\tv96 := 0\n\tif 96 < len(data) {\n\t\tv96 = data[96]\n\t}\n\tv97 := 0\n\tif 97 < len(data) {\n\t\tv97 = data[97]\n\t}\n\tv98 := 0\n\tif 98 < len(data) {\n\t\tv98 = data[98]\n\t}\n\tv99 := 0\n\tif 99 < len(data) {\n\t\tv99 = data[99]",
|
||||
"token_estimate": 850,
|
||||
"tokenized_korean_text": "} v 50 := 0 if 50 < len ( data ) { v 50 = data [ 50 ] } v 51 := 0 if 51 < len ( data ) { v 51 = data [ 51 ] } v 52 := 0 if 52 < len ( data ) { v 52 = data [ 52 ] } v 53 := 0 if 53 < len ( data ) { v 53 = data [ 53 ] } v 54 := 0 if 54 < len ( data ) { v 54 = data [ 54 ] } v 55 := 0 if 55 < len ( data ) { v 55 = data [ 55 ] } v 56 := 0 if 56 < len ( data ) { v 56 = data [ 56 ] } v 57 := 0 if 57 < len ( data ) { v 57 = data [ 57 ] } v 58 := 0 if 58 < len ( data ) { v 58 = data [ 58 ] } v 59 := 0 if 59 < len ( data ) { v 59 = data [ 59 ] } v 60 := 0 if 60 < len ( data ) { v 60 = data [ 60 ] } v 61 := 0 if 61 < len ( data ) { v 61 = data [ 61 ] } v 62 := 0 if 62 < len ( data ) { v 62 = data [ 62 ] } v 63 := 0 if 63 < len ( data ) { v 63 = data [ 63 ] } v 64 := 0 if 64 < len ( data ) { v 64 = data [ 64 ] } v 65 := 0 if 65 < len ( data ) { v 65 = data [ 65 ] } v 66 := 0 if 66 < len ( data ) { v 66 = data [ 66 ] } v 67 := 0 if 67 < len ( data ) { v 67 = data [ 67 ] } v 68 := 0 if 68 < len ( data ) { v 68 = data [ 68 ] } v 69 := 0 if 69 < len ( data ) { v 69 = data [ 69 ] } v 70 := 0 if 70 < len ( data ) { v 70 = data [ 70 ] } v 71 := 0 if 71 < len ( data ) { v 71 = data [ 71 ] } v 72 := 0 if 72 < len ( data ) { v 72 = data [ 72 ] } v 73 := 0 if 73 < len ( data ) { v 73 = data [ 73 ] } v 74 := 0 if 74 < len ( data ) { v 74 = data [ 74 ] } v 75 := 0 if 75 < len ( data ) { v 75 = data [ 75 ] } v 76 := 0 if 76 < len ( data ) { v 76 = data [ 76 ] } v 77 := 0 if 77 < len ( data ) { v 77 = data [ 77 ] } v 78 := 0 if 78 < len ( data ) { v 78 = data [ 78 ] } v 79 := 0 if 79 < len ( data ) { v 79 = data [ 79 ] } v 80 := 0 if 80 < len ( data ) { v 80 = data [ 80 ] } v 81 := 0 if 81 < len ( data ) { v 81 = data [ 81 ] } v 82 := 0 if 82 < len ( data ) { v 82 = data [ 82 ] } v 83 := 0 if 83 < len ( data ) { v 83 = data [ 83 ] } v 84 := 0 if 84 < len ( data ) { v 84 = data [ 84 ] } v 85 := 0 if 85 < len ( data ) { v 85 = data [ 85 ] } v 86 := 0 if 86 < len ( data ) { v 86 = data [ 86 ] } v 87 := 0 if 87 < len ( data ) { v 87 = data [ 87 ] } v 88 := 0 if 88 < len ( data ) { v 88 = data [ 88 ] } v 89 := 0 if 89 < len ( data ) { v 89 = data [ 89 ] } v 90 := 0 if 90 < len ( data ) { v 90 = data [ 90 ] } v 91 := 0 if 91 < len ( data ) { v 91 = data [ 91 ] } v 92 := 0 if 92 < len ( data ) { v 92 = data [ 92 ] } v 93 := 0 if 93 < len ( data ) { v 93 = data [ 93 ] } v 94 := 0 if 94 < len ( data ) { v 94 = data [ 94 ] } v 95 := 0 if 95 < len ( data ) { v 95 = data [ 95 ] } v 96 := 0 if 96 < len ( data ) { v 96 = data [ 96 ] } v 97 := 0 if 97 < len ( data ) { v 97 = data [ 97 ] } v 98 := 0 if 98 < len ( data ) { v 98 = data [ 98 ] } v 99 := 0 if 99 < len ( data ) { v 99 = data [ 99 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "e4763e10f059d97f40c2932761b56c3e",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 647,
|
||||
"line_start": 448,
|
||||
"symbol": "BigCompute [part 3/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv100 := 0\n\tif 100 < len(data) {\n\t\tv100 = data[100]\n\t}\n\tv101 := 0\n\tif 101 < len(data) {\n\t\tv101 = data[101]\n\t}\n\tv102 := 0\n\tif 102 < len(data) {\n\t\tv102 = data[102]\n\t}\n\tv103 := 0\n\tif 103 < len(data) {\n\t\tv103 = data[103]\n\t}\n\tv104 := 0\n\tif 104 < len(data) {\n\t\tv104 = data[104]\n\t}\n\tv105 := 0\n\tif 105 < len(data) {\n\t\tv105 = data[105]\n\t}\n\tv106 := 0\n\tif 106 < len(data) {\n\t\tv106 = data[106]\n\t}\n\tv107 := 0\n\tif 107 < len(data) {\n\t\tv107 = data[107]\n\t}\n\tv108 := 0\n\tif 108 < len(data) {\n\t\tv108 = data[108]\n\t}\n\tv109 := 0\n\tif 109 < len(data) {\n\t\tv109 = data[109]\n\t}\n\tv110 := 0\n\tif 110 < len(data) {\n\t\tv110 = data[110]\n\t}\n\tv111 := 0\n\tif 111 < len(data) {\n\t\tv111 = data[111]\n\t}\n\tv112 := 0\n\tif 112 < len(data) {\n\t\tv112 = data[112]\n\t}\n\tv113 := 0\n\tif 113 < len(data) {\n\t\tv113 = data[113]\n\t}\n\tv114 := 0\n\tif 114 < len(data) {\n\t\tv114 = data[114]\n\t}\n\tv115 := 0\n\tif 115 < len(data) {\n\t\tv115 = data[115]\n\t}\n\tv116 := 0\n\tif 116 < len(data) {\n\t\tv116 = data[116]\n\t}\n\tv117 := 0\n\tif 117 < len(data) {\n\t\tv117 = data[117]\n\t}\n\tv118 := 0\n\tif 118 < len(data) {\n\t\tv118 = data[118]\n\t}\n\tv119 := 0\n\tif 119 < len(data) {\n\t\tv119 = data[119]\n\t}\n\tv120 := 0\n\tif 120 < len(data) {\n\t\tv120 = data[120]\n\t}\n\tv121 := 0\n\tif 121 < len(data) {\n\t\tv121 = data[121]\n\t}\n\tv122 := 0\n\tif 122 < len(data) {\n\t\tv122 = data[122]\n\t}\n\tv123 := 0\n\tif 123 < len(data) {\n\t\tv123 = data[123]\n\t}\n\tv124 := 0\n\tif 124 < len(data) {\n\t\tv124 = data[124]\n\t}\n\tv125 := 0\n\tif 125 < len(data) {\n\t\tv125 = data[125]\n\t}\n\tv126 := 0\n\tif 126 < len(data) {\n\t\tv126 = data[126]\n\t}\n\tv127 := 0\n\tif 127 < len(data) {\n\t\tv127 = data[127]\n\t}\n\tv128 := 0\n\tif 128 < len(data) {\n\t\tv128 = data[128]\n\t}\n\tv129 := 0\n\tif 129 < len(data) {\n\t\tv129 = data[129]\n\t}\n\tv130 := 0\n\tif 130 < len(data) {\n\t\tv130 = data[130]\n\t}\n\tv131 := 0\n\tif 131 < len(data) {\n\t\tv131 = data[131]\n\t}\n\tv132 := 0\n\tif 132 < len(data) {\n\t\tv132 = data[132]\n\t}\n\tv133 := 0\n\tif 133 < len(data) {\n\t\tv133 = data[133]\n\t}\n\tv134 := 0\n\tif 134 < len(data) {\n\t\tv134 = data[134]\n\t}\n\tv135 := 0\n\tif 135 < len(data) {\n\t\tv135 = data[135]\n\t}\n\tv136 := 0\n\tif 136 < len(data) {\n\t\tv136 = data[136]\n\t}\n\tv137 := 0\n\tif 137 < len(data) {\n\t\tv137 = data[137]\n\t}\n\tv138 := 0\n\tif 138 < len(data) {\n\t\tv138 = data[138]\n\t}\n\tv139 := 0\n\tif 139 < len(data) {\n\t\tv139 = data[139]\n\t}\n\tv140 := 0\n\tif 140 < len(data) {\n\t\tv140 = data[140]\n\t}\n\tv141 := 0\n\tif 141 < len(data) {\n\t\tv141 = data[141]\n\t}\n\tv142 := 0\n\tif 142 < len(data) {\n\t\tv142 = data[142]\n\t}\n\tv143 := 0\n\tif 143 < len(data) {\n\t\tv143 = data[143]\n\t}\n\tv144 := 0\n\tif 144 < len(data) {\n\t\tv144 = data[144]\n\t}\n\tv145 := 0\n\tif 145 < len(data) {\n\t\tv145 = data[145]\n\t}\n\tv146 := 0\n\tif 146 < len(data) {\n\t\tv146 = data[146]\n\t}\n\tv147 := 0\n\tif 147 < len(data) {\n\t\tv147 = data[147]\n\t}\n\tv148 := 0\n\tif 148 < len(data) {\n\t\tv148 = data[148]\n\t}\n\tv149 := 0\n\tif 149 < len(data) {\n\t\tv149 = data[149]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 100 := 0 if 100 < len ( data ) { v 100 = data [ 100 ] } v 101 := 0 if 101 < len ( data ) { v 101 = data [ 101 ] } v 102 := 0 if 102 < len ( data ) { v 102 = data [ 102 ] } v 103 := 0 if 103 < len ( data ) { v 103 = data [ 103 ] } v 104 := 0 if 104 < len ( data ) { v 104 = data [ 104 ] } v 105 := 0 if 105 < len ( data ) { v 105 = data [ 105 ] } v 106 := 0 if 106 < len ( data ) { v 106 = data [ 106 ] } v 107 := 0 if 107 < len ( data ) { v 107 = data [ 107 ] } v 108 := 0 if 108 < len ( data ) { v 108 = data [ 108 ] } v 109 := 0 if 109 < len ( data ) { v 109 = data [ 109 ] } v 110 := 0 if 110 < len ( data ) { v 110 = data [ 110 ] } v 111 := 0 if 111 < len ( data ) { v 111 = data [ 111 ] } v 112 := 0 if 112 < len ( data ) { v 112 = data [ 112 ] } v 113 := 0 if 113 < len ( data ) { v 113 = data [ 113 ] } v 114 := 0 if 114 < len ( data ) { v 114 = data [ 114 ] } v 115 := 0 if 115 < len ( data ) { v 115 = data [ 115 ] } v 116 := 0 if 116 < len ( data ) { v 116 = data [ 116 ] } v 117 := 0 if 117 < len ( data ) { v 117 = data [ 117 ] } v 118 := 0 if 118 < len ( data ) { v 118 = data [ 118 ] } v 119 := 0 if 119 < len ( data ) { v 119 = data [ 119 ] } v 120 := 0 if 120 < len ( data ) { v 120 = data [ 120 ] } v 121 := 0 if 121 < len ( data ) { v 121 = data [ 121 ] } v 122 := 0 if 122 < len ( data ) { v 122 = data [ 122 ] } v 123 := 0 if 123 < len ( data ) { v 123 = data [ 123 ] } v 124 := 0 if 124 < len ( data ) { v 124 = data [ 124 ] } v 125 := 0 if 125 < len ( data ) { v 125 = data [ 125 ] } v 126 := 0 if 126 < len ( data ) { v 126 = data [ 126 ] } v 127 := 0 if 127 < len ( data ) { v 127 = data [ 127 ] } v 128 := 0 if 128 < len ( data ) { v 128 = data [ 128 ] } v 129 := 0 if 129 < len ( data ) { v 129 = data [ 129 ] } v 130 := 0 if 130 < len ( data ) { v 130 = data [ 130 ] } v 131 := 0 if 131 < len ( data ) { v 131 = data [ 131 ] } v 132 := 0 if 132 < len ( data ) { v 132 = data [ 132 ] } v 133 := 0 if 133 < len ( data ) { v 133 = data [ 133 ] } v 134 := 0 if 134 < len ( data ) { v 134 = data [ 134 ] } v 135 := 0 if 135 < len ( data ) { v 135 = data [ 135 ] } v 136 := 0 if 136 < len ( data ) { v 136 = data [ 136 ] } v 137 := 0 if 137 < len ( data ) { v 137 = data [ 137 ] } v 138 := 0 if 138 < len ( data ) { v 138 = data [ 138 ] } v 139 := 0 if 139 < len ( data ) { v 139 = data [ 139 ] } v 140 := 0 if 140 < len ( data ) { v 140 = data [ 140 ] } v 141 := 0 if 141 < len ( data ) { v 141 = data [ 141 ] } v 142 := 0 if 142 < len ( data ) { v 142 = data [ 142 ] } v 143 := 0 if 143 < len ( data ) { v 143 = data [ 143 ] } v 144 := 0 if 144 < len ( data ) { v 144 = data [ 144 ] } v 145 := 0 if 145 < len ( data ) { v 145 = data [ 145 ] } v 146 := 0 if 146 < len ( data ) { v 146 = data [ 146 ] } v 147 := 0 if 147 < len ( data ) { v 147 = data [ 147 ] } v 148 := 0 if 148 < len ( data ) { v 148 = data [ 148 ] } v 149 := 0 if 149 < len ( data ) { v 149 = data [ 149 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "24176c911d0bacf9a29fa7f8251f5036",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 847,
|
||||
"line_start": 648,
|
||||
"symbol": "BigCompute [part 4/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv150 := 0\n\tif 150 < len(data) {\n\t\tv150 = data[150]\n\t}\n\tv151 := 0\n\tif 151 < len(data) {\n\t\tv151 = data[151]\n\t}\n\tv152 := 0\n\tif 152 < len(data) {\n\t\tv152 = data[152]\n\t}\n\tv153 := 0\n\tif 153 < len(data) {\n\t\tv153 = data[153]\n\t}\n\tv154 := 0\n\tif 154 < len(data) {\n\t\tv154 = data[154]\n\t}\n\tv155 := 0\n\tif 155 < len(data) {\n\t\tv155 = data[155]\n\t}\n\tv156 := 0\n\tif 156 < len(data) {\n\t\tv156 = data[156]\n\t}\n\tv157 := 0\n\tif 157 < len(data) {\n\t\tv157 = data[157]\n\t}\n\tv158 := 0\n\tif 158 < len(data) {\n\t\tv158 = data[158]\n\t}\n\tv159 := 0\n\tif 159 < len(data) {\n\t\tv159 = data[159]\n\t}\n\tv160 := 0\n\tif 160 < len(data) {\n\t\tv160 = data[160]\n\t}\n\tv161 := 0\n\tif 161 < len(data) {\n\t\tv161 = data[161]\n\t}\n\tv162 := 0\n\tif 162 < len(data) {\n\t\tv162 = data[162]\n\t}\n\tv163 := 0\n\tif 163 < len(data) {\n\t\tv163 = data[163]\n\t}\n\tv164 := 0\n\tif 164 < len(data) {\n\t\tv164 = data[164]\n\t}\n\tv165 := 0\n\tif 165 < len(data) {\n\t\tv165 = data[165]\n\t}\n\tv166 := 0\n\tif 166 < len(data) {\n\t\tv166 = data[166]\n\t}\n\tv167 := 0\n\tif 167 < len(data) {\n\t\tv167 = data[167]\n\t}\n\tv168 := 0\n\tif 168 < len(data) {\n\t\tv168 = data[168]\n\t}\n\tv169 := 0\n\tif 169 < len(data) {\n\t\tv169 = data[169]\n\t}\n\tv170 := 0\n\tif 170 < len(data) {\n\t\tv170 = data[170]\n\t}\n\tv171 := 0\n\tif 171 < len(data) {\n\t\tv171 = data[171]\n\t}\n\tv172 := 0\n\tif 172 < len(data) {\n\t\tv172 = data[172]\n\t}\n\tv173 := 0\n\tif 173 < len(data) {\n\t\tv173 = data[173]\n\t}\n\tv174 := 0\n\tif 174 < len(data) {\n\t\tv174 = data[174]\n\t}\n\tv175 := 0\n\tif 175 < len(data) {\n\t\tv175 = data[175]\n\t}\n\tv176 := 0\n\tif 176 < len(data) {\n\t\tv176 = data[176]\n\t}\n\tv177 := 0\n\tif 177 < len(data) {\n\t\tv177 = data[177]\n\t}\n\tv178 := 0\n\tif 178 < len(data) {\n\t\tv178 = data[178]\n\t}\n\tv179 := 0\n\tif 179 < len(data) {\n\t\tv179 = data[179]\n\t}\n\tv180 := 0\n\tif 180 < len(data) {\n\t\tv180 = data[180]\n\t}\n\tv181 := 0\n\tif 181 < len(data) {\n\t\tv181 = data[181]\n\t}\n\tv182 := 0\n\tif 182 < len(data) {\n\t\tv182 = data[182]\n\t}\n\tv183 := 0\n\tif 183 < len(data) {\n\t\tv183 = data[183]\n\t}\n\tv184 := 0\n\tif 184 < len(data) {\n\t\tv184 = data[184]\n\t}\n\tv185 := 0\n\tif 185 < len(data) {\n\t\tv185 = data[185]\n\t}\n\tv186 := 0\n\tif 186 < len(data) {\n\t\tv186 = data[186]\n\t}\n\tv187 := 0\n\tif 187 < len(data) {\n\t\tv187 = data[187]\n\t}\n\tv188 := 0\n\tif 188 < len(data) {\n\t\tv188 = data[188]\n\t}\n\tv189 := 0\n\tif 189 < len(data) {\n\t\tv189 = data[189]\n\t}\n\tv190 := 0\n\tif 190 < len(data) {\n\t\tv190 = data[190]\n\t}\n\tv191 := 0\n\tif 191 < len(data) {\n\t\tv191 = data[191]\n\t}\n\tv192 := 0\n\tif 192 < len(data) {\n\t\tv192 = data[192]\n\t}\n\tv193 := 0\n\tif 193 < len(data) {\n\t\tv193 = data[193]\n\t}\n\tv194 := 0\n\tif 194 < len(data) {\n\t\tv194 = data[194]\n\t}\n\tv195 := 0\n\tif 195 < len(data) {\n\t\tv195 = data[195]\n\t}\n\tv196 := 0\n\tif 196 < len(data) {\n\t\tv196 = data[196]\n\t}\n\tv197 := 0\n\tif 197 < len(data) {\n\t\tv197 = data[197]\n\t}\n\tv198 := 0\n\tif 198 < len(data) {\n\t\tv198 = data[198]\n\t}\n\tv199 := 0\n\tif 199 < len(data) {\n\t\tv199 = data[199]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 150 := 0 if 150 < len ( data ) { v 150 = data [ 150 ] } v 151 := 0 if 151 < len ( data ) { v 151 = data [ 151 ] } v 152 := 0 if 152 < len ( data ) { v 152 = data [ 152 ] } v 153 := 0 if 153 < len ( data ) { v 153 = data [ 153 ] } v 154 := 0 if 154 < len ( data ) { v 154 = data [ 154 ] } v 155 := 0 if 155 < len ( data ) { v 155 = data [ 155 ] } v 156 := 0 if 156 < len ( data ) { v 156 = data [ 156 ] } v 157 := 0 if 157 < len ( data ) { v 157 = data [ 157 ] } v 158 := 0 if 158 < len ( data ) { v 158 = data [ 158 ] } v 159 := 0 if 159 < len ( data ) { v 159 = data [ 159 ] } v 160 := 0 if 160 < len ( data ) { v 160 = data [ 160 ] } v 161 := 0 if 161 < len ( data ) { v 161 = data [ 161 ] } v 162 := 0 if 162 < len ( data ) { v 162 = data [ 162 ] } v 163 := 0 if 163 < len ( data ) { v 163 = data [ 163 ] } v 164 := 0 if 164 < len ( data ) { v 164 = data [ 164 ] } v 165 := 0 if 165 < len ( data ) { v 165 = data [ 165 ] } v 166 := 0 if 166 < len ( data ) { v 166 = data [ 166 ] } v 167 := 0 if 167 < len ( data ) { v 167 = data [ 167 ] } v 168 := 0 if 168 < len ( data ) { v 168 = data [ 168 ] } v 169 := 0 if 169 < len ( data ) { v 169 = data [ 169 ] } v 170 := 0 if 170 < len ( data ) { v 170 = data [ 170 ] } v 171 := 0 if 171 < len ( data ) { v 171 = data [ 171 ] } v 172 := 0 if 172 < len ( data ) { v 172 = data [ 172 ] } v 173 := 0 if 173 < len ( data ) { v 173 = data [ 173 ] } v 174 := 0 if 174 < len ( data ) { v 174 = data [ 174 ] } v 175 := 0 if 175 < len ( data ) { v 175 = data [ 175 ] } v 176 := 0 if 176 < len ( data ) { v 176 = data [ 176 ] } v 177 := 0 if 177 < len ( data ) { v 177 = data [ 177 ] } v 178 := 0 if 178 < len ( data ) { v 178 = data [ 178 ] } v 179 := 0 if 179 < len ( data ) { v 179 = data [ 179 ] } v 180 := 0 if 180 < len ( data ) { v 180 = data [ 180 ] } v 181 := 0 if 181 < len ( data ) { v 181 = data [ 181 ] } v 182 := 0 if 182 < len ( data ) { v 182 = data [ 182 ] } v 183 := 0 if 183 < len ( data ) { v 183 = data [ 183 ] } v 184 := 0 if 184 < len ( data ) { v 184 = data [ 184 ] } v 185 := 0 if 185 < len ( data ) { v 185 = data [ 185 ] } v 186 := 0 if 186 < len ( data ) { v 186 = data [ 186 ] } v 187 := 0 if 187 < len ( data ) { v 187 = data [ 187 ] } v 188 := 0 if 188 < len ( data ) { v 188 = data [ 188 ] } v 189 := 0 if 189 < len ( data ) { v 189 = data [ 189 ] } v 190 := 0 if 190 < len ( data ) { v 190 = data [ 190 ] } v 191 := 0 if 191 < len ( data ) { v 191 = data [ 191 ] } v 192 := 0 if 192 < len ( data ) { v 192 = data [ 192 ] } v 193 := 0 if 193 < len ( data ) { v 193 = data [ 193 ] } v 194 := 0 if 194 < len ( data ) { v 194 = data [ 194 ] } v 195 := 0 if 195 < len ( data ) { v 195 = data [ 195 ] } v 196 := 0 if 196 < len ( data ) { v 196 = data [ 196 ] } v 197 := 0 if 197 < len ( data ) { v 197 = data [ 197 ] } v 198 := 0 if 198 < len ( data ) { v 198 = data [ 198 ] } v 199 := 0 if 199 < len ( data ) { v 199 = data [ 199 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "438127626378632c03780d10603de32c",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 890,
|
||||
"line_start": 848,
|
||||
"symbol": "BigCompute [part 5/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv200 := 0\n\tif 200 < len(data) {\n\t\tv200 = data[200]\n\t}\n\tv201 := 0\n\tif 201 < len(data) {\n\t\tv201 = data[201]\n\t}\n\tv202 := 0\n\tif 202 < len(data) {\n\t\tv202 = data[202]\n\t}\n\tv203 := 0\n\tif 203 < len(data) {\n\t\tv203 = data[203]\n\t}\n\tv204 := 0\n\tif 204 < len(data) {\n\t\tv204 = data[204]\n\t}\n\tv205 := 0\n\tif 205 < len(data) {\n\t\tv205 = data[205]\n\t}\n\tv206 := 0\n\tif 206 < len(data) {\n\t\tv206 = data[206]\n\t}\n\tv207 := 0\n\tif 207 < len(data) {\n\t\tv207 = data[207]\n\t}\n\tv208 := 0\n\tif 208 < len(data) {\n\t\tv208 = data[208]\n\t}\n\tv209 := 0\n\tif 209 < len(data) {\n\t\tv209 = data[209]\n\t}\n\treturn len(data)\n}",
|
||||
"token_estimate": 191,
|
||||
"tokenized_korean_text": "} v 200 := 0 if 200 < len ( data ) { v 200 = data [ 200 ] } v 201 := 0 if 201 < len ( data ) { v 201 = data [ 201 ] } v 202 := 0 if 202 < len ( data ) { v 202 = data [ 202 ] } v 203 := 0 if 203 < len ( data ) { v 203 = data [ 203 ] } v 204 := 0 if 204 < len ( data ) { v 204 = data [ 204 ] } v 205 := 0 if 205 < len ( data ) { v 205 = data [ 205 ] } v 206 := 0 if 206 < len ( data ) { v 206 = data [ 206 ] } v 207 := 0 if 207 < len ( data ) { v 207 = data [ 207 ] } v 208 := 0 if 208 < len ( data ) { v 208 = data [ 208 ] } v 209 := 0 if 209 < len ( data ) { v 209 = data [ 209 ] } return len ( data ) }"
|
||||
}
|
||||
]
|
||||
178
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
178
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
33
crates/kebab-chunk/tests/fixtures/sample.c
vendored
Normal file
33
crates/kebab-chunk/tests/fixtures/sample.c
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define MAX_BUF 4096
|
||||
|
||||
typedef enum {
|
||||
OK = 0,
|
||||
ERR_PARSE,
|
||||
ERR_IO,
|
||||
} status_t;
|
||||
|
||||
typedef struct {
|
||||
int id;
|
||||
char name[64];
|
||||
status_t status;
|
||||
} record_t;
|
||||
|
||||
static int counter = 0;
|
||||
|
||||
int parse_record(const char *line, record_t *out) {
|
||||
if (line == NULL || out == NULL) return ERR_PARSE;
|
||||
return OK;
|
||||
}
|
||||
|
||||
void print_record(const record_t *r) {
|
||||
printf("[%d] %s (status=%d)\n", r->id, r->name, r->status);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
record_t r = { .id = 1, .name = "foo", .status = OK };
|
||||
print_record(&r);
|
||||
return 0;
|
||||
}
|
||||
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
40
crates/kebab-chunk/tests/fixtures/sample.cpp
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace kebab {
|
||||
namespace chunk {
|
||||
|
||||
class MdHeadingV1Chunker {
|
||||
public:
|
||||
MdHeadingV1Chunker() = default;
|
||||
~MdHeadingV1Chunker() = default;
|
||||
|
||||
std::string chunk_doc(const std::string& doc) {
|
||||
return doc;
|
||||
}
|
||||
|
||||
int operator()(int x) const {
|
||||
return x * 2;
|
||||
}
|
||||
|
||||
private:
|
||||
int counter_ = 0;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
T identity(T value) {
|
||||
return value;
|
||||
}
|
||||
|
||||
} // namespace chunk
|
||||
|
||||
void global_helper() {
|
||||
// free function in kebab namespace
|
||||
}
|
||||
|
||||
} // namespace kebab
|
||||
|
||||
int main() {
|
||||
kebab::chunk::MdHeadingV1Chunker c;
|
||||
return 0;
|
||||
}
|
||||
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
5
crates/kebab-chunk/tests/fixtures/sample.dockerfile
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
FROM rust:1.94-slim AS builder
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN cargo build --release
|
||||
CMD ["/app/target/release/kebab"]
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user