Compare commits
666 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 403e162ac0 | |||
| fdf09c369c | |||
| e7b58017fd | |||
| 90812e981f | |||
| 15e6918cef | |||
| a8ec354188 | |||
| 2686a4f27d | |||
| 25e94feab8 | |||
| 7b7330cdf2 | |||
| 3d45994693 | |||
| d5c69f6715 | |||
| 148c8b7040 | |||
| 898cdaa043 | |||
| 01a17acd3f | |||
| f3a7222ec5 | |||
| 3d5bb599e3 | |||
| 375a0693e4 | |||
| 8cc4e6d563 | |||
| 901416d8e9 | |||
| b706e3e88c | |||
| 8f8d3a4100 | |||
| 75a543ff69 | |||
| a283e56c5c | |||
| 47ef6532f7 | |||
| 03b0745e9d | |||
| e7cb20990a | |||
| bebf6e4ac7 | |||
| 736d791056 | |||
| 6c9c8df43e | |||
| 0263667684 | |||
| 4918983d9c | |||
| aeaa18a564 | |||
| c91ff909ce | |||
| 8dee610a97 | |||
| d71ed2516b | |||
| 095c9f37a2 | |||
| 16ddb1dfc3 | |||
| 72c99c452c | |||
| cbcae69abf | |||
| 7505645008 | |||
| e2ae9a4589 | |||
| 1dfab6dfc5 | |||
| fc5103642e | |||
| e03d03cb26 | |||
| 16aadea222 | |||
| a48c405826 | |||
| 21e02d8a93 | |||
| a64c31ee94 | |||
| ec96648956 | |||
| ecaf224381 | |||
| b1c5feb3f3 | |||
| ca8c0645fb | |||
| c7af6612b7 | |||
| acb4fa6c65 | |||
| 8bfa4ba76e | |||
| ad0ccf4ccf | |||
| b351523e51 | |||
| a48b055358 | |||
| 581e1d5d55 | |||
| c17d6e67a8 | |||
| af8fd34716 | |||
| 369aeb3d24 | |||
| 99f8cfa691 | |||
| d85d7348a5 | |||
| edac3ae737 | |||
| 6ec4e6809f | |||
| 1011c75fff | |||
| 8f7b6ee538 | |||
| 76841af7d3 | |||
| 980e20fd8d | |||
| cd79ed326c | |||
| 9dbf9d781d | |||
| 9501edd82b | |||
| 4b4a4c0b32 | |||
| f2cc325cf3 | |||
| b7e022a5e3 | |||
| bd7c4fd7ef | |||
| 4dcb4a45d6 | |||
| 6d86214060 | |||
| 6bbb8f854b | |||
| 2a4df4d48d | |||
| 16f3d6eef2 | |||
| fa89c7b561 | |||
| a4c81fed86 | |||
| 5b7c02fe13 | |||
| 88c5b83dea | |||
| 2619b7bff7 | |||
| e9b520216e | |||
| a8fd76499c | |||
| 0282a81c67 | |||
| f3587b7143 | |||
| 483b1ec06b | |||
| d279f343e7 | |||
| b56469f010 | |||
| 6ba8cb2c88 | |||
| afa8af0f88 | |||
| b9d20d23d1 | |||
| 86b4e1ebd0 | |||
| 825543549d | |||
| bcb8b93751 | |||
| 116b3e6377 | |||
| 69b53d1c97 | |||
| a271352e33 | |||
| cde4d75f6b | |||
| bddcd53688 | |||
| 2a207f9868 | |||
| cc31868d24 | |||
| 0df47febf0 | |||
| b12a616ab2 | |||
| 848b75c069 | |||
| 467a974901 | |||
| 098413922b | |||
| 695010ea7a | |||
| 8bb7c276d0 | |||
| 01a03463a6 | |||
| b6ad947378 | |||
| 1529e6d991 | |||
| 5ad1f98227 | |||
| a58cae2ff3 | |||
| 7a1dff1684 | |||
| 0988f66331 | |||
| 82e02aa4fe | |||
| db4af0cc72 | |||
| ab20202241 | |||
| a51e6395c0 | |||
| fe4c854673 | |||
| 1de3f4ffca | |||
| 7fbfec647d | |||
| ca8c83b1ba | |||
| 6c611990d8 | |||
| 166b1404e4 | |||
| 2d0168b7ab | |||
| 4afcaf96d2 | |||
| 16c4579399 | |||
| 40d7faee71 | |||
| a3bb2580bf | |||
| 2429189447 | |||
| d93b757cf1 | |||
| 571996938c | |||
| be79bdb83d | |||
| 4e76f103c1 | |||
| 4fd672193f | |||
| 1454321b12 | |||
| 649ec35108 | |||
| dece5e89fc | |||
| 3cb49f1f9b | |||
| f5ff823984 | |||
| b82eaec21a | |||
| 6daa43375b | |||
| 85efeeca3e | |||
| 2b4ba8e104 | |||
| b08941d6ab | |||
| 6bf4e82e62 | |||
| a0c7fa3d1a | |||
| ebc6bf45c4 | |||
| d8fdc815be | |||
| 9f2a56d091 | |||
| fe20be8195 | |||
| 028d9ad4ea | |||
| a3513c9110 | |||
| f2a76cfe94 | |||
| 8c56ef3010 | |||
| 5d9ea588ed | |||
| 53ec9b4dc5 | |||
| 21b52bc285 | |||
| 97fd895a10 | |||
| d13eb87401 | |||
| 26f3a7756c | |||
| 881f949fcb | |||
| c5de5f812b | |||
| f94e0c4a9b | |||
| 923b959610 | |||
| b63af20b72 | |||
| e8f44a57e3 | |||
| 4b4a8cbb3a | |||
| 4dc1c10be1 | |||
| bd86f61c9c | |||
| b134ae9dd5 | |||
| 597d8b70ad | |||
| b106120e93 | |||
| 43366b1b15 | |||
| 70507e94ca | |||
| 7bbdc89ae3 | |||
| 7c24734cc7 | |||
| 9a36a06f97 | |||
| 35c987df1c | |||
| d9ec7b8dc3 | |||
| 4e451c9f7c | |||
| 6482bf1321 | |||
| 5977c8cdf1 | |||
| 89d334a92b | |||
| 09333d0b05 | |||
| 685007789a | |||
| 445b096215 | |||
| 415227bf76 | |||
| f9dc0f749f | |||
| bef0c98867 | |||
| f8a4c79727 | |||
| f60304beb4 | |||
| 6a9551e0fa | |||
| 46e99470eb | |||
| 9b44e27dfe | |||
| 854a180365 | |||
| 5bba95fd71 | |||
| 2c7fa7142a | |||
| d9c7aabce1 | |||
| 10b0e2f4f2 | |||
| 28f513795e | |||
| 760eee89c8 | |||
| f763049923 | |||
| 8cf73d1f43 | |||
| a58ee10dfb | |||
| e674ff474b | |||
| 241ded59df | |||
| 436fd015a2 | |||
| d9acda517a | |||
| b4d9e60816 | |||
| 90726ab283 | |||
| 1d4e301e5e | |||
| 48197687b7 | |||
| c9e05941c5 | |||
| 4c5ccd5447 | |||
| b9ee09f176 | |||
| 4672cba6c6 | |||
| fd918a60ce | |||
| 9f003ef1cd | |||
| 8d81bc1071 | |||
| c2cd3a7ab7 | |||
| fb3952d54f | |||
| aeeff3635b | |||
| 9d7faab650 | |||
| bcd1e37dab | |||
| e7a4330798 | |||
| 574e1b1ca1 | |||
| c1e82cca92 | |||
| 2c05dbd0dd | |||
| 96766406aa | |||
| 710945c4b0 | |||
| d4395a306b | |||
| bd48baa19a | |||
| b02ac8200e | |||
| 336962715a | |||
| 1a224bf983 | |||
| a210bf5d52 | |||
| 429287f6cb | |||
| 08495eb425 | |||
| 98cf4e8a04 | |||
| 4030f04f37 | |||
| 7c27633df2 | |||
| 3712d005cc | |||
| 7c85de065a | |||
| a0ccc7b021 | |||
| a8fd6994d2 | |||
| 505b3889fb | |||
| 772575d8f0 | |||
| 00ffe9c792 | |||
| 681c48b2a3 | |||
| 546c1564b0 | |||
| 79ad6e376f | |||
| 6ffbe0a5a3 | |||
| ab3408cb49 | |||
| b807fd5aa5 | |||
| 93436f9eca | |||
| 11ce7847a1 | |||
| 1d88dccf8a | |||
| 1eb0bbecb3 | |||
| 44fbffff26 | |||
| 63aece3ea1 | |||
| 28a8bbeace | |||
| 52a97303dc | |||
| 71fb2cbcb3 | |||
| 85855ef596 | |||
| da25ce330b | |||
| 5bfea3c28b | |||
| b6756f8ce3 | |||
| 016f380428 | |||
| bf28a1e4d9 | |||
| 24221826ed | |||
| 8a2f7affa6 | |||
| f28a422f79 | |||
| c56242d04f | |||
| 17c48a0ee6 | |||
| 64a009314c | |||
| ddfe7ba099 | |||
| 104363a0db | |||
| 6188a50c1c | |||
| 94e6146013 | |||
| 12c7dc9efb | |||
| cd1d4fb807 | |||
| 7150c376bb | |||
| 6280abf2df | |||
| 192da45dbf | |||
| cf35f36f88 | |||
| ed34f2e03f | |||
| 624b44c46b | |||
| caf690dc72 | |||
| 1640ecf288 | |||
| 90e77631a8 | |||
| fa251db48f | |||
| 3114c31841 | |||
| 271329efbd | |||
| f2867540d2 | |||
| e118844256 | |||
| 41c5edc517 | |||
| d02149c010 | |||
| 0c69b9621b | |||
| 0d69d85757 | |||
| a67300317b | |||
| abb05ebc23 | |||
| 26fdc4f344 | |||
| 3f5e0e6e90 | |||
| 578a60e3bb | |||
| 64f518e08e | |||
| fa9f91ead4 | |||
| 9ee89c2a94 | |||
| 13a3361ba2 | |||
| 0def913abd | |||
| ff9d5f5f86 | |||
| 70a5068c0d | |||
| 93ddece111 | |||
| 67559fb3ce | |||
| d79e432916 | |||
| 0ee18149e7 | |||
| 8a68289499 | |||
| 6ac7fea7b9 | |||
| fe123c0c6d | |||
| 753b1ff5e5 | |||
| 8dcedc4b11 | |||
| 8781c6112b | |||
| 14197b5e02 | |||
| 584247f1ea | |||
| a0c0dca321 | |||
| 667495ae6a | |||
| 08d72a12e0 | |||
| 1969c8e3b5 | |||
| c6207d196e | |||
| 840c6c40a6 | |||
| b81574afa9 | |||
| 6beff35a2f | |||
| 75a4207aa1 | |||
| 86aa180ad7 | |||
| 802c573c07 | |||
| 438870ee25 | |||
| 192835e5bf | |||
| 1034de25a2 | |||
| d1560be80d | |||
| b2a2902e38 | |||
| 03cd41c48f | |||
| 926042049c | |||
| e0a29225da | |||
| b541567946 | |||
| a58d400abd | |||
| 8add684ffc | |||
| 7a90df1485 | |||
| 46f408dc0f | |||
| 49e60fb314 | |||
| 6bc7a83d3c | |||
| df3c5b8caf | |||
| 5051ea7534 | |||
| 88d7fbc182 | |||
| 0b7d8af759 | |||
| 9342b9543f | |||
| a8aa03042f | |||
| 9d4a60aac5 | |||
| 8ce7a911ee | |||
| 75c1c7b911 | |||
| b5c12ecb6f | |||
| a1192ce3b2 | |||
| 17ee400fd5 | |||
| 217dddb4ba | |||
| 308666dbd5 | |||
| 522ae7b8bc | |||
| 166e1ddfaf | |||
| 226ce8b744 | |||
| 22d4161728 | |||
| 51004ac593 | |||
| 8996e73282 | |||
| 22dba09857 | |||
| aaa90b1754 | |||
| 077f92f41e | |||
| 5ce7f60932 | |||
| 47857b2622 | |||
| 1e4cff879b | |||
| 2d7a566624 | |||
| 813bdd1a16 | |||
| ff1bedbef5 | |||
| 30e03c7a12 | |||
| 2ce6ae47c5 | |||
| ebc4ef2eea | |||
| 7bda1509b7 | |||
| 61d48d67a3 | |||
| f4c840b994 | |||
| 15244b7494 | |||
| a7f7ab9f93 | |||
| 1b19e33a4f | |||
| 9c9e391b15 | |||
| f95cd55484 | |||
| ab288135e9 | |||
| c19aa006d0 | |||
| f1a4f67e12 | |||
| 6463c52827 | |||
| 2559d0d95a | |||
| 4524830306 | |||
| 8cdd3903c7 | |||
| 8b89961ada | |||
| eec90996aa | |||
| ce1c778b4a | |||
| 453ec15df4 | |||
| 1e6de9fe9f | |||
| 9fa2a1ebac | |||
| 749c6ae240 | |||
| 5f2bd9e97e | |||
| 1ce06c1e2d | |||
| d26efe167f | |||
| d6d165df01 | |||
| 2baa846c6b | |||
| 27baec82ea | |||
| acf8cf3be2 | |||
| ea5f7b22c8 | |||
| 5497c6e7b5 | |||
| 5a90940f1c | |||
| 4389b887f0 | |||
| 360f825f3a | |||
| 641b92af7d | |||
| 08fb743598 | |||
| 0a2a7ae214 | |||
| 803d02b68b | |||
| 4e8b84c4e0 | |||
| 16dc02cfa2 | |||
| 74f1b0571b | |||
| 918ee6c0be | |||
| 68ada396f3 | |||
| 23c4ad97b9 | |||
| 1f566b8bfa | |||
| 26562588e3 | |||
| 4503b5b12f | |||
| 44813df052 | |||
| d6bb6cfd3b | |||
| d53995a6d4 | |||
| c215034653 | |||
| 31245a4328 | |||
| acb61b6830 | |||
| 20feb3133e | |||
| de63f161ac | |||
| 1815091247 | |||
| 6a0b340941 | |||
| 9664e97497 | |||
| 8bdb3e8090 | |||
| dcad9ccda2 | |||
| ed0f4769b3 | |||
| 0c61758931 | |||
| 39b766ea59 | |||
| 7f287abacb | |||
| d715631928 | |||
| 73e5b359d8 | |||
| c780aca904 | |||
| b1d5047399 | |||
| 80c2d31fb3 | |||
| 97e9f558f4 | |||
| da51e59081 | |||
| 11a0fc758f | |||
| b5d1fe8c1e | |||
| 580576c2c6 | |||
| 808b92a6c5 | |||
| c74f8d269e | |||
| df85bafa7f | |||
| a93b33ffbe | |||
| 402a4506a2 | |||
| a531dc37dc | |||
| 7a6a24ad10 | |||
| 42712b50c2 | |||
| 9f3edb7e24 | |||
| 5c265bb59f | |||
| a08ed32199 | |||
| 9362cd0aae | |||
|
|
7961f8813d | ||
|
|
7bbd2c0cbf | ||
|
|
d13f58d28a | ||
|
|
298f4adc81 | ||
|
|
4e8b70a04b | ||
|
|
682f7dd3a2 | ||
|
|
40b3ea8408 | ||
|
|
9fce24b106 | ||
|
|
8bbe25dc10 | ||
|
|
abfdcbd31d | ||
|
|
69d1593bc5 | ||
|
|
2a8451c033 | ||
|
|
ff11f81f7f | ||
|
|
bf4ebf8d2a | ||
|
|
351c7a0826 | ||
|
|
7329ba96ee | ||
|
|
fa4eeb5a87 | ||
|
|
3b1e878aed | ||
|
|
005a9011ea | ||
|
|
c6d61b0b37 | ||
|
|
49487dc46b | ||
| 2c2bf9bac5 | |||
| 72798bd3ff | |||
|
|
c3177561b9 | ||
| a465b71f99 | |||
|
|
787007172a | ||
|
|
b954e9ce66 | ||
|
|
c62a8ff503 | ||
|
|
69c94b6692 | ||
|
|
d5321701ea | ||
|
|
2c3461c465 | ||
| 240120ee80 | |||
|
|
5870a1de15 | ||
|
|
f00fb376fe | ||
|
|
bb0ec0469f | ||
|
|
f303c76f52 | ||
|
|
cd5b1e3bfc | ||
|
|
7c6c2e8102 | ||
| 3a9a52326d | |||
|
|
b53376e96e | ||
|
|
441f1192ee | ||
|
|
e8da415624 | ||
|
|
d8e5f35601 | ||
|
|
6ab0d782ef | ||
|
|
2bbe94eb05 | ||
|
|
9ac13fa256 | ||
|
|
67f2c16cc2 | ||
|
|
1ebbd6b711 | ||
|
|
892175d009 | ||
|
|
de9016fe16 | ||
|
|
35df15df99 | ||
| b0becf43b8 | |||
| 21ecbb00d4 | |||
|
|
8cd21e8342 | ||
|
|
b35f163f56 | ||
|
|
600c6182fc | ||
|
|
0e8b800b6b | ||
|
|
126559ce7a | ||
|
|
137fc4ee31 | ||
|
|
59f01f8185 | ||
|
|
9f70681b77 | ||
|
|
6d6eb442be | ||
|
|
28d3250546 | ||
| 945319ae93 | |||
|
|
c864bd007f | ||
|
|
67aee9f480 | ||
|
|
4440fa6659 | ||
|
|
b51cdb9e8f | ||
|
|
4e739f3cd8 | ||
|
|
3a621bba0d | ||
|
|
3c605b1a5d | ||
|
|
56f20b7235 | ||
|
|
0359bd9682 | ||
| cf3acfc136 | |||
|
|
668e1174cc | ||
| 745a75a82b | |||
|
|
6a33d08aea | ||
|
|
a40593590b | ||
|
|
5687cbc0e2 | ||
|
|
653e432a30 | ||
|
|
f7e2072d66 | ||
|
|
72c227af23 | ||
|
|
69037c313a | ||
|
|
6a067e3ab1 | ||
|
|
231d80e82d | ||
|
|
69c6e23432 | ||
|
|
1e943f21dc | ||
|
|
fb31befef1 | ||
|
|
5f6b2fa259 | ||
| a0497d9c53 | |||
|
|
b221686133 | ||
| a72c6f307c | |||
|
|
84287d0ef6 | ||
|
|
6e7446861b | ||
|
|
b06f4654e7 | ||
|
|
4e0379c04f | ||
|
|
6a18847892 | ||
|
|
c6cc1e2bfe | ||
|
|
86475e5ba2 | ||
|
|
2c80e2ad91 | ||
|
|
d3f38c76e9 | ||
|
|
31c1e05951 | ||
|
|
7210386699 | ||
| a7115be699 | |||
|
|
b86b763dfb | ||
|
|
7dddc1d706 | ||
|
|
2a6b3dc7e6 | ||
|
|
8d8f1c0294 | ||
|
|
77bf19566c | ||
|
|
beb40249a3 | ||
|
|
0fffd69071 | ||
|
|
1b9d89eb3a | ||
|
|
7d1f855f7e | ||
|
|
610d29f053 | ||
|
|
75eeae3933 | ||
|
|
9653592c16 | ||
|
|
353aa5cc78 | ||
|
|
4eda9c317d | ||
| 9817a3de59 | |||
|
|
e084b306e5 | ||
|
|
f485608108 | ||
|
|
9f076003e2 | ||
|
|
e1fcea6313 | ||
|
|
5e0cff1b92 | ||
|
|
603061fb86 | ||
|
|
21220f6d39 | ||
|
|
f25ad31741 | ||
|
|
af80cedd81 | ||
|
|
aabe66f5e2 | ||
|
|
ebbc3a46ae | ||
|
|
e00418537f | ||
|
|
dbb7b54d5d | ||
|
|
a80f65c6f2 | ||
| a9ff122ab2 | |||
|
|
225831ffcd | ||
|
|
a082b78f8e | ||
|
|
e1c6b7055a | ||
|
|
39bf0de949 | ||
|
|
29629e6786 | ||
|
|
e8caf2a57e | ||
|
|
e5c99f5b80 | ||
|
|
307fd8d527 | ||
|
|
31475f0312 | ||
|
|
0ca9b1d5c3 | ||
|
|
4949775c8b | ||
| 877ad18f34 | |||
|
|
df42d8f621 | ||
| 6a01f15261 | |||
|
|
cb04bd8c8d | ||
|
|
efc6b7ebb0 | ||
|
|
1008bca342 | ||
|
|
1f39b6bc2c | ||
|
|
aeee7ed771 | ||
|
|
15cdc97cae | ||
|
|
cc41adabb5 | ||
|
|
16db60f7bd | ||
|
|
e398272a24 | ||
|
|
e891e487cf | ||
|
|
dfef65f196 | ||
|
|
8faad2f407 | ||
|
|
f4ce6652b2 | ||
|
|
922849cd95 | ||
|
|
3a7a28e682 | ||
|
|
8b0f64db6b | ||
|
|
4728a87957 | ||
|
|
401a47fb43 | ||
| 6d4a648349 | |||
|
|
b20c1dd56a | ||
| 834a1e1723 | |||
|
|
3328760dca | ||
| f25e16f80c | |||
| 4475abbf4f | |||
|
|
5be90cffec | ||
| 6f0b2bcc37 | |||
|
|
36fe7416c8 | ||
| d6e2e6273e | |||
|
|
cb266e0071 | ||
|
|
ee15528acf | ||
| e03b754a16 | |||
|
|
6b13d8e11f | ||
| fea91d5c99 | |||
|
|
0e762e6374 | ||
|
|
b230fbb495 | ||
|
|
afbd64dafc | ||
|
|
6bedba4a7f | ||
|
|
fd4125c0a0 | ||
|
|
4191347491 | ||
|
|
dd33902f5a | ||
|
|
c8a8bc9045 | ||
|
|
2de28c43da | ||
|
|
9d96504bd9 |
6
.gitattributes
vendored
Normal file
6
.gitattributes
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
# PP-OCRv5 ONNX OCR models (paddle-onnx engine). git-lfs is not installed on
|
||||
# this host, so they are committed as plain binary blobs (`-text` turns off
|
||||
# line-ending normalization). If/when git-lfs becomes available, migrate with
|
||||
# `git lfs migrate import --include='*.onnx'` and restore the filter line:
|
||||
# *.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx -text
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
.superpowers/
|
||||
.worktrees/
|
||||
.claude/
|
||||
/target/
|
||||
.omc/
|
||||
/target
|
||||
**/*.rs.bk
|
||||
Cargo.lock.bak
|
||||
|
||||
68
CLAUDE.md
68
CLAUDE.md
@@ -27,7 +27,7 @@ cargo build --release # produces target/release/kebab
|
||||
|
||||
`-j 1` for the full workspace test isn't optional: 18 integration-test binaries each link `lance` + `datafusion` + `arrow` + `tantivy` and the parallel link step exhausts memory (linker gets SIGKILL'd, build silently fails partway). Per-crate runs are fine in parallel.
|
||||
|
||||
`target/` is 6–10 GB after a fresh build (DataFusion + Lance + fastembed + 18 × test-binary debug info). The dev/test profile is already trimmed (`debug = "line-tables-only"`, `split-debuginfo = "unpacked"` — see workspace `Cargo.toml`). Run `cargo clean` after phase merges if disk pressure shows up; backtraces still resolve to function + line.
|
||||
`target/` is 6–10 GB after a fresh build but **balloons to 90+ GB after a few task cycles** (each fb-* batch adds incremental compile artifacts on top of the existing 18 × test-binary debug info). The dev/test profile is already trimmed (`debug = "line-tables-only"`, `split-debuginfo = "unpacked"` — see workspace `Cargo.toml`). Run `cargo clean` **routinely after each merged PR**, not just "if pressure shows up" — disk space is tight and recovery via `cargo clean` is cheap (one re-link per crate on next build). Verified pattern: 92 GB → 0 GB in seconds, backtraces still resolve to function + line.
|
||||
|
||||
## The facade rule
|
||||
|
||||
@@ -60,7 +60,7 @@ Read the relevant task spec's deps section before adding an import. New crates i
|
||||
|
||||
## Wire schema v1
|
||||
|
||||
All `--json` output carries a `schema_version` field. Current schemas: `ingest_report.v1`, `ingest_progress.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`, `reset_report.v1`, `eval_run.v1`, `eval_compare.v1`, `list_docs.v1`, `schema.v1`, `error.v1`. Schemas live in `docs/wire-schema/v1/`. The wire shape is the contract for external integrations (Claude Code skills, MCP, etc.); breaking it requires a `*.v2` major bump and parallel-running both for one phase. In `--json` mode, fatal errors emit `error.v1` to stderr as ndjson (non-`--json` mode keeps plain stderr text); exit codes 0/1/2/3 are unchanged — `error.v1.code` provides fine-grained agent branching.
|
||||
All `--json` output carries a `schema_version` field. Current schemas: `ingest_report.v1`, `ingest_progress.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`, `reset_report.v1`, `schema.v1`, `error.v1`, `chunk_inspection.v1`, `citation.v1`, `doc_summary.v1`. Schemas live in `docs/wire-schema/v1/`. The wire shape is the contract for external integrations (Claude Code skills, MCP, etc.); breaking it requires a `*.v2` major bump and parallel-running both for one phase. In `--json` mode, fatal errors emit `error.v1` to stderr as ndjson (non-`--json` mode keeps plain stderr text); exit codes 0/1/2/3 are unchanged — `error.v1.code` provides fine-grained agent branching.
|
||||
|
||||
In-tree integration packages live under `integrations/<host>/` — currently `integrations/claude-code/kebab/` (a Claude Code skill that calls `kebab search --json` / `kebab ask --json`). Any wire schema major bump (v1→v2) MUST update each shipped integration in the same PR, same as the version-cascade rule below. Per-user trigger keywords (team / system / acronym) belong in the user's local copy of the skill, not in the repo-shipped frontmatter — keep `integrations/claude-code/kebab/SKILL.md`'s `description` generic.
|
||||
|
||||
@@ -81,11 +81,71 @@ Bump 자체는 단순 minor / patch 한 줄 수정 (`Cargo.toml` workspace `vers
|
||||
Release 절차:
|
||||
|
||||
1. `gitea-release v<X.Y.Z>` (gitea-ops skill) 으로 tag + push + release notes.
|
||||
2. release notes 는 사용자 도그푸딩에 영향 가는 surface 변경 위주 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등.
|
||||
3. 프리-1.0 (`0.x.y`) 단계: minor bump 시 wire schema additive / surface 변경 누적, patch bump 시 bug fix only.
|
||||
2. release notes 는 사용자 도그푸딩에 영향이 가는 surface 변경을 위주로 — wire schema 추가, CLI flag 신규, TUI 키 변경, V00X migration 등 — 다룬다. 이때 추가된 기능과 변경사항은 유저가 이해할 수 있도록 친절하고 자세하게 풀어서 설명해야 하며, 단순히 commit subject 를 나열하는 형태로 끝내면 안 된다. 필요하다면 도그푸딩이나 테스트 결과도 함께 적어 둔다.
|
||||
3. 프리-1.0 (`0.x.y`) 단계 bump 규칙 — **기능(behavior) 또는 인터페이스(interface) 변경 여부**로 판정:
|
||||
- **minor bump** (`0.x.0`): 기능 또는 인터페이스에 *실질적* 변경이 있을 때. 인터페이스 = 신규/변경/삭제된 CLI subcommand·flag, config 키, wire schema 의 **breaking** 변경, 임베딩/검색/RAG 등 사용자가 받는 **결과·동작**의 변화, V00X migration, frozen 설계 변경. 기능 = 새 source 형식·검색 모드·백엔드 등 *할 수 있는 일*의 추가/변경.
|
||||
- **patch bump** (`0.x.y`): 기능·인터페이스 변경이 **없을** 때. bug fix, 내부 refactor, 성능 개선, 로깅/진행표시 등 **관측성(observability) 개선**, **additive-only wire 변경**(backward-compat 신규 필드/이벤트라 기존 소비자 무영향), 문서. ← 즉 "결과가 같고 새 명령/플래그/config 도 없으면 patch".
|
||||
- 경계 예: 진행 로그에 phase/파일명 추가 + additive wire 이벤트(asset_phase) = **patch** (검색·색인 결과 불변, 새 명령/플래그/config 없음). arctic 임베더 provider + 신규 config 키 = **minor** (인터페이스 추가). 별칭 기능 제거 + migration = **minor** (동작·인터페이스 변경).
|
||||
|
||||
**bump 시점 = release 시점 같은 commit**. 즉 commit `chore: bump version 0.x → 0.y` 직후 같은 commit 에 tag. v0.1.0 (`2319206`) 처럼 bump 없이 tag 만 찍는 패턴은 후속 release 가 대상 commit 을 헷갈리게 함 — pre-release snapshot 은 SHA reference 로 충분.
|
||||
|
||||
## Dogfood trigger
|
||||
|
||||
도그푸딩 = 새 binary 를 실제 KB / 실제 query 로 돌려보고 user-visible 동작이 spec 의 의도와 일치하는지 확인하는 종단 검증. unit / integration test 가 못 잡는 회귀 (UX 어색함, performance regression, 의외의 token 처리, embedding drift, RAG hallucination) 를 catch 함. PR 머지 전 또는 머지 직후 release notes 작성 전에 실시.
|
||||
|
||||
### 도그푸딩이 필요한 시점
|
||||
|
||||
다음 트리거 중 하나라도 hit 시 도그푸딩 필수. **모두 release-level 또는 user-visible behavior 변경임**.
|
||||
|
||||
**Schema / migration**:
|
||||
- 신규 V00X migration (예: V007 trigram, V008 OCR mirror, V009 morphological) — `corpus_revision` cascade + auto-backfill 정책의 사용자 경험 확인.
|
||||
- frozen design contract 변경 (`docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §X 갱신) — verbatim CI diff-check 외의 user-visible side effect 확인.
|
||||
|
||||
**Wire schema / CLI surface**:
|
||||
- 신규 `--json` 필드, exit code 변경, 또는 schema major bump (v1 → v2) — agent / external integration 의 호환성 검증.
|
||||
- `kebab` 의 subcommand 또는 flag 추가/삭제/rename — agent skill / muscle memory 영향.
|
||||
|
||||
**Search / RAG behavior**:
|
||||
- FTS5 tokenizer / chunker / embedder 모델 / RAG prompt template 변경 — 같은 query 의 hit ordering, snippet, RAG citation 패턴이 자연스럽게 변화하는지.
|
||||
- score gate, RRF fusion ratio, NLI threshold 같은 ranking 파라미터 default 변경.
|
||||
|
||||
**Performance**:
|
||||
- ingest / search / ask latency 의 의도된 변화 (예: lindera tokenize, OCR 추가, multi-hop RAG) — actual wall-clock 측정 + release notes 에 명시.
|
||||
- 대용량 KB (수천 doc / 만 chunk) 의 first-boot eager backfill 시간이 사용자에게 hang 으로 인지될 만큼 길어지지 않는지.
|
||||
|
||||
**Language / locale**:
|
||||
- 한국어 / 일본어 / 중국어 lexical 동작 변경 (V007 trigram, V009 morphological, future N-gram).
|
||||
- 영어 substring 매칭 같은 ad-hoc 부산물의 회귀.
|
||||
|
||||
**File / asset surface**:
|
||||
- 신규 source 형식 (PDF OCR, audio, video) — extractor / chunker 의 실제 corpus 동작.
|
||||
- `.kebabignore` / `_external/` 같은 workspace 정책 변경.
|
||||
|
||||
**Release-level**: 위 트리거 중 하나가 hit 되어 `Cargo.toml` workspace `version` bump 가 필요하면, **bump commit 이전에 도그푸딩 evidence 가 HOTFIXES + release notes 에 명시** 되어 있어야 함. evidence 없는 release 는 사용자가 "왜 bump 했는지" 추적 불가.
|
||||
|
||||
### 도그푸딩 데이터 보관소
|
||||
|
||||
모든 도그푸딩 source 문서 + KB state + 로그는 `/build/dogfood/` 한 디렉토리에 누적 보관한다. **분류는 문서 의미 / 종류 / 형식 기준만** — kebab version, 생성 시점, scenario name 같은 prefix 금지 (`v0.20.1-dogfood/`, `dogfood-v018/` 같은 디렉토리 신설 X). 자세한 layout 은 `/build/dogfood/README.md` 참조.
|
||||
|
||||
- `/build/dogfood/corpus/` — source 문서 (read-only). format 별 분류 (`markdown/`, `code/`, `html/`, `images/`, `pdf/`, `manifest/`, `resources/`) + 각 format 내 category 별 (예: `markdown/{korean,english,bilingual,tech-docs,coding-md-corpus,topics,notes,edge-cases}`, `code/{rust,python,...}`). 새 fixture 는 적절한 category subdir 에 추가.
|
||||
- `/build/dogfood/kb/` — 도그푸딩 run 의 KB 출력 (SQLite + LanceDB + assets + models). 매 run 마다 reset 가능. 별 KB 디렉토리 신설 X.
|
||||
- `/build/dogfood/logs/` — 누적 실행 로그 (ndjson + stderr + summary).
|
||||
- `/build/dogfood/config.toml` — canonical 도그푸딩 config (없으면 `kebab init` 후 path override).
|
||||
- `/build/dogfood/_archive/` — regeneratable stale state (이전 run 의 sqlite/lancedb, XDG snapshot). 디스크 압박 시 wipe 가능.
|
||||
|
||||
`/tmp/kebab-smoke/`, `/tmp/kebab-*`, `/build/cache/dogfood*`, `/home/altair823/KnowledgeBase`, `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.local/state/kebab/` 같은 위치 신규 사용 금지 — 모두 `/build/dogfood/` 로 일관. ad-hoc fixture 가 필요하면 `corpus/<format>/<category>/` 에 추가.
|
||||
|
||||
### 도그푸딩 결과 기록
|
||||
|
||||
도그푸딩 evidence 는 두 곳에 cascade:
|
||||
|
||||
1. **`tasks/HOTFIXES.md` 의 dated entry** — 시나리오 별 hit count 표 + snippet evidence + known limitation. 미래에 spec drift 의심 시 git history 외 immediate reference 가 됨.
|
||||
2. **`docs/release-notes/v<X.Y.Z>-draft.md`** (또는 gitea release body) — 사용자 도그푸딩에 영향이 가는 surface 변경을 4 단락 (변경 사실 / trade-off / mitigation / upgrade 절차) 으로 풀어서 설명. evidence link.
|
||||
|
||||
도그푸딩 단계에서 *발견된 bug* (spec 과 실제 동작의 mismatch, performance regression, UX 어색함) 는 즉시 fix → re-dogfood. fix 가 별 PR 으로 빠지면 머지 후 HOTFIXES 에 dated entry.
|
||||
|
||||
DOGFOOD scenario catalog (§1~§13) 는 `docs/DOGFOOD.md`. 신규 release 마다 §관련 section 의 scenario list 갱신 + 신규 scenario 추가.
|
||||
|
||||
## Naming + paths
|
||||
|
||||
- Crate prefix: `kebab-` (kebab-case package, `kebab_` snake_case in Rust modules).
|
||||
|
||||
2242
Cargo.lock
generated
2242
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
135
Cargo.toml
135
Cargo.toml
@@ -2,17 +2,17 @@
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/kebab-core",
|
||||
"crates/kebab-parse-types",
|
||||
"crates/kebab-config",
|
||||
"crates/kebab-source-fs",
|
||||
"crates/kebab-parse-md",
|
||||
"crates/kebab-normalize",
|
||||
"crates/kebab-chunk",
|
||||
"crates/kebab-store-sqlite",
|
||||
"crates/kebab-store-vector",
|
||||
"crates/kebab-search",
|
||||
"crates/kebab-embed",
|
||||
"crates/kebab-embed-local",
|
||||
"crates/kebab-embed-candle",
|
||||
"crates/kebab-embed-ollama",
|
||||
"crates/kebab-llm",
|
||||
"crates/kebab-llm-local",
|
||||
"crates/kebab-rag",
|
||||
@@ -23,6 +23,8 @@ members = [
|
||||
"crates/kebab-parse-pdf",
|
||||
"crates/kebab-tui",
|
||||
"crates/kebab-mcp",
|
||||
"crates/kebab-parse-code",
|
||||
"crates/kebab-nli",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -30,7 +32,95 @@ edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kebab"
|
||||
version = "0.3.2"
|
||||
version = "0.28.0" # v0.28.0 — config 스키마 v2→v3 재편: 미디어 형식 설정을 `[ingest.*]` 우산으로 통합(`[indexing]`→`[ingest]` 스칼라, `[chunking]`/`[image.ocr]`/`[image.caption]`/`[pdf.ocr]`→`[ingest.*]`). 기존 v2 파일은 load 시 메모리 자동 변환(디스크 미변경), 파일 갱신은 `kebab config migrate`(값·주석 보존). env 이름(LHS) 100% 보존 + RHS 만 새 경로, 신규 `KEBAB_PDF_OCR_{DET_MODEL,REC_MODEL,DICT,SCORE_THRESH,UNCLIP_RATIO,MAX_BOXES}`. `ingest_config_signature` 바이트 불변(재색인 0). PdfOcrCfg paddle 대칭 키. 신규 인터페이스(config 레이아웃 rename + env 추가) → minor. — CLAUDE.md §Release
|
||||
|
||||
# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
|
||||
# intentional allow-list. The allowed lints are either cosmetic (doc style),
|
||||
# informational (function size), or carry intentional truncation we accept
|
||||
# (numeric casts in tokenizer/ONNX inputs, hash modular reduction, etc).
|
||||
[workspace.lints.clippy]
|
||||
pedantic = { level = "warn", priority = -1 }
|
||||
# Intentional u32 ↔ i64 casts in kebab-nli (ONNX i64 inputs from tokenizer u32 ids).
|
||||
# u64 ↔ usize across kebab-store-sqlite row counts. Wide truncation is auditable
|
||||
# at use site, not lint-wide.
|
||||
cast_possible_truncation = "allow"
|
||||
cast_possible_wrap = "allow"
|
||||
cast_sign_loss = "allow"
|
||||
cast_precision_loss = "allow"
|
||||
# Doc markdown style is cosmetic; we run rustdoc on demand.
|
||||
doc_markdown = "allow"
|
||||
missing_errors_doc = "allow"
|
||||
missing_panics_doc = "allow"
|
||||
# Informational only — splitting a long pipeline function isn't always cleaner.
|
||||
too_many_lines = "allow"
|
||||
# `Foo::default()` is concise and idiomatic here; `<Foo as Default>::default()`
|
||||
# adds noise without surfacing intent.
|
||||
default_trait_access = "allow"
|
||||
# Module name prefix on public items keeps the wire/log surface readable
|
||||
# (`refusal_reason::no_chunks` etc).
|
||||
module_name_repetitions = "allow"
|
||||
# We use `#[must_use]` deliberately on public results, not blanket.
|
||||
must_use_candidate = "allow"
|
||||
# `String` arg sometimes signals "I'll consume this" — let signature decide.
|
||||
needless_pass_by_value = "allow"
|
||||
# Idiomatic single-line bindings stay; let-else expansion isn't always clearer.
|
||||
manual_let_else = "allow"
|
||||
# `use` after `let` is a common kebab pattern (scoped imports next to use site).
|
||||
items_after_statements = "allow"
|
||||
# Naming pairs like `chunk_id` / `chunks_id` are intentional domain terms.
|
||||
similar_names = "allow"
|
||||
# `iter.map(format!).collect::<String>()` is idiomatic when the per-element
|
||||
# string is genuinely independent — `fold` only wins on accumulation patterns.
|
||||
format_collect = "allow"
|
||||
# Exhaustive `match` with explicit variant arms (vs `_`) catches future
|
||||
# variant additions at compile time (kebab core's `RefusalReason` pattern).
|
||||
match_wildcard_for_single_variants = "allow"
|
||||
# Copy types under `&self` keep call-site discipline; auto-deref noise > tiny perf gain.
|
||||
trivially_copy_pass_by_ref = "allow"
|
||||
# `unnecessary_wraps` flags helpers that could drop `Result`, but keeping the
|
||||
# Result allows future error variants without churning callers.
|
||||
unnecessary_wraps = "allow"
|
||||
# NLI score / RRF fusion / similarity threshold comparisons are intentional —
|
||||
# floats live in the `[0, 1]` band and are compared with explicit thresholds.
|
||||
float_cmp = "allow"
|
||||
# File-extension dispatch is keyed on ASCII conventions; case sensitivity
|
||||
# is part of the spec for `.md`, `.pdf`, etc.
|
||||
case_sensitive_file_extension_comparisons = "allow"
|
||||
# Config / opts structs intentionally bundle boolean flags (ingest options,
|
||||
# search modes, etc) — splitting them into enums would obscure the wire shape.
|
||||
struct_excessive_bools = "allow"
|
||||
# `bytecount` crate would be a new dep just for one-off ASCII counts.
|
||||
naive_bytecount = "allow"
|
||||
# `#[ignore]` annotations on tests document via the test name + nearby comment.
|
||||
ignore_without_reason = "allow"
|
||||
# `format!` push patterns are a hot path for kebab-tui's progressive rendering;
|
||||
# `write!` rewrite needs a verified-equal benchmark before swapping.
|
||||
format_push_string = "allow"
|
||||
# Builder-style `with_*` methods return `Self`; the existing `#[must_use]`
|
||||
# discipline lives on aggregate constructors, not every chainable setter.
|
||||
return_self_not_must_use = "allow"
|
||||
# Match arms grouped by side-effect over body equality (e.g. snake_case wire
|
||||
# label tables) — fanning them out keeps adding a new variant trivial.
|
||||
match_same_arms = "allow"
|
||||
# Remaining style-only warnings: trailing `continue` is sometimes clearer than
|
||||
# rewriting, `_x` underscored bindings document intent at the use site, and
|
||||
# `!(a == b)` reads better than `a != b` when paired with a complementary check.
|
||||
needless_continue = "allow"
|
||||
used_underscore_binding = "allow"
|
||||
nonminimal_bool = "allow"
|
||||
# Other one-off cosmetic items: large literal formatting, doc link quoting,
|
||||
# `Clone::clone_from` swap, `str::replace` chaining, `Iterator::any` ergonomics.
|
||||
unreadable_literal = "allow"
|
||||
many_single_char_names = "allow"
|
||||
doc_link_with_quotes = "allow"
|
||||
assigning_clones = "allow"
|
||||
collapsible_str_replace = "allow"
|
||||
trivial_regex = "allow"
|
||||
elidable_lifetime_names = "allow"
|
||||
range_plus_one = "allow"
|
||||
explicit_iter_loop = "allow"
|
||||
implicit_hasher = "allow"
|
||||
ref_option = "allow"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
@@ -53,6 +143,7 @@ proptest = "1"
|
||||
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
|
||||
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
|
||||
lru = "0.12"
|
||||
lopdf = "0.32"
|
||||
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
|
||||
# in its default set (which also pulls `hf-hub` for first-run model
|
||||
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
|
||||
@@ -80,6 +171,44 @@ rmcp = { version = "1.6", default-features = false, features = ["server"
|
||||
# a tokio runtime to host its mock server (the runtime adapter crate stays
|
||||
# sync via reqwest::blocking — wiremock is dev-only there).
|
||||
wiremock = "0.6"
|
||||
base64 = "0.22"
|
||||
# Pure-Rust git library for repo metadata detection (kebab-parse-code).
|
||||
# No `git` binary required. Default features are disabled; only `revision` is
|
||||
# enabled — the subset needed for HEAD name + commit SHA queries.
|
||||
gix = { version = "0.70", default-features = false, features = ["revision"] }
|
||||
# Rust source parsing for code ingest (kebab-parse-code, p10-1A-2). The
|
||||
# chunker stays tree-sitter-free — AST work is parser-side per design §6.3.
|
||||
tree-sitter = "0.26"
|
||||
tree-sitter-rust = "0.24"
|
||||
# Python / TS / JS grammars for code ingest (kebab-parse-code, p10-1B).
|
||||
tree-sitter-python = "0.25.0"
|
||||
tree-sitter-typescript = "0.23.2"
|
||||
tree-sitter-javascript = "0.25.0"
|
||||
# Go grammar for code ingest (kebab-parse-code, p10-1C-Go).
|
||||
tree-sitter-go = "0.25.0"
|
||||
# JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK).
|
||||
tree-sitter-java = "0.23.5"
|
||||
tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1 (ts 0.26 compat)
|
||||
# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
|
||||
tree-sitter-c = "0.24.2"
|
||||
tree-sitter-cpp = "0.23.4"
|
||||
# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
|
||||
# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
|
||||
# stays single-versioned across the workspace. ort `default-features=false`
|
||||
# drops the bundled binary downloader (fastembed already provides one);
|
||||
# tokenizers `default-features=false, onig` swaps the default `esaxx` regex
|
||||
# backend for `onig` so the build doesn't need libstdc++ headers (verified
|
||||
# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
|
||||
# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
|
||||
# pure-Rust TLS stack.
|
||||
ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
|
||||
tokenizers = { version = "0.21", default-features = false, features = ["onig"] }
|
||||
hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
|
||||
ndarray = "0.16"
|
||||
# Korean morphological tokenizer (FTS v0.20.x, §6.1). lindera-ko-dic bundles
|
||||
# the KO-DIC dictionary as an embedded blob via the embed-ko-dic feature.
|
||||
lindera = "3"
|
||||
lindera-ko-dic = "3"
|
||||
|
||||
# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
|
||||
# behavior are unchanged — only DWARF debug info is reduced (line
|
||||
|
||||
119
HANDOFF.md
119
HANDOFF.md
@@ -4,7 +4,7 @@
|
||||
|
||||
## 한 줄 요약
|
||||
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF 모두 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공 — 사용자가 `?` 로 ask, `/` 로 search, Library Enter / Search `i` 로 inspect, Search `g` 로 editor jump. 다음 후보 = P9-5 (desktop tauri) 또는 보류 중인 P8 (audio) 의 시스템 dep brainstorm.
|
||||
P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) + P10 전체 머지 완료 (현재 **v0.18.0**). `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin / C / C++) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. **v0.17.0 cut (2026-05-24)**: 한국어 trigram FTS5 tokenizer (PR #159) + C typedef alias unit (PR #160) + `code_lang_chunk_breakdown` additive (PR #161). **v0.17.1 cut (2026-05-25)**: 확장 도그푸딩 후 `[models.llm] request_timeout_secs` config 노브 (PR #162) + sudo 없이 ollama 설치 + `kebab ask --stream` UX 권장 docs (PR #163). **v0.17.2 cut (2026-05-25)**: v0.17.1 post-dogfood polish — `[image.ocr] request_timeout_secs` 별 노브 (PR #164, v0.17.1 미진행 closure) + `heading_path` FTS5 column filter 로 text-only 매칭 + raw-mode escape hatch (PR #165, 2026-05-24 v0.17.0 trigram entry 의 JSON 노이즈 closure). **v0.18.0 cut (2026-05-26)**: fb-41 multi-hop RAG + NLI verification ship (PR #176-180) — `kebab ask --multi-hop` 의 decompose → decide → synthesize loop + mDeBERTa-v3 XNLI ONNX post-synthesize entailment 검사. dogfood S7 caffeine hallucination 의 silent LLM-self-judge ceiling 해결 (nli_score 0.0035 graceful refuse). 추가 `chore: workspace-wide cleanup + post-PR9 refactor` (PR #181) — clippy::pedantic baseline + H1 config wiring + 9 new tests. 자세한 영향은 [v0.17.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.0) + [v0.17.1 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.1) + [v0.17.2 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.17.2) + [v0.18.0 release notes](https://gitea.altair823.xyz/altair823-org/kebab/releases/tag/v0.18.0). 구조적으로 남은 component 는 P9-5 (desktop tauri) 하나뿐, P8 (audio) 는 사용자 보류.
|
||||
|
||||
## Phase 로드맵
|
||||
|
||||
@@ -17,20 +17,46 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료.
|
||||
| **P4** | Local LLM + RAG + grounded answer | `kebab-llm`, `kebab-llm-local`, `kebab-rag` | P3 | ✅ 완료 |
|
||||
| **P5** | Golden query / regression eval | `kebab-eval` | P4 | ✅ 완료 |
|
||||
| **P6** | 이미지 ingestion (OCR + caption) | `kebab-parse-image` | P5 | ✅ 완료 (4/4 component, OCR/caption Ollama-vision) |
|
||||
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) |
|
||||
| **P7** | PDF text + page citation + scanned OCR (v0.20.0 sub-item 1) | `kebab-parse-pdf` + `kebab-app::pdf_ocr_apply` | P5 + P6 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring + post-extract OCR enrichment via qwen2.5vl:3b vision LLM) |
|
||||
| **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) |
|
||||
| **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) |
|
||||
| **P10** | code ingest framework | `kebab-parse-code` | P5 | ✅ 완료 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: `code-text-paragraph-v1` — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, `code-c-ast-v1` + `code-cpp-ast-v1` — v0.16.0)** |
|
||||
|
||||
P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
|
||||
|
||||
## Component 카운트
|
||||
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
총 33 component task — spec 시점 31 개 + 후속 wiring task 3 (P3-5 / P6-4 / P7-3) 가 머지 시점에 추가됨. v0.18.0 cut 시점에 fb-41 multi-hop RAG + NLI verification (PR-9 5 sub-PRs) 가 P9 추가 component 로 ship — `kebab-nli` 신규 crate (mDeBERTa-v3 XNLI ONNX verifier) + `kebab-rag::ask_multi_hop` (decompose/decide/synthesize loop + step 8.5 NLI hook). per-component 진행 + status 는 [tasks/INDEX.md](tasks/INDEX.md).
|
||||
|
||||
## 머지 후 발견된 버그 / 결정 (요약)
|
||||
|
||||
- **candle 임베딩 백엔드 다변화** (2026-06-01, Track 1, v0.22.0): `provider = "candle"` opt-in 추가 — 같은 `multilingual-e5-large` 모델을 순수 Rust(candle)로 돌려 듀얼소켓 NUMA 서버의 onnxruntime 48-스레드 double-free 를 회피. `[models.embedding].num_threads`(+env `KEBAB_EMBED_THREADS`)로 CPU 스레드 캡. fastembed default 동작·벡터 불변, `embedding_version` 유지(재색인 0). Phase 0 스파이크 패리티 cosine 1.000000. 상세 HOTFIXES 동일 일자.
|
||||
- **config 마이그레이션** (2026-05-31, PR #198): `kebab config migrate` 추가 — 기존 config.toml 에 빠진 섹션을 주석과 함께 채우고 deprecated 정리(멱등·`.bak`·dry-run, 값/주석 보존). `schema_version` 1→2, `init` 도 섹션 주석 포함, doctor 에 `config_migration` 체크. 상세 HOTFIXES 동일 일자.
|
||||
|
||||
머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 "누군가가 인수받을 때 알아두면 시간을 많이 절약하는" 항목만:
|
||||
|
||||
- **2026-06-04 PP-OCRv5 ONNX Rust 네이티브 OCR** — v0.27.0. `[image.ocr] engine = "paddle-onnx"` 로 PP-OCRv5(검출+인식) ONNX 를 in-process(`ort` =2.0.0-rc.9) 실행 — Python 런타임/원격 호출 없이 큰 페이지 CPU <4초(Ollama vision ~50초 대비). default 는 여전히 `"ollama-vision"`. 후처리(min-area rect/unclip)는 pure-Rust. **함정**: unclip 은 corner 를 centroid 에서 방사 확장하면 안 되고 edge 별 polygon offset 이어야 함(방사 확장 시 wide/short 텍스트 박스 높이가 안 커져 글자 윗부분 잘림 → ㄷ→ㄴ, e2e CER 0.26). 수정 후 CER 0.005. 모델 ONNX 는 `crates/kebab-parse-image/assets/paddleocr-onnx/`(LFS). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-04 PP-OCRv5 ONNX), spec/plan `docs/superpowers/{specs,plans}/2026-06-04-rust-native-ocr-*.md`.
|
||||
- **2026-06-03 ingest 설정 변경 자동 재색인** — v0.26.2. ingest 산출에 영향 주는 설정(청킹/이미지 OCR·caption/pdf.ocr/`[ingest.code]`)을 변경하면 `--force-reingest` 없이 영향 자산만 자동 재색인. 그 설정들의 결정적 서명(`ingest_config_signature`)을 effective parser_version(skip 비교 + 저장 doc 필드 양쪽)에 폴딩 → 다음 ingest 비교가 mismatch. 비산출 설정(search/rag/ui/log + max_pixels/languages/timeout)은 제외(과도 무효화 회피), doc_id 는 base 로 안정 유지. **업그레이드 후 첫 ingest 는 전 자산 1회 재색인**(저장된 상수 parser_version ≠ 새 composite; embedding 은 V012 캐시 히트). 결과 포맷·CLI·wire 불변(내부 skip 판정 정정). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 설정 변경 자동 재색인), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-*invalidation*.md`.
|
||||
- **2026-06-03 ingest 진행 로그 개선** — v0.26.1. 이미지/PDF + OCR/caption on 볼트 ingest 가 "멈춘 듯" 보이던 문제 해소: TTY 진행바에 현재 파일명 + 느린 phase(ocr/caption/embed)+모델명 + 경과초 `(Ns)` heartbeat, 종료 시 최장 소요 파일 top-5 요약. 신규 wire `asset_phase{idx,total,phase,model}` + `asset_timings.ocr_ms`/`caption_ms`(additive, `ingest_progress.v1` 유지, serde default 0). 이미지·PDF 경로도 `asset_timings` emit(이전 markdown 만). 기본 동작 불변. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 ingest 진행 로그), spec/plan `docs/superpowers/{specs,plans}/2026-06-03-ingest-log-improve-*.md`.
|
||||
- **2026-06-03 arctic-embed-l-v2.0 임베더 통합** — v0.26.0. 별칭 제거 후 설명형 query recall 보강(측정 recall@10 130/132, e5 +7). `kebab-embed-candle` 모델 레지스트리화(e5 mean + `snowflake-arctic-embed-l-v2.0` CLS, 모델별 pooling/prefix) + 신규 `kebab-embed-ollama`(`provider="ollama"`, `/api/embed`). config `endpoint: Option<String>` 추가. 기본 e5 유지(opt-in), arctic 전환은 embedding_version cascade → 재색인. candle↔Ollama cosine>0.99 게이트로 pooling/prefix 정확성 고정(`#[ignore]`). 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03 arctic), spec `docs/superpowers/specs/2026-06-03-arctic-embedder-spec.md`.
|
||||
- **2026-06-03 doc-side expansion(별칭) 기능 완전 제거** — v0.25.0. 아래 2026-05-31 항목의 색인-시 청크당 LLM 별칭 생성 + 별칭 검색 채널을 **전부 제거**(ROI 음수: cross-lingual 은 e5-large 단독으로 충분, 기여는 설명형 +2 그룹뿐인데 대가가 청크당 색인-시 LLM). `Chunk.aliases`/`expansion.rs`/`IngestExpansionCfg`/alias lexical arm/`expansion_progress` wire kind 제거, 신규 마이그레이션 **V013** 이 `chunk_aliases_fts`+`chunks.aliases` DROP. 별칭 default-off 였어 사용자 체감 0, 기존 KB 도 재색인 불요(잔존 별칭 벡터는 `strip_alias_suffix` graceful 매핑/`reset` 정리). `AssetTimings.expansion_ms` 는 wire 호환 위해 값 0 으로 유지. 자세한 내용: `tasks/HOTFIXES.md` (2026-06-03), spec `docs/superpowers/specs/2026-06-03-remove-doc-expansion-spec.md`.
|
||||
- **2026-05-31 Phase 2 doc-side expansion 별칭(개별 dense 벡터) + 파생물 캐시(V012)** — v0.21.0 cut. 색인 시 LLM 이 청크별 별칭("같은 의미 다른 표현")을 생성, 줄별 **개별 dense 벡터**(sentinel `{chunk}#alias#N`)로 색인 (묶음 1벡터는 평균화 희석으로 회귀 → 폐기) + boilerplate 청크 skip. `[ingest.expansion]` default off. 측정(나무위키 ~1000 문서 CS corpus): 변형 일관성 14/18 → **16/18**, spread 0.222→0.111, 대조군 false-positive 별칭 무죄. 비용 병목(별칭 18문서 2.5h)은 **파생물 캐시(V012, 청크 내용 해시 키)**로 해소 — 정답 3개 cold 1879s → warm 13s **≈ 145배**, embedding+별칭 LLM 캐싱, version_key cascade 정합. search/ask 가 `kebab.sqlite`+`lancedb` 만으로 동작 → 외부 서버 색인 후 DB 만 복사하는 이식 워크플로 가능. **결정/known limitation**: grounded/refusal 판정이 부분 인용을 grounded 로 오분류(정직한 거부가 false-positive 로 집계) — 별도 개선 후보. stack·svm 설명형 2개 잔존. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-31), 측정: `docs/superpowers/handoffs/2026-05-31-namu-wiki-alias-cache-study.md`.
|
||||
- **2026-05-29 v0.20.2 dogfood findings + 검색 품질 baseline** — 8-finding 라운드 완료. (1) Ask 응답언어: rag-v3 default (질문 언어 = 답변 언어). (2) eval `--config` facade 패치 로 dogfood KB 직접 eval 가능. (3) 검색 품질 baseline — hybrid hit@3=1.0 / MRR=0.833, lexical hit@3=1.0 / MRR=0.7 (golden 10 query). **O-2 known limitation**: 소형 모델(gemma4:e4b) refusal 메시지의 query 언어 불일치 가능 — 판정은 정상, 표시 문구만 해당. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-29).
|
||||
- **v0.20 sub-item 1 (scanned PDF OCR via qwen2.5vl:3b)**: post-extract enrichment pattern (`kebab-app::pdf_ocr_apply`, H-1 resolution), DCTDecode-only v1 scope (FlateDecode/CCITTFax page 는 warning + skip), parser_version `"pdf-text-v1"` 보존 + force-reingest UX 명문 (H-4).
|
||||
- **2026-05-26 kebab-normalize + kebab-parse-types 흡수 (24 → 22 crates, design §3.7b 재작성)** — v0.19.0 cut. 4 parser 중 markdown 한 갈래만 lift 를 경유하는 reality 가 design §3.7b 의 fan-in ≥ 2 가정과 diverge → thin layer (`kebab-parse-types`) + `kebab-normalize` 두 crate 가 `kebab-parse-md` 로 흡수. 5 사용 type + 3 forward-declared struct 모두 `kebab-parse-md::{types,normalize}` module 의 `pub` re-export 로 보존. wire / surface impact = 0 (CLI / TUI / MCP / `--json` / config / XDG / parser_version 모두 unchanged). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-26 design deviation entry).
|
||||
- **2026-05-26 v0.18.0 fb-41 multi-hop RAG + NLI verification ship (PR #176-180) + post-PR9 cleanup (PR #181)** — pre-v0.18.0 dogfood (`/build/cache/dogfood-v018/`, 33 assets / 205 chunks, gemma3:4b CPU only / 16 GB RAM) 에서 발견된 S7 caffeine hallucination 의 root cause = LLM-self-judge ceiling (synthesize 가 chunks 와 무관한 Adam optimizer gradient 식을 silent emit, self-judge 가 reject 못함). 학계 표준 (Self-RAG, CRAG, Auto-GDA, MedTrust-RAG) 결론 = deterministic post-synthesis verification. mDeBERTa-v3 XNLI ONNX (280 MB, Xenova HF) 가 `(packed_chunks, answer)` entailment 검사 — `[rag] nli_threshold > 0` (default 0.0 = disabled, production 권장 0.5) 일 때 활성. dogfood retest 측정 — S7 PR-8 baseline `grounded=true + Adam hallucination` → PR-9 `nli_verification_failed, nli_score 0.0035`. wire additive minor — `answer.v1.verification` field + `refusal_reason` 의 `nli_verification_failed` / `nli_model_unavailable` 추가, pre-v0.18 reader 무영향. 5 sub-PR 시퀀스 + cleanup PR (clippy::pedantic baseline + 의도적 30+ allow + H1 `[models.nli].model` config wiring + 9 new tests). post-refactor retest = PR-9d byte-identical (deterministic 확인). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry + S3 follow-up).
|
||||
- **2026-05-25 v0.17.2 post-v0.17.1 polish (PR #164 + #165)** — v0.17.1 의 두 follow-up closure. (1) `[image.ocr] request_timeout_secs` 별 노브 — `crates/kebab-parse-image/src/ocr.rs::REQUEST_TIMEOUT` hard 300s 제거, LLM 쪽 패턴 (PR #162) 을 OCR 어댑터에 동일 적용. 사용자 결정으로 별 노브 분리 (OCR vs LLM 의 cold start 패턴이 달라 독립 조절). v0.17.1 미진행 항목 closure. (2) `chunks_fts` 의 `heading_path` 컬럼이 JSON 표기 + path 세그먼트 까지 trigram 색인 → query false positive 가능 문제 closure. `lexical.rs::build_match_string` 가 non-raw 분기 결과를 `text : (<expr>)` 로 wrap — heading 색인 V007 verbatim 유지, 매칭만 text 한정. 사용자가 명시 heading 검색 하려면 raw mode `'heading_path : <token>'` escape hatch (SKILL.md 갱신). 둘 다 additive (옛 config 호환) / re-ingest 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25 v0.17.2 두 entry).
|
||||
- **2026-05-25 v0.17.1 post-dogfood (PR #162 + #163)** — 확장 도그푸딩 (16 GB CPU only, gemma4:e4b 시도) 에서 발견된 두 follow-up 한 묶음. (1) `crates/kebab-llm-local/src/ollama.rs::REQUEST_TIMEOUT` hard 300s → `[models.llm] request_timeout_secs` config + env override (additive, default 300, `=0` 은 disable 아닌 "즉시 timeout" 이라 doc 명시). (2) README + SMOKE 에 sudo / systemd 없이 ollama 설치 + ≤4B Q4 권장 모델 + `kebab ask --stream` UX 권장 docs. additive only — 옛 config / wire 호환. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-25).
|
||||
- **2026-05-24 v0.17.0 PR-C `code_lang_chunk_breakdown` additive (closure of 2026-05-22 LOW)** — `schema.v1.stats` 에 chunk 수 집계 신규 키. 기존 `code_lang_breakdown` (doc count) 와 sister. 또 기존 두 필드 JSON schema description 의 "chunk count" 오기재 → "doc count" 로 정정. wire additive — schema_version bump 불필요. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-C).
|
||||
- **2026-05-24 v0.17.0 PR-B C typedef alias unit (closure of 2026-05-21)** — `kebab-parse-code::c::extract_blocks` 의 `type_definition` 분기로 inner anonymous struct/enum/union → declarator 의 typedef alias 이름으로 synthetic unit 방출. `PARSER_VERSION code-c-v1` → `code-c-v2` bump + 같은-asset/다른-doc_id 케이스용 `purge_workspace_path_for_parser_bump` cascade (`stale_chunk_ids_for_workspace_path_except_doc_id` + `purge_document_at_workspace_path_except_doc_id` helper 신규). 사용자 작업 불필요 (다음 ingest 가 자동 재처리). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24 PR-B).
|
||||
- **2026-05-24 v0.17.0 PR-A 한국어 trigram tokenizer 채택 (closure of 2026-05-22 한국어 lexical)** — `chunks_fts` 가 FTS5 `unicode61` → `trigram` 으로 V007 migration (자동 backfill, re-ingest 불필요). `lexical.rs::build_match_string` trigram-aware 재설계 — multi-token 한국어 query (`해시 충돌`) 가 whole-phrase 후보로 hit, 한영 혼합 (`Rust 충돌은`) 도 OR-combined. 2자 이하 query 는 0-hit + CLI/TUI/wire `hint` 안내. 영어 lexical 도 substring 매칭으로 바뀜 (recall ↑ / 단어 경계 ↓). `kebab.sqlite` 크기 ~2-5배 증가 (trigram index). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-24).
|
||||
- **2026-05-22 P10 종합 도그푸딩 round 2 (한국어 lexical 검색 한계)** — `kebab search --mode lexical` 의 한국어 query 가 FTS5 `unicode61` 토크나이저에서 거의 0 hit (어절 단위 토큰화 → 부분 매칭 불가). 기본 hybrid 모드는 `multilingual-e5-small` vector 가 carry 해 한국어 검색 정상. **closure**: 위 2026-05-24 v0.17.0 entry.
|
||||
- **2026-05-20 P10-1B (Rust 1A symbol path 비일관 + expression-level 함수 미방출)** — (a) Rust `code-rust-ast-v1` 은 file-scope nesting 만 (workspace path prefix 없음), 1B 의 Python/TypeScript/JavaScript 는 workspace 경로 → module path prefix 사용 (비일관 수용, retrofit = chunker_version bump + reindex 필요, 사용자 명시 요청까지 보류); (b) TS/JS 의 `const foo = () => {...}` 같은 expression-level 함수는 `<top-level>` glue 로 처리됨 (declaration-level 단위만 1B 1차 범위). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20) 두 항목.
|
||||
- **2026-05-19 P10-1A-2 (code_rust_ast_v1.rs + SourceType)** — `AST_CHUNK_MAX_LINES` 상수가 `IngestCodeCfg.ast_chunk_max_lines` 를 읽지 않고 모듈 상수 200 고정 (Chunker trait 이 per-medium config 미노출); `SourceType::Code` variant 부재로 code 파일이 `SourceType::Note` 로 분류됨 — 두 항목 모두 `tasks/HOTFIXES.md` (2026-05-19) 에 기록.
|
||||
- **2026-05-07 fb-26 (progress.rs)** — `Aborted` unconditional writeln (TTY duplicate) + `Completed` TTY no summary fixed; `KEBAB_PROGRESS=plain` env + quiet suppression added
|
||||
- **2026-05-07 fb-28 (main.rs)** — `--readonly` (KEBAB_READONLY) blocks Ingest/IngestFile/IngestStdin/Reset; `--quiet` suppresses progress stderr; error.v1 code: "readonly_mode"
|
||||
|
||||
- **2026-05-07 macOS XDG path collision (config 사라지는 버그)** — `dirs` crate 가 macOS 에서 `config_dir()` 과 `data_dir()` 둘 다 `~/Library/Application Support/` 반환 → `reset --data-only` 가 config 파일까지 삭제. Fix: `~/.config`, `~/.local/share`, `~/.cache` 직접 사용. 새 경로: config `~/.config/kebab/`, data `~/.local/share/kebab/`, cache `~/.cache/kebab/`. `Config::load(None)` 이 macOS legacy path 에서 자동 마이그레이션. 자세한 내용: `tasks/HOTFIXES.md`.
|
||||
- **2026-05-07 P9 post-도그푸딩 (p9-fb-31)** — `kebab ingest-file <path>` + `kebab ingest-stdin --title <T>` 두 신규 subcommand + MCP tool `ingest_file` / `ingest_stdin` (4 → 6 tool). agent 가 fetch 한 web markdown / 외부 file 을 KB 에 즉시 저장. workspace 외부 file 은 `<workspace.root>/_external/<blake3-12>.<ext>` 로 copy (deterministic 명명 → idempotent). `_external/` 디렉토리 첫 생성 시 `.kebabignore` 자동 append (walk 무한 루프 방지). stdin 은 markdown 전용 + flag (`--title`, `--source-uri`) → frontmatter 자동 prepend. .kebabignore 매치 시 stderr warn 후 진행 (explicit ingest = bypass intent). fb-30 의 v1 read-only MCP 정책 변경 — 첫 mutation tool 도입. spec: `tasks/p9/p9-fb-31-single-file-stdin-ingest.md`. design: `docs/superpowers/specs/2026-05-07-p9-fb-31-single-file-stdin-ingest-design.md`.
|
||||
- **2026-05-07 P9 post-도그푸딩 (p9-fb-30)** — `kebab mcp` 신규 subcommand + new crate `kebab-mcp` (lib only) — stdio JSON-RPC server. 4 read-only tool (`search` / `ask` / `schema` / `doctor`) 가 `kebab-app` facade 위에 build. rmcp 1.6 SDK 채택, manual `tools/list` + `tools/call` dispatch (rmcp 의 `#[tool_router]` 매크로 대신). `error_classify` 모듈을 `kebab-cli` → `kebab-app::error_wire` 로 promotion (UI crate 끼리 import 회피, facade 룰 준수). `ErrorV1` 에 `schema_version: String` 필드 추가 — kebab-mcp 의 직접 serialize 경로에서도 wire 정합. `KebabAppState` 가 `(Config, Option<PathBuf>)` carry — doctor tool 의 path-aware behavior 위해. ask + search arm 의 `tokio::task::spawn_blocking` wrap — `OllamaLanguageModel` 의 reqwest blocking client 가 async 안에서 panic 회피. capability flag `mcp_server` `false` → `true`. agent integration MVP 완성 — Claude Code / Cursor / OpenAI Agents 등 host-agnostic 사용 가능. spec: `tasks/p9/p9-fb-30-mcp-server.md`. design: `docs/superpowers/specs/2026-05-07-p9-fb-30-mcp-server-design.md`.
|
||||
- **P3-5 / P4-3 `--config` 누락** — `kebab-cli` 가 `--config <path>` 를 honor 하려면 `kebab_app::*_with_config` companion 을 호출해야 함. 두 번 같은 모양으로 회귀했음.
|
||||
@@ -74,25 +100,88 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능.
|
||||
|
||||
## 다음 task 후보
|
||||
|
||||
- **P9-2 TUI search** — `App.search` slot 채움. Library 의 `/` 가 enable 됨.
|
||||
- **P9-3 TUI ask** — `App.ask` slot 채움. `?` enable.
|
||||
- **P9-4 TUI inspect** — `App.inspect` slot 채움. `Enter` enable.
|
||||
- **P9-5 desktop tauri** — 별도 분기. PDF citation rendering UI 가치 큼.
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 후순위.
|
||||
구조적으로 미완인 component 는 P9-5 하나뿐. 나머지는 도그푸딩 follow-up (아래 "P10 dogfooding 백로그") 또는 사용자 결정 대기.
|
||||
|
||||
P9-2/3/4 는 P9-1 의 parallel-safety contract (sub-state slot 패턴) 덕에 병렬 진행 가능 — 같은 `App` 손대지 않음.
|
||||
- **P9-5 desktop tauri** — 마지막 남은 P9 component. `kebab-desktop` crate + Tauri 앱, 별도 분기. PDF citation rendering UI 가치 큼. 사용자 우선순위 (P9 우선 · 책/PDF 위주) 와 부합.
|
||||
- **P10 도그푸딩 round 2 follow-up** — ✅ v0.17.0 cut (2026-05-24) 으로 세 항목 모두 closure (한국어 trigram PR-A + C typedef alias PR-B + code_lang_chunk_breakdown additive PR-C). 상세 cross-link: 아래 "P10 dogfooding 백로그" 절 + `tasks/HOTFIXES.md` (2026-05-24 PR-A/B/C).
|
||||
- **P8 audio brainstorm** — whisper-rs 시스템 dep 받을지 / 외부 transcription endpoint 사용할지 사용자 결정 필요. 사용자 패턴 (책+PDF 위주, audio 의향 없음) 상 보류.
|
||||
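- **fb-41 multi-hop reasoning** — ✅ v0.18.0 cut (2026-05-26) 으로 ship (PR #176-180): `kebab ask --multi-hop` 의 decompose → decide → synthesize loop + mDeBERTa-v3 XNLI NLI verification. 상세: `tasks/HOTFIXES.md` (2026-05-25 fb-41 PR-9 closure entry).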
|
||||
- **Rust symbol path retrofit** — Rust `code-rust-ast-v1` symbol 이 file-scope-only (1B+ 는 module prefix). `code-rust-ast-v2` bump + Rust corpus re-ingest 비용 → 사용자 명시 요청까지 보류. HOTFIXES `2026-05-20`.
|
||||
|
||||
### P9 dogfooding 백로그 (fb-26 ~ fb-42) — 4 minor release 분할
|
||||
### v0.20.0 sub-item 1 (PDF scanned OCR) 머지 후 priorities (2026-05-28, 사용자 결정)
|
||||
|
||||
2026-05-06 도그푸딩 누적 피드백 + "AI agent 가 kebab 을 쓰게 한다" 궁극 목표용 surface 확장. 17 항목 모두 **status: open + brainstorm 선행 필요**. 각 spec 상단 banner 명시. cascade 영향 / 분량 고려해 한 minor 에 묶지 않고 4 분할. 2026-05-06 renumber — **번호 = release 순서**:
|
||||
PR #189 (2026-05-28 머지, commit `09333d0`) 으로 PDF scanned OCR (qwen2.5vl:3b vision LLM) + 4 round bugfix (#2/#3/#4/#6/#7/#9/#10/#11/#13/#14) + ingest log feature 가 main 으로 진입. 다음 작업 순서 = **C → B → A → G**.
|
||||
|
||||
- **0.3.0+ — agent foundation**: fb-26 (log), fb-27 (introspection/error wire) ✅ 머지 + v0.3.0 cut (2026-05-07), fb-28 (readonly/quiet), ~~fb-29 (daemon)~~ → 🚫 **deferred (2026-05-07 brainstorm)** — fb-30 stdio MCP 가 동일 가치 (agent integration + session 동안 hot cache) 를 daemon 복잡도 (PID file / port lock / loopback security / lifecycle UX) 없이 제공, single-user local-first 환경에 비대. fb-30 (MCP, stdio-only — fb-29 의존 제거 → depends_on `[p9-fb-27]` 만), fb-31 (single-file ingest). 후속 fb 들은 0.3.x patch / 0.4.0 minor 로 누적.
|
||||
- **0.4.0 — agent surface refinement (additive)**: fb-32 (stale), fb-33 (streaming), fb-34 (budget), fb-35 (verbatim fetch), fb-36 (filters), fb-37 (trace/stats).
|
||||
- **0.5.0 — RAG quality (cascade 동반)**: fb-38 (score semantics), fb-39 (precision tuning, embedding_version cascade + V00X), fb-40 (fact-grounded, prompt_template_version cascade).
|
||||
- **0.6.0 또는 P+**: fb-41 (multi-hop, XL), fb-42 (bulk/rerank, Nice).
|
||||
- **C — 한국어 morphological tokenizer (Bug #8 follow-up)** ✅ **v0.20.1 머지 완료**.
|
||||
- V007 trigram 의 ≥3 char query 제약 (HOTFIXES `2026-05-22`) — '한국' 같은 2-char 한국어 query 0 hit → V009 migration + lindera-ko-dic tokenizer + tokenized_korean_text column + first-boot eager backfill 으로 해소. branch `feat/korean-morphological-tokenizer` (8 commit + 5 follow-up).
|
||||
- scope: search index 재빌드 cascade (corpus_revision bump) + V007 trigram 보존 (backward-compat).
|
||||
- 사용자 surface: `kebab search` 의 한국어 2자 query ('한국', '서울') 매칭. README + SKILL + release notes 반영.
|
||||
|
||||
- **B — OCR dense page coverage** ⏳ C 다음.
|
||||
- metro-korea.pdf page 8/13 timeout (180s, dense newspaper article). vision LLM 의 output token 과대 → 정상 timeout.
|
||||
- 가능한 path: (a) per-page `max_pixels` 동적 조정 (high-resolution page 만 축소), (b) column-level sub-region OCR (newspaper layout 분할 후 OCR call 분리), (c) model upgrade (qwen2.5vl:7b — Ollama 모델 변경 + max_pixels trade-off), (d) OCR timeout 점진 축소 (180s → 120s → 90s) — round 마다 p90 측정 후.
|
||||
- mojibake.pdf `pdf_ocr_pages: 0` (round 1 부터 동일) — text-detect path fallback 강화 검토.
|
||||
- 별 sub-item.
|
||||
|
||||
- **A — v0.20 의 deferred sub-items (frozen design contract)** ⏳ B 다음.
|
||||
- **sub-item 2** — Multi-region image dispatch (`OcrText.regions` bbox 분리) — image OCR + PDF column-aware OCR.
|
||||
- **sub-item 3** — PDF normalize integration (`ParsedPdfPage` production caller + `build_canonical_document_from_pdf_pages` + cross-page reference graph).
|
||||
- **TODO #4** — Per-page image / table extraction (PDF figure / table extract).
|
||||
- **TODO #5** — Enricher trait 도입 — OCR + caption 의 `Extractor` trait 통합 (post-extract enrichment 의 generalization).
|
||||
- 각 sub-item 별 spec/plan/executor cycle.
|
||||
|
||||
- **G — v0.20.1 patch release + release notes** ⏳ A 머지 후 (또는 C/B 시점에 따라 조기 cut).
|
||||
- CLAUDE.md release 룰 — sub-item 1 base + bugfix1-4 + log feature + logging r2 누적 → minor surface 변경 다수 + wire schema additive minor + config 신규 → **v0.20.1 patch bump + release notes**.
|
||||
- 핵심 surface (사용자 도그푸딩 가이드 형식):
|
||||
- **한국어 2자 query 지원** (`kebab search` 에서 '한국', '서울' 같은 2자 단어 매칭 — V009 morphological tokenizer).
|
||||
- OCR timeout default 180s (HOTFIXES 2026-05-28).
|
||||
- `[logging]` config section (default enabled) + `{state_dir}/logs/ingest-{run_id}.ndjson` 자동 생성.
|
||||
- `[logging] keep_recent_runs` (100) + `retention_days` (30) — OR-on-stale cleanup.
|
||||
- `ingest_progress.v1.pdf_ocr_finished` 의 4 추가 field (image_byte_size, image_width, image_height, failure_reason) — image_w/h 가 round 2 (PR #190) 에서 실제 capture.
|
||||
- `schema.v1.models` 의 `active_parsers` + `active_chunkers` (additive minor).
|
||||
- V008 migration — `pdf_ocr_events` table (per-OCR-call historical record).
|
||||
- 새 wire schemas — `ocr_stats.v1` + `ocr_failures.v1` (CLI inspect 의 emit).
|
||||
- CLI `kebab inspect ocr-stats` + `kebab inspect ocr-failures` — sweet-spot 점진 분석.
|
||||
- CLI `--media code` first-class, empty query → `invalid_input`, `--config` missing → `config_not_found` + exit 2.
|
||||
- capabilities.streaming_ask + single_file_ingest 가 true (이전 false 거짓 정정).
|
||||
- bump 작업: workspace `Cargo.toml` version → 0.20.1, tag, gitea-release.
|
||||
|
||||
### v0.20 후속 bug catalog (non-blocking known)
|
||||
|
||||
본 PR #189 dogfood 에서 **falsified** 또는 **design constraint** 로 분류 — fix 안 함:
|
||||
- Bug #8 (V007 trigram 2-char query 한계) → 위 C 항목.
|
||||
- Bug #12 (Code block wire `.code` field, `.text` 가 아닌 jq fallback artifact) — falsified.
|
||||
- ask 한국어 query phrasing-sensitive refusal — RAG corner case / NLI gate behavior. 별도 brainstorm.
|
||||
|
||||
### Logging feature enhancements — ✅ closed (PR #190, 2026-05-28 merged commit `7bbdc89a`)
|
||||
|
||||
logging round 2 (PR #190) 으로 4 enhancement 모두 closed:
|
||||
- ✅ `image_width` + `image_height` capture (raster JPEG decode).
|
||||
- ✅ SQLite mirror (V008 `pdf_ocr_events` table + dual-write).
|
||||
- ✅ CLI query (`kebab inspect ocr-stats` + `ocr-failures` — `ocr_stats.v1` + `ocr_failures.v1` wire schemas).
|
||||
- ✅ log retention (`keep_recent_runs` + `retention_days` — file + SQLite cleanup).
|
||||
|
||||
### P9 dogfooding 백로그 (fb-26 ~ fb-42) — release 분할
|
||||
|
||||
2026-05-06 도그푸딩 누적 피드백 + "AI agent 가 kebab 을 쓰게 한다" 궁극 목표용 surface 확장. cascade 영향 / 분량 고려해 한 minor 에 묶지 않고 분할.
|
||||
|
||||
- **0.3.0 — agent foundation** ✅ cut 2026-05-07: fb-26 (log), fb-27 (introspection/error wire), fb-28 (readonly/quiet). ~~fb-29 (daemon)~~ → 🚫 **deferred** — fb-30 stdio MCP 가 동일 가치를 daemon 복잡도 없이 제공.
|
||||
- **0.4.0 — agent integration (MCP)** ✅ cut: fb-30 (MCP stdio), fb-31 (single-file/stdin ingest).
|
||||
- **0.5.0 — agent surface refinement (additive)** ✅ cut 2026-05-10: fb-32 (stale doc indicator), fb-33 (streaming ask), fb-34 (output budget controls), fb-35 (verbatim fetch), fb-36 (search filter args), fb-37 (trace + stats). 모두 wire schema additive minor.
|
||||
- **0.6.0 — RAG quality** ✅ 대부분 머지 (2026-05-10): fb-38 (score semantics) ✅, fb-39 (eval foundation — `precision_at_k_chunk` metric) ✅, fb-39b (embedding upgrade — multilingual-e5-large default) ✅, fb-40 (fact-grounded answer / rag-v2 prompt) ✅. 잔여 = fb-39 의 retrieval precision lever 실제 적용 (eval golden set 확장 선행 필요).
|
||||
- **0.7.0 또는 P+**: fb-41 (multi-hop reasoning, XL) ✅ v0.18.0 ship (2026-05-26, PR #176-180 — multi-hop RAG + NLI verification); fb-42 (bulk multi-query) ✅ 머지 (2026-05-10, bulk only — rerank hint 은 deferred).
|
||||
|
||||
각 fb spec frontmatter 의 `target_version` 필드가 source of truth. INDEX.md 의 release subheader 도 동일 grouping.
|
||||
|
||||
### P10 dogfooding 백로그 (2026-05-22 round 2)
|
||||
|
||||
P10 종합 도그푸딩 round 2 (`/build/cache/dogfood-p10b/`, OSS 8 repo + 한국어 위키 문서 10편) 에서 발견된 follow-up 후보. 자세한 내용 + 우선순위 근거는 `tasks/HOTFIXES.md` (2026-05-22).
|
||||
|
||||
- **한국어 lexical tokenizer** — ✅ v0.17.0 (2026-05-24) PR-A 머지 (#159). V007 trigram migration 자동 backfill + `build_match_string` 재설계 + CLI/TUI/wire hint. HOTFIXES `2026-05-24 PR-A` 참조.
|
||||
- **code_lang_chunk_breakdown chunk 단위 집계 (LOW)** — ✅ v0.17.0 (2026-05-24) PR-C 머지 (#161). `schema.v1.stats` additive 필드. HOTFIXES `2026-05-24 PR-C` 참조.
|
||||
- **C typedef-wrapped struct (LOW)** — ✅ v0.17.0 (2026-05-24) PR-B 머지 (#160). `type_definition` 분기 + `PARSER_VERSION code-c-v2` bump + orphan purge cascade. HOTFIXES `2026-05-24 PR-B` 참조.
|
||||
- **ranking glue chunk 편향 (deferred)** — 자동 heuristic 은 user intent misalignment 위험. 사용자 명시 요청 전까지 surface 변경 0 유지. 1주+ 실사용 후 재 brainstorm.
|
||||
|
||||
## 검증된 운영 동작 (release binary, fastembed enabled)
|
||||
|
||||
P7-3 머지 직후 25 시나리오 smoke 통과 — markdown + image + PDF 5 자산 워크스페이스에서 doctor / ingest / list / inspect / search (lex/vec/hybrid) / re-ingest / byte-edit re-ingest / corrupt PDF / RAG ask + page citation 모두. 자세한 시나리오 표는 conversation 기록 참조; 워크스페이스에 직접 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md).
|
||||
|
||||
301
README.md
301
README.md
@@ -1,92 +1,198 @@
|
||||
# kebab — Local-first Knowledge Base
|
||||
# kebab — Local-first Knowledge Base + RAG
|
||||
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown / PDF / 이미지를 한 곳에 색인하고, 의미 검색 + page-단위 citation 포함 LLM 답변을 단일 binary 로 제공한다. 모든 추론은 로컬 (Ollama / fastembed) 에서 돌아간다. 대상 하드웨어: M4 48GB MacBook 1대, 사용자 1명.
|
||||
|
||||
## 사전 요구
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 + resolver 3 사용). [rustup](https://rustup.rs) 권장.
|
||||
- **Ollama** — `kebab ask` 와 이미지 OCR/caption 가 사용. `https://ollama.com/download` 에서 설치 후 `ollama serve` 실행. 기본 LLM 은 gemma4 계열 (`ollama pull gemma4:e4b`) — OCR / caption 도 같은 family 라 모델 하나만 pull 하면 됨. 더 큰 variant 원하면 `gemma4:26b` 등으로 config override. config 의 `[models.llm].endpoint` 에 host:port 명시.
|
||||
- **빌드 디스크** — 첫 빌드 시 `target/` 가 6–10 GB (Lance + DataFusion + fastembed). 여유 확인.
|
||||
- **fastembed 모델** — 첫 `kebab ingest` 시 `multilingual-e5-small` (~470 MB) 자동 다운로드.
|
||||
|
||||
## 설치
|
||||
|
||||
표준 경로는 `cargo install` — `~/.cargo/bin/kebab` 가 PATH 에 있는지만 확인하면 끝.
|
||||
|
||||
```bash
|
||||
# 1) repo clone
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
|
||||
# 2) binary 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 3) PATH 확인 (아직 추가 안 했으면 ~/.bashrc / ~/.zshrc 에 추가)
|
||||
which kebab # → /Users/<you>/.cargo/bin/kebab 같은 경로
|
||||
kebab --version # → kebab 0.1.0
|
||||
```
|
||||
|
||||
git URL 직접 install 도 가능 (clone 없이):
|
||||
|
||||
```bash
|
||||
cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked
|
||||
```
|
||||
|
||||
업데이트는 `git pull && cargo install --path crates/kebab-cli --locked --force` 또는 git URL 형식의 경우 `cargo install --git ... --force`.
|
||||
|
||||
제거는 `cargo uninstall kebab-cli`. 이 명령은 binary 만 지우고 워크스페이스 데이터는 그대로 남는다. 데이터까지 정리하려면 `kebab reset --all --yes` (config + data + cache + state 4 개 XDG 경로 모두 wipe — **irreversible**, 재시작 시 `kebab init` 다시 실행). 부분 wipe 는 `kebab reset --data-only` (config 보존), `kebab reset --vector-only` (Lance + `embedding_records` 만, 다음 ingest 가 re-embed) 등.
|
||||
`kebab` 는 개인용 로컬 knowledge base + RAG 도구다. Markdown · PDF · 이미지 · 소스코드를 한 곳에 색인하고, 하이브리드 의미 검색과 근거 인용을 포함한 LLM 답변을 **단일 binary** 로 제공한다. 모든 추론은 로컬 (Ollama + fastembed) 에서 돌아간다.
|
||||
|
||||
## Quick start
|
||||
|
||||
사전 요구는 두 가지뿐이다.
|
||||
|
||||
- **Rust toolchain** ≥ 1.85 (workspace 가 edition 2024 사용). [rustup](https://rustup.rs).
|
||||
- **Ollama** — `kebab ask` 와 이미지/PDF OCR 가 사용. [공식 설치 안내](https://ollama.com/download) 참고 후 `ollama serve` 실행. 기본 LLM family 는 gemma4 (`ollama pull gemma4:e4b`) — OCR/caption 도 같은 family 라 모델 하나면 된다. CPU-only 환경이면 소형 모델 (예: `gemma3:4b`) 을 권장.
|
||||
|
||||
```bash
|
||||
# 첫 실행 — XDG 경로에 데이터 디렉토리 + config.toml 생성
|
||||
# 1) 빌드 + 설치 (~/.cargo/bin/kebab)
|
||||
git clone https://gitea.altair823.xyz/altair823-org/kebab.git
|
||||
cd kebab
|
||||
cargo install --path crates/kebab-cli --locked
|
||||
|
||||
# 2) 데이터 디렉토리 + config.toml 생성 (XDG 경로)
|
||||
kebab init
|
||||
|
||||
# config 손보고 — workspace.root, 모델 endpoint 등 설정 (지원 형식은 md / png / jpg / pdf 로 고정)
|
||||
# 3) config 최소 손보기 — workspace.root (색인할 폴더) 와 LLM endpoint
|
||||
${EDITOR:-vi} ~/.config/kebab/config.toml
|
||||
|
||||
# 색인 (Markdown / 이미지 / PDF 모두 한 번에)
|
||||
# 4) 색인 (Markdown · PDF · 이미지 · 소스코드 한 번에)
|
||||
kebab ingest
|
||||
|
||||
# 검색 (citation 의 source_span 이 매체별로 line / region / page)
|
||||
kebab search "Markdown chunking 규칙" --mode hybrid
|
||||
# 5) 검색 (hybrid = lexical + vector RRF, citation 포함)
|
||||
kebab search "Markdown chunking 규칙"
|
||||
|
||||
# 질문 (Ollama 필요, PDF 인용 시 page 번호 surface)
|
||||
# 6) 질문 (RAG 답변 + 근거 인용, Ollama 필요)
|
||||
kebab ask "내 KB 설계에서 저장소 전략은?"
|
||||
|
||||
# Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중)
|
||||
kebab tui
|
||||
|
||||
# 헬스 체크 (config 경로 / 데이터 디렉토리 쓰기 가능 여부)
|
||||
kebab doctor
|
||||
```
|
||||
|
||||
격리된 임시 워크스페이스로 돌려보는 절차는 [docs/SMOKE.md](docs/SMOKE.md) — `--config <path>` 로 분리. 이미지 / PDF fixture 가 필요하면 두 example 바이너리 (`cargo run --release --example gen_smoke_pdf -p kebab-parse-pdf` / `gen_smoke_png -p kebab-parse-image`) 로 시스템 dep 없이 in-tree 생성 가능.
|
||||
clone 없이 git URL 로 바로 설치할 수도 있다: `cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin kebab --locked`. 업데이트는 동일 명령에 `--force`. 제거는 `cargo uninstall kebab-cli` (데이터는 보존 — 데이터까지 지우려면 `kebab reset --all --yes`).
|
||||
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>` 또는 `cargo build --release && ./target/release/kebab <subcommand>`.
|
||||
설치 없이 dev 흐름으로 돌려볼 때는 `cargo run --release -p kebab-cli -- <subcommand>`. 격리된 임시 워크스페이스로 검증하는 절차는 [docs/SMOKE.md](docs/SMOKE.md) (`--config <path>` 로 분리).
|
||||
|
||||
## 핵심 기능
|
||||
|
||||
### 하이브리드 검색 + citation
|
||||
|
||||
lexical (FTS5 BM25) 과 vector (cosine) 두 채널을 **RRF fusion** 으로 합쳐 검색한다. 모든 hit 은 출처 위치를 매체별로 정확히 담는다 — Markdown/코드는 line, 이미지는 region, PDF 는 page. `--tag` · `--media` · `--lang` · `--path-glob` 등 다양한 필터와 `--max-tokens` · `--cursor` 같은 agent budget flag 를 지원한다.
|
||||
|
||||
### 파생물 캐시 (자동)
|
||||
|
||||
embedding 벡터를 청크 **내용 해시** 로 캐싱한다 (`derivation_cache`). 재색인·갱신 시 내용이 같은 청크는 재계산을 건너뛴다. 캐시 키에 모델·차원 버전이 포함돼 버전 변경 시 자동 무효화된다 (cascade 안전). 별도 설정 없이 투명하게 동작한다. (현재 TTL/LRU 자동 정리는 미구현 — 누적된 캐시는 `kebab reset` 으로만 정리.)
|
||||
|
||||
### 외부 계산 + 로컬 검색 워크플로
|
||||
|
||||
search/ask 는 원본 파일 없이 KB 산출물만으로 동작한다 (청크 본문이 SQLite 에 저장되고 문서 경로는 상대경로로 기록됨). 비싼 색인(임베딩·OCR)을 성능 좋은 머신에서 수행한 뒤(예: Apple Silicon 맥에서 candle Metal GPU), **두 산출물만** 다른 머신(예: NUMA 서버)으로 복사하면 그대로 검색·질문할 수 있다.
|
||||
|
||||
**무엇을 복사하나 — `[storage]` 에서 정의된 두 경로:**
|
||||
|
||||
| 복사 대상 | config 키 (`[storage]`) | 기본 경로 | 내용 |
|
||||
|-----------|------------------------|-----------|------|
|
||||
| `kebab.sqlite` | `sqlite = "{data_dir}/kebab.sqlite"` | `{data_dir}/kebab.sqlite` | 문서·청크·본문·FTS5·메타 |
|
||||
| `lancedb/` | `vector_dir = "{data_dir}/lancedb"` | `{data_dir}/lancedb/` | 임베딩 벡터 |
|
||||
|
||||
`{data_dir}` 는 `[storage].data_dir` (예: `~/.local/share/kebab`). `models/`(`model_dir`)·`assets/`(`asset_dir`)는 **복사 불필요** — 모델은 각 머신이 자기 캐시를 받고, asset 원본 바이트는 검색·질문에 쓰이지 않는다 (단일파일/`stdin` 색인의 원본 재읽기·재색인까지 보존하려면 `assets/` 도 함께 복사).
|
||||
|
||||
```bash
|
||||
# ingest 가 끝난(쓰기 없는) 상태에서 복사
|
||||
rsync -a <src-data_dir>/kebab.sqlite user@server:<dst-data_dir>/
|
||||
rsync -a <src-data_dir>/lancedb/ user@server:<dst-data_dir>/lancedb/
|
||||
```
|
||||
|
||||
조건: **양쪽 동일 `kebab` 버전 + 동일 임베딩 모델/차원** (`[models.embedding].model`·`dimensions`). provider 는 달라도 됨 (예: 맥 `candle`/Metal ↔ 서버 `candle`/CPU 또는 `fastembed` — 같은 모델이면 벡터 호환). 복사는 반드시 ingest 가 돌지 않을 때.
|
||||
|
||||
### 멀티미디어 색인
|
||||
|
||||
Markdown · PDF · 이미지(OCR + caption) · 소스코드(Rust/Python/TS/JS/Go/Java/Kotlin/C/C++ AST) · 리소스(YAML/Dockerfile/TOML/JSON/XML 등)를 확장자에 따라 자동으로 적절한 chunker 에 라우팅한다. embedded text 가 없는 scanned PDF 는 `[ingest.pdf.ocr]` 로 page-단위 OCR (opt-in). 전체 확장자→chunker 매핑은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
|
||||
### RAG (근거 인용 + 거절)
|
||||
|
||||
검색 결과를 근거로 LLM 답변을 생성하고 [#번호] 인용을 단다. 근거가 부족하면 답을 지어내지 않고 거절한다. compound 질문은 `--multi-hop` 으로 분해→synthesize. 답변의 groundedness 는 mDeBERTa XNLI 로 검증할 수 있다 (`[rag] nli_threshold`, default off).
|
||||
|
||||
### TUI
|
||||
|
||||
`kebab tui` 는 Ratatui 셸 — Library / Search / Ask / Inspect 패널을 vim-style 모드로 다룬다. 키 매핑은 앱 내 `F1` cheatsheet 가 권위 소스다.
|
||||
|
||||
## 명령
|
||||
|
||||
| 명령 | 동작 |
|
||||
|------|------|
|
||||
| `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 |
|
||||
| `kebab ingest [<path>]` | Markdown / 이미지 / PDF 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [--no-cache]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale |
|
||||
| `kebab ingest [<path>]` | 워크스페이스 스캔 후 새/변경 문서 색인 (idempotent · incremental, `--force-reingest` 로 강제 재처리). 미지원 확장자는 자동 skip. 진행바는 현재 **파일명** · 느린 **phase(ocr/caption/embed)+모델명** · **경과초**`(Ns)` · 문서별 청크 수 · phase별 소요시간(parse/chunk/ocr/caption/embed/store)을 표시하고, 종료 시 **최장 소요 파일 top-5** 를 요약한다 (`--json` 은 `asset_phase`/`asset_chunked`/`asset_timings` 이벤트로, 사람용 요약은 미출력) |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능 — `_external/` 로 deterministic copy) |
|
||||
| `kebab ingest-stdin --title <T>` | stdin 의 markdown 본문 ingest |
|
||||
| `kebab search --mode {lexical,vector,hybrid} "<query>" [flags]` | 검색 (default hybrid = RRF fusion, citation 포함). 필터/budget flag 는 `--help` |
|
||||
| `kebab ask "<query>" [flags]` | RAG 답변 + 근거 인용 (Ollama 필요). `--session` (multi-turn) · `--stream` · `--multi-hop` |
|
||||
| `kebab list docs` | 색인된 문서 목록 |
|
||||
| `kebab inspect doc <id>` / `kebab inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab ask "<query>" [--show-citations / --hide-citations] [--session <id>]` | RAG 답변 + 근거 인용. 답변 후 `근거:` block 으로 full path / line range / score 한 줄씩 (default ON — `--hide-citations` 로 끄기, pipe 시 유용). 근거 부족 시 거절. Ollama 필요. `--session <id>` 로 multi-turn — 첫 호출에서 SQLite `chat_sessions` 에 자동 생성, 이후 호출은 prior turns 를 history 로 받아 follow-up. session id 는 사용자 지정 (e.g. `kb-rust-async-2026-05`) — `kebab reset --data-only` 로 모든 session wipe |
|
||||
| `kebab doctor` | 설정/모델/DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library + Search + Ask + Inspect 패널, desktop 진행 중). Library 에서 `r` 키로 background ingest 시작 — 화면 하단 status bar 가 진행 표시, 완료/abort 시 final 라인 잠시 유지 후 자동 hide. ingest 진행 중 `Esc` / `Ctrl-C` 가 cancel signal (그 외에는 quit). vim-style mode (header 우측 `-- NORMAL --` / `-- INSERT --`) — Library/Inspect 는 자동 NORMAL, Search/Ask 는 자동 INSERT. `i` 로 Normal→Insert (모든 pane — p9-fb-21), `Esc` 로 Insert→Normal 어디서나. mode-authoritative dispatch — Search 의 `j/k/o/g`, Ask 의 `e/j/k` 는 NORMAL 모드에서만 명령으로 동작, INSERT 에서는 입력 문자로 typing. (Search 의 chunk inspect 키는 `i`→`o` 로 rebind — `i` 가 universal Insert toggle.) **`F1` 로 cheatsheet popup** (현재 pane 의 키 매핑 + global 토글 표) — `Esc` / `F1` 로 닫기. Search 패널은 200ms debounce 후 background worker 가 검색 — 키 입력으로 UI freeze 안 됨, 사용자가 계속 타이핑하면 stale 결과 자동 폐기 (generation counter). Ask 패널은 multi-turn — 같은 conversation 안에서 Q1/A1, Q2/A2 transcript 누적, 다음 질문이 이전 턴을 history 로 받아 답변. 답변 본문은 markdown 렌더 (bold/italic/inline code/heading/list/code fence/table/blockquote, raw `**bold**` 가 실제 굵게 표시). `Ctrl-L` 로 새 conversation 시작. Search 의 `g` 키가 `$EDITOR` (기본 `vi`) 로 hit 의 citation 위치 열기 — 종료 후 TUI 화면이 자동으로 깨끗이 redraw. CLI `kebab ask` 는 raw markdown 그대로 (terminal 호환성 위해). Library 의 doc-list 가 한글 / 일본어 / 중국어 (CJK) 제목을 wide-char 정확한 column width 로 truncate — 한글 제목이 한 줄을 넘기지 않음 (CJK 1 자 = 2 col). Search/Ask/Filter 입력의 cursor 가 wide char 위에서 column 단위로 정렬 — 한글 입력 시 caret 이 글자 옆에 정확히 놓임. `← / →` 로 입력 문자열 중간 cursor 이동 (한글 한 글자 = 2 column 이라도 한 번에 이동), `Home / End` 로 양 끝 점프, `Delete` 로 cursor 위치 char 삭제 — 모든 input pane (Ask / Search / Library filter overlay) 동일 (p9-fb-22). Ask 트랜스크립트는 새 답변이 viewport 아래로 누적될 때 자동으로 tail 을 따라감 (auto-scroll); `j` / `k` 로 위로 스크롤하면 freeze, `Shift-G` 로 다시 bottom + auto-tail 재개. 화면 하단 hint line 은 한국어 동사구로 (`"위로"` / `"아래로"` / `"필터"` / `"타이핑 검색어"` / `"Esc 로 NORMAL 모드"` / `"i 입력모드"` 등) + 현재 (pane, mode) 조합에 맞춰 자동 분기, **첫 fragment 가 항상 `F1 도움말`** (cheatsheet 발견성 보장). 
모든 모드에서 항상 떠 있는 상태바 — `kebab v<version> │ <pane> │ <docs> docs │ <state>` (state: streaming/searching/indexing/idle, ingest 진행 중에는 progress 가 같은 자리에 흡수됨). Ask 진입 시 conversation id 8 자 prefix 도 함께 표시. Ask 트랜스크립트와 Inspect 양쪽에서 `PgUp / PgDn` 으로 10 줄씩 페이지 스크롤. Library 의 doc list 위에는 `TITLE / TAGS / UPDATED / CHUNKS` 컬럼 헤더 행 표시 (display-width 정렬, Hangul / CJK 안전). |
|
||||
| `kebab reset [--all / --data-only / --vector-only / --config-only] [--yes]` | XDG 데이터 wipe. **Irreversible.** TTY 면 confirm prompt, 아니면 `--yes` 필수. `--vector-only` 는 SQLite `embedding_records` 도 함께 truncate (orphan 방지) |
|
||||
| `kebab eval run / compare` | golden query 회귀 측정 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats 한 번에. `--json` 은 `schema.v1` wire; 사람 모드는 서식 출력. |
|
||||
| `kebab ingest-file <path>` | 단일 파일 ingest (workspace 외부 가능). 바이트는 `<workspace.root>/_external/<hash12>.<ext>` 로 copy. `.kebabignore` 매치 시 stderr warn 후 진행 (explicit ingest 가 bypass intent). |
|
||||
| `kebab ingest-stdin --title <T> [--source-uri <URI>]` | stdin 의 markdown 본문 ingest. frontmatter (title + source_uri) 자동 prepend. v1 markdown only. |
|
||||
| `kebab mcp` | MCP (Model Context Protocol) stdio server. agent host (Claude Code / Cursor / OpenAI Agents) 가 spawn 하여 tool 호출 (`search` / `ask` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`). `--config` honor. |
|
||||
| `kebab inspect doc <id>` / `inspect chunk <id>` | raw record 보기 |
|
||||
| `kebab fetch chunk\|doc\|span <id> [flags]` | indexed corpus 에서 verbatim text fetch |
|
||||
| `kebab eval run \| aggregate \| compare \| variants` | golden query 회귀 측정 + 변형 일관성 진단 |
|
||||
| `kebab schema [--json]` | introspection — wire schemas / capabilities / models / stats |
|
||||
| `kebab doctor` | 설정 / 모델 / DB 헬스 체크 |
|
||||
| `kebab tui` | Ratatui 셸 (Library / Search / Ask / Inspect) |
|
||||
| `kebab mcp` | MCP stdio server (`search` / `bulk_search` / `ask` / `fetch` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`) |
|
||||
| `kebab reset [--all \| --data-only \| --vector-only \| --config-only \| --orphans-only] [--yes]` | XDG 데이터 wipe (**irreversible**) |
|
||||
|
||||
모든 명령에 `--json` 플래그. 출력은 frozen wire schema v1 (`schema_version` 항상 포함, 예: `ingest_report.v1`, `ingest_progress.v1`, `search_hit.v1`, `answer.v1`, `doctor.v1`, `reset_report.v1`, `schema.v1`). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 unchanged).
|
||||
모든 명령에 `--json` 플래그가 있고, 출력은 frozen **wire schema v1** 을 따른다 (`schema_version` 항상 포함). `--json` 모드에서 fatal error 는 stderr 에 `error.v1` ndjson 으로 emit (exit code 0/1/2/3 불변). 글로벌 flag: `--readonly` (write-path 비활성화), `--quiet` (human stderr 억제), env `KEBAB_PROGRESS=plain`. 전체 flag·wire 의미는 `kebab <cmd> --help` 와 [docs/wire-schema/v1/](docs/wire-schema/v1/). 외부 agent 통합(Claude Code skill / MCP)은 [docs/mcp-usage.md](docs/mcp-usage.md) 와 [integrations/](integrations/).
|
||||
|
||||
## 논리 아키텍처
|
||||
## Configuration
|
||||
|
||||
`~/.config/kebab/config.toml` 은 `kebab init` 가 XDG 경로에 생성한다. 핵심 노브만 정리한다 (전체 절은 생성된 파일 주석 참고, 예시는 [docs/SMOKE.md](docs/SMOKE.md)).
|
||||
|
||||
```toml
|
||||
[workspace]
|
||||
root = "~/KnowledgeBase" # 색인할 폴더. 절대 / tilde / env / 상대 경로 가능.
|
||||
# 상대 경로의 base 는 config.toml 위치 (cwd 무관).
|
||||
|
||||
[models.embedding]
|
||||
provider = "fastembed" # "fastembed"(기본, onnxruntime) / "candle"(순수 Rust)
|
||||
# / "ollama"(원격 HTTP) / "none"(lexical-only).
|
||||
# candle 는 같은 모델·같은 벡터를 순수 Rust 로 돌려
|
||||
# NUMA 서버의 onnxruntime 48-스레드 double-free 를 피하는
|
||||
# opt-in 백엔드 (e5 는 재색인 불필요).
|
||||
model = "multilingual-e5-large" # 다국어 sentence embedding (1024-dim).
|
||||
# 첫 ingest 시 ONNX (~1.3GB) 자동 다운로드.
|
||||
# candle provider 는 safetensors (~2GB) 다운로드.
|
||||
# candle/ollama 는 "snowflake-arctic-embed-l-v2.0"
|
||||
# (설명형 query 의 recall 보강) 도 지원 — 아래 참고.
|
||||
dimensions = 1024 # config 와 LanceDB stored dim 불일치 시 검색 0건.
|
||||
num_threads = 0 # candle 전용 CPU 스레드 캡 (0=auto=#cores).
|
||||
# env KEBAB_EMBED_THREADS 가 우선. NUMA 노드 바인딩은
|
||||
# numactl 과 조합. fastembed provider 는 무시.
|
||||
# endpoint = "http://127.0.0.1:11434" # provider="ollama" 전용 HTTP endpoint.
|
||||
# 생략 시 [models.llm].endpoint 로 폴백.
|
||||
# fastembed/candle provider 는 무시.
|
||||
```
|
||||
|
||||
**arctic-embed-l-v2.0 (설명형 query recall 보강)**: 기본 e5-large 대신
|
||||
Snowflake `arctic-embed-l-v2.0` 임베더를 쓸 수 있다 (1024-dim, opt-in). 측정에서
|
||||
설명형/약어/영문 용어 query 의 recall@10 이 e5 대비 향상됐다. 두 경로:
|
||||
|
||||
```toml
|
||||
# (A) candle 백엔드 — 순수 Rust, in-process (NUMA 안전, Metal GPU 가능):
|
||||
[models.embedding]
|
||||
provider = "candle"
|
||||
model = "snowflake-arctic-embed-l-v2.0" # CLS pooling, query 에 "query: " 접두어
|
||||
# (문서는 무접두어). safetensors ~2GB 다운로드.
|
||||
|
||||
# (B) ollama 백엔드 — 원격/로컬 Ollama 데몬에 위임 (POST /api/embed):
|
||||
[models.embedding]
|
||||
provider = "ollama"
|
||||
model = "snowflake-arctic-embed2" # Ollama 모델 태그 (ollama pull 필요)
|
||||
endpoint = "http://127.0.0.1:11434" # 생략 시 [models.llm].endpoint
|
||||
```
|
||||
|
||||
> ⚠️ e5 → arctic 전환은 `embedding_version` cascade 를 트리거한다 (모델이 다르면
|
||||
> 벡터도 다름). 기존 e5 KB 와 혼용 불가 — 전환 시 **재색인** 필요 (`kebab reset`
|
||||
> 후 재 ingest). 기본값은 e5 라 기존 사용자는 영향 없음.
|
||||
|
||||
**Apple Silicon GPU 가속 (candle / macOS)**: M-시리즈 맥에서 candle 임베딩을
|
||||
GPU(Metal)로 돌리면 CPU 대비 대용량 ingest 가 크게 빨라진다. 빌드 또는 설치 시
|
||||
`embed_metal` feature 를 켠다:
|
||||
|
||||
```bash
|
||||
# 빌드만:
|
||||
cargo build --release --features embed_metal
|
||||
# 전역 설치 (~/.cargo/bin/kebab):
|
||||
cargo install --path crates/kebab-cli --features embed_metal --locked
|
||||
```
|
||||
|
||||
벡터는 CPU candle 과 동일 모델이라 호환되므로, 맥에서 GPU 로 색인한
|
||||
`kebab.sqlite` + `lancedb/` 를 그대로 Linux 서버(CPU candle)로 복사해 질의할 수
|
||||
있다. 색인 로그에 `candle device = Metal (GPU)` 가 보이면 GPU 사용 중. metal
|
||||
feature 는 macOS 전용 (Linux/서버는 기본 CPU 빌드).
|
||||
|
||||
```toml
|
||||
|
||||
[models.llm]
|
||||
endpoint = "http://localhost:11434" # Ollama host:port
|
||||
model = "gemma4:e4b"
|
||||
# request_timeout_secs = 300 # 큰 모델은 늘림. 0 은 disable 이 아니라 "즉시 timeout".
|
||||
|
||||
[search]
|
||||
stale_threshold_days = 30 # search hit / citation 의 stale 플래그 기준 (0 = off).
|
||||
|
||||
[rag]
|
||||
prompt_template_version = "rag-v3" # 답변 언어 = 질문 언어. rag-v1/v2 는 legacy.
|
||||
nli_threshold = 0.0 # >0 (예: 0.5) 면 mDeBERTa XNLI groundedness 검증.
|
||||
```
|
||||
|
||||
- **`[ingest]`** (v0.28.0) — 모든 형식 ingest 설정의 우산. 병렬도(`max_parallel_extractors`/`max_parallel_embeddings`/`watch_filesystem`, ← 옛 `[indexing]`)와 형식별 하위 절(`[ingest.chunking]` ← 옛 `[chunking]`, `[ingest.code]`, `[ingest.image.ocr]` ← 옛 `[image.ocr]`, `[ingest.pdf.ocr]` ← 옛 `[pdf.ocr]`)이 전부 이 아래로 모인다. 기존 v2 `config.toml` 은 그대로 둬도 로드 시 메모리에서 자동 변환되며, 파일을 새 레이아웃으로 갱신하려면 `kebab config migrate` (값·주석 보존).
|
||||
- **파생물 캐시** — embedding 결과를 내용 해시로 자동 캐싱한다 (위 「핵심 기능」 참고). 설정 항목 없음.
|
||||
- **`[ingest.code]`** — code ingest 의 skip 정책 (`skip_generated_header`, `max_file_bytes`, `extra_skip_globs`). `.gitignore` 자동 honor, `.kebabignore` 는 추가 layer.
|
||||
- **`[ingest.image.ocr]`** — 이미지 OCR (default off / opt-in). `engine` 으로 백엔드 선택: `"ollama-vision"` (default, 원격 vision LM) 또는 `"paddle-onnx"` (PP-OCRv5 ONNX 를 in-process 로 실행, Python 런타임 불필요, 큰 페이지 CPU <4초, 오프라인). `paddle-onnx` 는 워크스페이스에 번들된 모델을 쓰며 `det_model`/`rec_model`/`dict` 로 경로 override, `score_thresh`(0.3)/`unclip_ratio`(1.5)/`max_boxes`(1000) 로 검출 튜닝 가능 (`KEBAB_IMAGE_OCR_*` env 동일 지원 — env 이름은 v3 에서도 불변). engine 또는 모델을 바꾸면 영향 이미지가 자동 재색인된다.
|
||||
- **`[ingest.pdf.ocr]`** — scanned PDF 의 page-단위 OCR (default off / opt-in, page 당 ~수십 초 cost). `engine` 은 `[ingest.image.ocr]` 과 동일하게 `"ollama-vision"`/`"paddle-onnx"` 선택. v3 에서 paddle 모델 경로 키(`det_model`/`rec_model`/`dict`/`score_thresh`/`unclip_ratio`/`max_boxes`)를 PDF 자체적으로 가질 수 있다(`KEBAB_PDF_OCR_*` env 동일). 활성화 후 옛 색인분은 `kebab ingest --force-reingest` 로 재처리.
|
||||
- **`--config <path>`** — 임시 워크스페이스 / 격리 테스트용 (CLI · TUI 모두 honor).
|
||||
- **`kebab config migrate`** — 새 버전에서 추가된 config 섹션을 기존 `config.toml` 에 설명 주석과 함께 채워 넣는다 (사용자가 손본 값·주석·순서는 보존, 멱등, 변경 시 자동 `.bak` 백업). `--dry-run` 으로 변경 미리보기. `kebab doctor` 가 갱신 필요 시 안내한다. `kebab init` 으로 새로 생성되는 config.toml 도 섹션별 주석을 포함한다.
|
||||
- **`KEBAB_*` env** — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN` 등).
|
||||
- **XDG layout**: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
|
||||
## 아키텍처
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
@@ -102,9 +208,9 @@ flowchart TB
|
||||
end
|
||||
|
||||
subgraph Pipeline["도메인 + 파이프라인"]
|
||||
parse["parse-md / parse-pdf / parse-image"]
|
||||
chunker["chunker (md-heading-v1, pdf-page-v1)"]
|
||||
embedder["embedder (fastembed multilingual-e5-small)"]
|
||||
parse["parse-md / parse-pdf / parse-image / parse-code"]
|
||||
chunker["chunker (md / pdf / code-AST / manifest)"]
|
||||
embedder["embedder (fastembed multilingual-e5-large)"]
|
||||
retriever["retriever (lexical / vector / hybrid RRF)"]
|
||||
rag["RAG pipeline"]
|
||||
end
|
||||
@@ -145,59 +251,22 @@ flowchart TB
|
||||
rag --> ollama
|
||||
```
|
||||
|
||||
`kebab-app` 가 facade — UI binary 가 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 자세한 crate-level 의존성 + 디렉토리 + 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
||||
v0.21.0 기준 핵심 설계:
|
||||
|
||||
## Configuration
|
||||
- **crate facade** — `kebab-app` 가 유일한 facade다. UI binary (`kebab-cli` / `kebab-tui`) 는 store / parse / search / llm / rag 를 직접 참조하지 않는다 (frozen 설계 §8). 각 user-facing 엔트리는 `*_with_config(cfg, …)` 동반 함수로 explicit config 를 thread 한다.
|
||||
- **chunk_id 는 위치 기반** — chunk 의 정체성은 문서 내 위치(ordinal + span)다. 반면 파생물 캐시 키는 **내용 해시**라, 내용이 같으면 위치·문서가 달라도 동일 캐시를 재사용한다.
|
||||
- **wire schema v1** — 모든 `--json` 출력은 `schema_version` 을 담는 frozen contract다. 깨는 변경은 `*.v2` major bump을 요구한다.
|
||||
- **versioning cascade** — `parser_version` / `chunker_version` / `embedding_version` / `prompt_template_version` / `index_version` 변경은 downstream record(청크·임베딩·캐시·eval)를 무효화한다.
|
||||
|
||||
- `~/.config/kebab/config.toml` — `kebab init` 가 XDG 경로에 생성. `[workspace]` (root, exclude — include 필드는 제거됨, 지원 형식은 자동 결정), `[storage]`, `[chunking]`, `[models.embedding]`, `[models.llm]`, `[image.ocr]`, `[image.caption]`, `[search]`, `[rag]`, `[ui]` 절. `[ui] theme = "dark" | "light"` 로 TUI 팔레트 선택 (default `"dark"`, 알 수 없는 값은 dark fallback). 옛 config 의 `workspace.include = [...]` 은 silently 무시 + 단발 deprecation warning (p9-fb-25).
|
||||
- `--config <path>` flag — 임시 워크스페이스 / 격리 테스트 시 사용. CLI / TUI 모두 honor.
|
||||
- `KEBAB_*` env — 일부 키 override (`KEBAB_RAG_SCORE_GATE`, `KEBAB_EVAL_GOLDEN`, `KEBAB_COMMIT_HASH` 등).
|
||||
- XDG layout: `~/.config/kebab/`, `~/.local/share/kebab/`, `~/.cache/kebab/`, `~/.local/state/kebab/`.
|
||||
- `workspace.root` 경로 형식: 절대 (`/foo/bar`) / tilde (`~/KnowledgeBase`, default) / env (`${XDG_DATA_HOME}/kebab`) / 상대 (`./notes`, `notes`, `../shared/x`) 모두 가능. **상대 경로의 base 는 config.toml 자체가 위치한 디렉토리** — 사용자의 `cwd` 와 무관 (`--config /tmp/cfg.toml` + `root = "kb"` → `/tmp/kb`). p9-fb-05 정책.
|
||||
|
||||
config 예시는 [docs/SMOKE.md](docs/SMOKE.md) 의 `/tmp/kebab-smoke/config.toml` 블록 참조.
|
||||
|
||||
## 외부 AI 통합
|
||||
|
||||
`--json` 출력 + frozen wire schema v1 가 stable contract. 통합 옵션:
|
||||
|
||||
- **Claude Code skill** — repo 의 [`integrations/claude-code/`](integrations/claude-code/) 가 ship-ready skill. `cp -r integrations/claude-code/kebab ~/.claude/skills/` 한 번이면 새 Claude Code 세션부터 자동 trigger (내부 시스템 / 위키 lookup / 사내 runbook 질문). multi-turn 은 `kebab ask --session <id> --json` 으로 영속 — skill 이 conversation id 관리하면 외부 agent 도 `--repl` 없이 stateful 대화 가능 (p9-fb-18).
|
||||
- **Codex / 기타 agent host** — `--json` + frozen wire schema v1 가 stable contract. 동일 패턴으로 ~50줄 wrapper 작성 가능. `integrations/<host>/` 에 추가 PR 환영.
|
||||
- **MCP server** — stdio JSON-RPC 로 `kebab-app` facade 1:1 노출. `kebab mcp` 참조.
|
||||
- **HTTP wrapper** — `kebab serve --bind 127.0.0.1:7711` (P+, local-only 가치 신중).
|
||||
|
||||
## MCP 사용
|
||||
|
||||
`kebab mcp` 가 stdio MCP server. 6 tool: `search` / `ask` / `schema` / `doctor` / `ingest_file` / `ingest_stdin`.
|
||||
|
||||
Claude Code 빠른 등록 (`~/.claude/mcp.json` 또는 host 동등 위치):
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"kebab": {
|
||||
"command": "kebab",
|
||||
"args": ["mcp"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
자세한 사용법 (Cursor / OpenAI Agents / Copilot CLI config, per-tool 입출력 예시, troubleshooting, multi-turn ask + session 관리, performance / security) — **[docs/mcp-usage.md](docs/mcp-usage.md)** 참조.
|
||||
crate-level 의존성 그래프 · 디렉토리 트리 · 확장자→chunker 전체 매핑 · 핵심 기술 결정은 [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md), 진척도는 [HANDOFF.md](HANDOFF.md).
|
||||
|
||||
## 비-목표
|
||||
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / 모든 파일 포맷의 완벽한 parsing / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding / `kebab://` protocol handler — frozen 설계 §11 / §0 참조.
|
||||
다중 사용자 SaaS / K8s / 원격 vector DB / enterprise RBAC / 실시간 협업 / agent 임의 파일 수정 / multi-workspace / LLM-as-judge eval / CLIP 시각 embedding — frozen 설계 §0 / §11 참조.
|
||||
|
||||
## 라이선스
|
||||
## 버전 / 라이선스 / 참고
|
||||
|
||||
`MIT OR Apache-2.0` (workspace `Cargo.toml` 의 `license` 필드).
|
||||
|
||||
## 참고
|
||||
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md)
|
||||
- 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)
|
||||
- Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md)
|
||||
- 머지 후 hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md)
|
||||
- Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md)
|
||||
- **버전**: v0.21.0 (`kebab --version` 으로 확인).
|
||||
- **라이선스**: `MIT OR Apache-2.0`.
|
||||
- 진척도: [HANDOFF.md](HANDOFF.md) · 아키텍처: [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) · Frozen 설계: [docs/superpowers/specs/2026-04-27-kebab-final-form-design.md](docs/superpowers/specs/2026-04-27-kebab-final-form-design.md)
|
||||
- Task 인덱스: [tasks/INDEX.md](tasks/INDEX.md) · Hotfix 로그: [tasks/HOTFIXES.md](tasks/HOTFIXES.md) · Smoke 절차: [docs/SMOKE.md](docs/SMOKE.md) · MCP 사용: [docs/mcp-usage.md](docs/mcp-usage.md)
|
||||
|
||||
@@ -12,17 +12,22 @@ kebab-core = { path = "../kebab-core" }
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
kebab-source-fs = { path = "../kebab-source-fs" }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-types = { path = "../kebab-parse-types" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
kebab-chunk = { path = "../kebab-chunk" }
|
||||
kebab-store-sqlite = { path = "../kebab-store-sqlite" }
|
||||
kebab-store-vector = { path = "../kebab-store-vector" }
|
||||
kebab-search = { path = "../kebab-search" }
|
||||
kebab-embed = { path = "../kebab-embed" }
|
||||
kebab-embed-local = { path = "../kebab-embed-local" }
|
||||
kebab-embed-candle = { path = "../kebab-embed-candle" }
|
||||
kebab-embed-ollama = { path = "../kebab-embed-ollama" }
|
||||
kebab-llm = { path = "../kebab-llm" }
|
||||
kebab-llm-local = { path = "../kebab-llm-local" }
|
||||
kebab-rag = { path = "../kebab-rag" }
|
||||
# p9-fb-41 PR-9c-2: facade construction of OnnxNliVerifier when
|
||||
# `[rag] nli_threshold > 0`. Trait-only consumption via kebab-rag's
|
||||
# `with_verifier`; no kebab-nli internals leak into kebab-app code
|
||||
# beyond the construction site in `open_with_config`.
|
||||
kebab-nli = { path = "../kebab-nli" }
|
||||
# P6-4: image extractor + OCR + caption adapters live here. App
|
||||
# threads them into the per-asset dispatch (see `ingest_one_asset`
|
||||
# image branch). Trait-only consumption — no `kebab-parse-image`
|
||||
@@ -32,11 +37,21 @@ kebab-parse-image = { path = "../kebab-parse-image" }
|
||||
# per-asset dispatch (see `ingest_one_asset` PDF branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::PdfPageV1Chunker`.
|
||||
kebab-parse-pdf = { path = "../kebab-parse-pdf" }
|
||||
lopdf = { workspace = true }
|
||||
# Enhancement 1 (v0.20.x r2): JPEG dimension decode in pdf_ocr_apply.rs.
|
||||
# jpeg feature added explicitly (F3 closure-r1) rather than relying on
|
||||
# feature unification via kebab-parse-image.
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# p10-1A-2: Rust AST extractor lives here. App threads it into the
|
||||
# per-asset dispatch (see `ingest_one_asset` Code branch) and runs the
|
||||
# resulting `CanonicalDocument` through `kebab-chunk::CodeRustAstV1Chunker`.
|
||||
kebab-parse-code = { path = "../kebab-parse-code" }
|
||||
anyhow = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
@@ -52,21 +67,42 @@ unicode-normalization = "0.1"
|
||||
# p9-fb-31: GitignoreBuilder for .kebabignore matching in ingest_file_with_config.
|
||||
# Same version as kebab-source-fs (0.4) to avoid duplicate dep versions.
|
||||
ignore = "0.4"
|
||||
# p9-fb-34: opaque pagination cursor encodes payload as base64.
|
||||
base64 = { workspace = true }
|
||||
# Enhancement 3 (v0.20.x r2): direct SQL queries for inspect_ocr_stats/failures.
|
||||
rusqlite = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
kebab-config = { path = "../kebab-config" }
|
||||
# doc-side expansion (Phase 2) Task 4: ExpansionGenerator unit tests build
|
||||
# MockLanguageModel (gated behind kebab-llm's `mock` feature, default OFF in
|
||||
# [dependencies]). Enabling it here turns it on for the test build only.
|
||||
kebab-llm = { path = "../kebab-llm", features = ["mock"] }
|
||||
rusqlite = { workspace = true }
|
||||
filetime = "0.2"
|
||||
tempfile = { workspace = true }
|
||||
# Image-pipeline integration tests use wiremock to stub Ollama for OCR
|
||||
# / caption HTTP calls. Async runtime to host the mock server only;
|
||||
# the kebab-app code under test stays sync.
|
||||
wiremock = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
image = { version = "0.25", default-features = false, features = ["png", "jpeg"] }
|
||||
# P7-3 PDF integration tests build in-memory PDF fixtures via the same
|
||||
# lopdf builder pattern `kebab-parse-pdf::tests::common` uses; pinned
|
||||
# to the same major (0.32) so byte output is identical between the two
|
||||
# fixture surfaces.
|
||||
lopdf = "0.32"
|
||||
lopdf = { workspace = true }
|
||||
# error_wire::tests::llm_unreachable_classifies_to_model_unreachable needs a real
|
||||
# reqwest::Error (private constructor) — built from a connect-refused call.
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
|
||||
[features]
|
||||
# Marker feature — spec §6.3 Option A (단순): lindera 는 kebab-chunk 가 default dep 으로 소유.
|
||||
# disable path 없음; 이 feature 는 spec §6.3 명시를 honor 하는 role 만.
|
||||
default = ["fts_korean_morphological"]
|
||||
fts_korean_morphological = []
|
||||
# opt-in (macOS): candle embedder runs on the Apple Silicon GPU. See kebab-embed-candle.
|
||||
embed_metal = ["kebab-embed-candle/metal"]
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
321
crates/kebab-app/src/bulk.rs
Normal file
321
crates/kebab-app/src/bulk.rs
Normal file
@@ -0,0 +1,321 @@
|
||||
//! p9-fb-42: bulk multi-query facade. Sequential for-loop reusing
|
||||
//! one App instance so embedder cold-start + LRU cache amortize
|
||||
//! across the N queries.
|
||||
|
||||
use anyhow::Context;
|
||||
use kebab_core::{
|
||||
BulkSearchItem, BulkSearchSummary, DocumentId, Lang, SearchFilters, SearchHit, SearchMode,
|
||||
SearchOpts, SearchQuery, TrustLevel,
|
||||
};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::{App, SearchResponse};
|
||||
|
||||
/// Hard cap on items per bulk call. Documented in spec — agents that
|
||||
/// hit this should batch-split.
|
||||
pub const BULK_QUERIES_MAX: usize = 100;
|
||||
|
||||
/// p9-fb-42: bulk search facade. Returns `(items, summary)` always
|
||||
/// — per-query failures embed `error.v1` JSON in the item rather
|
||||
/// than aborting the bulk call. Returns `Err` only for input
|
||||
/// validation failures (e.g. >100 queries).
|
||||
#[doc(hidden)]
|
||||
pub fn bulk_search_with_config(
|
||||
config: kebab_config::Config,
|
||||
raw_items: Vec<Value>,
|
||||
) -> anyhow::Result<(Vec<BulkSearchItem>, BulkSearchSummary)> {
|
||||
if raw_items.len() > BULK_QUERIES_MAX {
|
||||
anyhow::bail!(
|
||||
"queries: max {} items, got {}",
|
||||
BULK_QUERIES_MAX,
|
||||
raw_items.len()
|
||||
);
|
||||
}
|
||||
|
||||
let app = App::open_with_config(config).context("kebab-app: open for bulk_search")?;
|
||||
|
||||
let mut results: Vec<BulkSearchItem> = Vec::with_capacity(raw_items.len());
|
||||
let mut succeeded: u32 = 0;
|
||||
let mut failed: u32 = 0;
|
||||
|
||||
for raw in raw_items {
|
||||
let item = run_one(&app, raw);
|
||||
if item.error.is_some() {
|
||||
failed += 1;
|
||||
} else {
|
||||
succeeded += 1;
|
||||
}
|
||||
results.push(item);
|
||||
}
|
||||
|
||||
let summary = BulkSearchSummary {
|
||||
total: succeeded + failed,
|
||||
succeeded,
|
||||
failed,
|
||||
};
|
||||
Ok((results, summary))
|
||||
}
|
||||
|
||||
fn run_one(app: &App, raw: Value) -> BulkSearchItem {
|
||||
let echo = raw.clone();
|
||||
match parse_one(&raw) {
|
||||
Ok((query, opts)) => match app.search_with_opts(query, opts) {
|
||||
Ok(resp) => BulkSearchItem {
|
||||
query: echo,
|
||||
response: Some(serialize_search_response(&resp)),
|
||||
error: None,
|
||||
},
|
||||
Err(e) => BulkSearchItem {
|
||||
query: echo,
|
||||
response: None,
|
||||
error: Some(error_v1_json("retrieval_error", &format!("{e:#}"), None)),
|
||||
},
|
||||
},
|
||||
Err(msg) => BulkSearchItem {
|
||||
query: echo,
|
||||
response: None,
|
||||
error: Some(error_v1_json("invalid_input", &msg, None)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Mirror of `kebab-cli::wire::wire_search_response` — `SearchResponse`
|
||||
/// itself is not `Serialize`, so we build the `search_response.v1`-shaped
|
||||
/// JSON manually. Each hit also gets `score` promoted from
|
||||
/// `retrieval.fusion_score` per §2.2, matching the CLI wire layer.
|
||||
fn serialize_search_response(r: &SearchResponse) -> Value {
|
||||
let mut v = serde_json::json!({
|
||||
"schema_version": "search_response.v1",
|
||||
"hits": r.hits.iter().map(serialize_search_hit).collect::<Vec<_>>(),
|
||||
"next_cursor": r.next_cursor,
|
||||
"truncated": r.truncated,
|
||||
});
|
||||
if let Value::Object(ref mut map) = v {
|
||||
let trace_v = match &r.trace {
|
||||
Some(t) => serde_json::to_value(t).unwrap_or(Value::Null),
|
||||
None => Value::Null,
|
||||
};
|
||||
map.insert("trace".to_string(), trace_v);
|
||||
// v0.17.0 A5 Step 4b: only emit `hint` when set — matches
|
||||
// the CLI wire wrapper's additive emit pattern.
|
||||
if let Some(hint) = &r.hint {
|
||||
map.insert("hint".to_string(), Value::String(hint.clone()));
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
fn serialize_search_hit(h: &SearchHit) -> Value {
|
||||
let mut v = serde_json::to_value(h).unwrap_or(Value::Null);
|
||||
if let Value::Object(ref mut map) = v {
|
||||
if let Some(Value::Object(retrieval)) = map.get("retrieval") {
|
||||
if let Some(score) = retrieval.get("fusion_score").cloned() {
|
||||
map.insert("score".to_string(), score);
|
||||
}
|
||||
}
|
||||
map.insert(
|
||||
"schema_version".to_string(),
|
||||
Value::String("search_hit.v1".to_string()),
|
||||
);
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
fn parse_one(raw: &Value) -> Result<(SearchQuery, SearchOpts), String> {
|
||||
let obj = raw.as_object().ok_or("expected JSON object")?;
|
||||
let text = obj
|
||||
.get("query")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or(
|
||||
"missing required field: query \
|
||||
(expected {\"query\":\"<text>\",\"mode\":\"lexical|vector|hybrid\",\"k\":3,...})",
|
||||
)?
|
||||
.to_string();
|
||||
|
||||
let mode = match obj.get("mode").and_then(|v| v.as_str()) {
|
||||
None => SearchMode::Hybrid,
|
||||
Some("hybrid") => SearchMode::Hybrid,
|
||||
Some("lexical") => SearchMode::Lexical,
|
||||
Some("vector") => SearchMode::Vector,
|
||||
Some(other) => return Err(format!("invalid mode: {other:?}")),
|
||||
};
|
||||
|
||||
let k = obj
|
||||
.get("k")
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map_or(0, |n| n as usize); // 0 → use config default in app
|
||||
|
||||
let trust_min = match obj.get("trust_min").and_then(|v| v.as_str()) {
|
||||
None => None,
|
||||
Some("primary") => Some(TrustLevel::Primary),
|
||||
Some("secondary") => Some(TrustLevel::Secondary),
|
||||
Some("generated") => Some(TrustLevel::Generated),
|
||||
Some(other) => return Err(format!("invalid trust_min: {other:?}")),
|
||||
};
|
||||
|
||||
let ingested_after = match obj.get("ingested_after").and_then(|v| v.as_str()) {
|
||||
None => None,
|
||||
Some(s) => Some(
|
||||
time::OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339)
|
||||
.map_err(|e| format!("invalid ingested_after RFC3339 {s:?}: {e}"))?,
|
||||
),
|
||||
};
|
||||
|
||||
let media: Vec<String> = obj
|
||||
.get("media")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|arr| {
|
||||
arr.iter()
|
||||
.filter_map(|x| x.as_str().map(normalize_media_alias))
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let tags_any: Vec<String> = obj
|
||||
.get("tag")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|arr| {
|
||||
arr.iter()
|
||||
.filter_map(|x| x.as_str().map(String::from))
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let lang = obj
|
||||
.get("lang")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| Lang(s.to_string()));
|
||||
|
||||
let path_glob = obj
|
||||
.get("path_glob")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from);
|
||||
|
||||
let doc_id = obj
|
||||
.get("doc_id")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| DocumentId(s.to_string()));
|
||||
|
||||
let filters = SearchFilters {
|
||||
tags_any,
|
||||
lang,
|
||||
path_glob,
|
||||
trust_min,
|
||||
media,
|
||||
ingested_after,
|
||||
doc_id,
|
||||
repo: vec![],
|
||||
code_lang: vec![],
|
||||
};
|
||||
|
||||
let opts = SearchOpts {
|
||||
max_tokens: obj
|
||||
.get("max_tokens")
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
snippet_chars: obj
|
||||
.get("snippet_chars")
|
||||
.and_then(serde_json::Value::as_u64)
|
||||
.map(|n| n as usize),
|
||||
cursor: obj.get("cursor").and_then(|v| v.as_str()).map(String::from),
|
||||
trace: obj
|
||||
.get("trace")
|
||||
.and_then(serde_json::Value::as_bool)
|
||||
.unwrap_or(false),
|
||||
};
|
||||
|
||||
Ok((
|
||||
SearchQuery {
|
||||
text,
|
||||
mode,
|
||||
k,
|
||||
filters,
|
||||
},
|
||||
opts,
|
||||
))
|
||||
}
|
||||
|
||||
/// Canonicalize a media filter value: ASCII-lowercase it and expand the
/// `md` shorthand to `markdown`. Every other value is returned lowercased.
fn normalize_media_alias(s: &str) -> String {
    let lowered = s.to_ascii_lowercase();
    if lowered == "md" {
        String::from("markdown")
    } else {
        lowered
    }
}
|
||||
|
||||
fn error_v1_json(code: &str, message: &str, hint: Option<&str>) -> Value {
|
||||
serde_json::json!({
|
||||
"schema_version": "error.v1",
|
||||
"code": code,
|
||||
"message": message,
|
||||
"hint": hint,
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn open_temp() -> kebab_config::Config {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut cfg = kebab_config::Config::defaults();
|
||||
cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();
|
||||
// Bring up migrations so SqliteStore::open_existing succeeds inside App::open.
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
// Leak the tempdir on purpose: `mem::forget` skips its drop-time cleanup so the
// directory outlives this helper. Tests are short-lived; not worth threading the guard.
|
||||
std::mem::forget(dir);
|
||||
cfg
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_input_returns_empty_summary() {
|
||||
let cfg = open_temp();
|
||||
let (items, summary) = bulk_search_with_config(cfg, vec![]).unwrap();
|
||||
assert!(items.is_empty());
|
||||
assert_eq!(summary.total, 0);
|
||||
assert_eq!(summary.succeeded, 0);
|
||||
assert_eq!(summary.failed, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn over_cap_returns_err() {
|
||||
let cfg = open_temp();
|
||||
let raw: Vec<Value> = (0..101)
|
||||
.map(|_| serde_json::json!({"query": "x"}))
|
||||
.collect();
|
||||
let err = bulk_search_with_config(cfg, raw).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("max 100"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn invalid_item_emits_error_keeps_total_count() {
|
||||
let cfg = open_temp();
|
||||
let raw = vec![
|
||||
serde_json::json!({"query": "ok", "mode": "lexical"}),
|
||||
serde_json::json!({"mode": "lexical"}), // missing required `query`
|
||||
];
|
||||
let (items, summary) = bulk_search_with_config(cfg, raw).unwrap();
|
||||
assert_eq!(items.len(), 2);
|
||||
assert_eq!(summary.total, 2);
|
||||
// First item: lexical mode against empty corpus succeeds with empty hits.
|
||||
assert!(items[0].error.is_none());
|
||||
// Second item: missing required field.
|
||||
assert!(items[1].error.is_some());
|
||||
assert_eq!(items[1].error.as_ref().unwrap()["code"], "invalid_input");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn missing_query_error_message_includes_shape_hint() {
|
||||
let cfg = open_temp();
|
||||
let raw = vec![serde_json::json!({"mode": "lexical"})];
|
||||
let (items, _summary) = bulk_search_with_config(cfg, raw).unwrap();
|
||||
let err = items[0].error.as_ref().unwrap();
|
||||
let msg = err["message"].as_str().unwrap();
|
||||
assert!(
|
||||
msg.contains("query") && msg.contains("mode"),
|
||||
"missing shape hint in error message: {msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
75
crates/kebab-app/src/cursor.rs
Normal file
75
crates/kebab-app/src/cursor.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
//! p9-fb-34 opaque pagination cursor.
|
||||
//!
|
||||
//! Format: base64(JSON({offset: usize, corpus_revision: string})).
|
||||
//! Opaque to callers — they MUST NOT decode the contents themselves;
|
||||
//! the schema is internal and may change without notice.
|
||||
|
||||
use base64::Engine;
|
||||
use base64::engine::general_purpose::URL_SAFE_NO_PAD;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::error_wire::ErrorV1;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct Payload {
|
||||
offset: usize,
|
||||
corpus_revision: String,
|
||||
}
|
||||
|
||||
/// Encode `(offset, corpus_revision)` as an opaque base64 string.
|
||||
pub fn encode(offset: usize, corpus_revision: &str) -> String {
|
||||
let payload = Payload {
|
||||
offset,
|
||||
corpus_revision: corpus_revision.to_string(),
|
||||
};
|
||||
let json = serde_json::to_vec(&payload).expect("Payload serializes");
|
||||
URL_SAFE_NO_PAD.encode(&json)
|
||||
}
|
||||
|
||||
/// Decode an opaque cursor against the expected `corpus_revision`.
|
||||
/// Mismatch or malformed input returns an `ErrorV1` with
|
||||
/// `code = "stale_cursor"`.
|
||||
//
|
||||
// p9-fb-34: ErrorV1 is the workspace-wide wire error struct (~200B
|
||||
// after monomorphization with Value + String fields). Boxing here
|
||||
// would force every call site to deref through a Box for no win —
|
||||
// the err-path is rare. Single allow at the function level.
|
||||
//
|
||||
// p9-fb-34 round-1 review: differentiate the three failure modes
|
||||
// (base64 / JSON / revision mismatch) with distinct messages — all
|
||||
// keep `code = "stale_cursor"` so the agent's branching logic stays
|
||||
// the same, but humans reading the message get a precise hint.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn decode(s: &str, expected_revision: &str) -> Result<usize, ErrorV1> {
|
||||
let bytes = URL_SAFE_NO_PAD.decode(s.as_bytes()).map_err(|_| ErrorV1 {
|
||||
schema_version: "error.v1".to_string(),
|
||||
code: "stale_cursor".to_string(),
|
||||
message: "cursor is not valid base64. Re-issue search to obtain a fresh cursor."
|
||||
.to_string(),
|
||||
details: Value::Null,
|
||||
hint: None,
|
||||
})?;
|
||||
let payload: Payload = serde_json::from_slice(&bytes).map_err(|_| ErrorV1 {
|
||||
schema_version: "error.v1".to_string(),
|
||||
code: "stale_cursor".to_string(),
|
||||
message: "cursor payload is malformed. Re-issue search to obtain a fresh cursor."
|
||||
.to_string(),
|
||||
details: Value::Null,
|
||||
hint: None,
|
||||
})?;
|
||||
if payload.corpus_revision != expected_revision {
|
||||
return Err(ErrorV1 {
|
||||
schema_version: "error.v1".to_string(),
|
||||
code: "stale_cursor".to_string(),
|
||||
message: format!(
|
||||
"cursor was issued against corpus_revision '{}'; current revision is \
|
||||
'{}'. Re-issue search to obtain a fresh cursor.",
|
||||
payload.corpus_revision, expected_revision
|
||||
),
|
||||
details: Value::Null,
|
||||
hint: None,
|
||||
});
|
||||
}
|
||||
Ok(payload.offset)
|
||||
}
|
||||
61
crates/kebab-app/src/derivation_payload.rs
Normal file
61
crates/kebab-app/src/derivation_payload.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Derivation-cache payload encoding helpers (design 2026-05-31 §3.3).
|
||||
//!
|
||||
//! - embedding: `dimensions × f32` little-endian bytes (1024×4 = 4096 B/chunk).
|
||||
//! - alias / korean_tokens: UTF-8 as-is (handled inline by the caller — no
|
||||
//! helper needed, `String::as_bytes` / `String::from_utf8`).
|
||||
|
||||
/// Serialize an embedding as consecutive little-endian `f32` values
/// (§3.3): four bytes per component, in vector order.
pub fn encode_embedding(vector: &[f32]) -> Vec<u8> {
    vector.iter().copied().flat_map(f32::to_le_bytes).collect()
}
|
||||
|
||||
/// Rebuild an embedding from its little-endian `f32` byte string (§3.3).
///
/// Returns `None` when the payload length is not a whole number of
/// 4-byte words (a corrupt entry). The caller treats that as a cache
/// miss and recomputes, so a bad payload never yields a wrong vector.
pub fn decode_embedding(payload: &[u8]) -> Option<Vec<f32>> {
    const WIDTH: usize = std::mem::size_of::<f32>();

    let words = payload.chunks_exact(WIDTH);
    // Leftover bytes mean the length was not a multiple of the word size.
    if !words.remainder().is_empty() {
        return None;
    }

    let vector = words
        .map(|word| f32::from_le_bytes([word[0], word[1], word[2], word[3]]))
        .collect();
    Some(vector)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn roundtrips_vector() {
|
||||
let v = vec![0.0_f32, 1.5, -2.25, 3.125e10, f32::MIN, f32::MAX];
|
||||
let bytes = encode_embedding(&v);
|
||||
assert_eq!(bytes.len(), v.len() * 4);
|
||||
assert_eq!(decode_embedding(&bytes), Some(v));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_vector_roundtrips() {
|
||||
assert_eq!(encode_embedding(&[]), Vec::<u8>::new());
|
||||
assert_eq!(decode_embedding(&[]), Some(vec![]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn misaligned_payload_is_none() {
|
||||
assert_eq!(decode_embedding(&[1, 2, 3]), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn little_endian_layout_is_fixed() {
|
||||
// 1.0_f32 == 0x3F800000, little-endian bytes [0x00,0x00,0x80,0x3F].
|
||||
assert_eq!(encode_embedding(&[1.0]), vec![0x00, 0x00, 0x80, 0x3F]);
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,6 @@
|
||||
|
||||
pub use crate::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
|
||||
|
||||
pub use kebab_config::{ConfigInvalid, ConfigNotFound};
|
||||
pub use kebab_llm_local::LlmError;
|
||||
pub use kebab_config::ConfigInvalid;
|
||||
pub use kebab_store_sqlite::NotIndexed;
|
||||
|
||||
@@ -9,7 +9,13 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Value, json};
|
||||
|
||||
use crate::error_signal::{ConfigInvalid, LlmError, NotIndexed};
|
||||
use crate::error_signal::{ConfigInvalid, ConfigNotFound, LlmError, NotIndexed};
|
||||
|
||||
// p9-fb-34: `stale_cursor` is constructed directly by `cursor::decode`
|
||||
// and surfaced through `StructuredError` (an anyhow-friendly wrapper
|
||||
// that carries the typed `ErrorV1` payload without lossy string
|
||||
// formatting). `classify` short-circuits on it at the top of the
|
||||
// function so the typed `code = "stale_cursor"` reaches the wire.
|
||||
|
||||
/// Wire schema id for [`ErrorV1`]. Single source of truth — kebab-cli
|
||||
/// + kebab-mcp use this via `kebab_app::ERROR_V1_ID`.
|
||||
@@ -24,7 +30,29 @@ pub struct ErrorV1 {
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// p9-fb-34: typed wrapper around an [`ErrorV1`] so callers that
|
||||
/// surface `anyhow::Error` can downcast back to the structured wire
|
||||
/// payload instead of losing it to string formatting. Constructed by
|
||||
/// the cursor code path (`cursor::decode` → `App::search_with_opts`)
|
||||
/// and short-circuited inside [`classify`].
|
||||
#[derive(Debug)]
|
||||
pub struct StructuredError(pub ErrorV1);
|
||||
|
||||
impl std::fmt::Display for StructuredError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[{}] {}", self.0.code, self.0.message)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for StructuredError {}
|
||||
|
||||
pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
// p9-fb-34: structured wrapper short-circuits — preserves the
|
||||
// typed payload that callers (cursor::decode) constructed
|
||||
// instead of falling through to `code = "generic"`.
|
||||
if let Some(s) = err.downcast_ref::<StructuredError>() {
|
||||
return s.0.clone();
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<ConfigInvalid>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
@@ -37,6 +65,20 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
hint: Some("check `--config <path>` and TOML syntax".to_string()),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<ConfigNotFound>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "config_not_found".to_string(),
|
||||
message: s.to_string(),
|
||||
details: json!({
|
||||
"path": s.path.to_string_lossy(),
|
||||
}),
|
||||
hint: Some(
|
||||
"verify --config <path>; pass an existing toml file or omit --config to use XDG default"
|
||||
.to_string(),
|
||||
),
|
||||
};
|
||||
}
|
||||
if let Some(s) = err.downcast_ref::<NotIndexed>() {
|
||||
return ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
@@ -63,7 +105,7 @@ pub fn classify(err: &anyhow::Error, verbose: bool) -> ErrorV1 {
|
||||
}
|
||||
let mut details = json!({});
|
||||
if verbose {
|
||||
let chain: Vec<String> = err.chain().map(|c| c.to_string()).collect();
|
||||
let chain: Vec<String> = err.chain().map(std::string::ToString::to_string).collect();
|
||||
details = json!({"chain": chain});
|
||||
}
|
||||
ErrorV1 {
|
||||
@@ -130,7 +172,10 @@ mod tests {
|
||||
});
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "config_invalid");
|
||||
assert_eq!(v1.details.get("path").and_then(|p| p.as_str()), Some("/tmp/x.toml"));
|
||||
assert_eq!(
|
||||
v1.details.get("path").and_then(|p| p.as_str()),
|
||||
Some("/tmp/x.toml")
|
||||
);
|
||||
assert!(v1.hint.is_some());
|
||||
}
|
||||
|
||||
@@ -154,7 +199,8 @@ mod tests {
|
||||
// the resulting LlmError::Unreachable maps to "model_unreachable".
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.timeout(std::time::Duration::from_millis(500))
|
||||
.build().unwrap();
|
||||
.build()
|
||||
.unwrap();
|
||||
let err = client.get("http://127.0.0.1:1").send().unwrap_err();
|
||||
let llm = LlmError::Unreachable {
|
||||
endpoint: "http://127.0.0.1:1".to_string(),
|
||||
@@ -170,7 +216,10 @@ mod tests {
|
||||
let llm = LlmError::ModelNotPulled("gemma4:e4b".to_string());
|
||||
let v1 = classify(&anyhow::Error::new(llm), false);
|
||||
assert_eq!(v1.code, "model_not_pulled");
|
||||
assert_eq!(v1.details.get("model").and_then(|p| p.as_str()), Some("gemma4:e4b"));
|
||||
assert_eq!(
|
||||
v1.details.get("model").and_then(|p| p.as_str()),
|
||||
Some("gemma4:e4b")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -197,4 +246,39 @@ mod tests {
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "io_error");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stale_cursor_is_not_routed_through_classify() {
|
||||
use anyhow::anyhow;
|
||||
let err: anyhow::Error = anyhow!("stale_cursor: rev mismatch");
|
||||
let v1 = classify(&err, false);
|
||||
// p9-fb-34: stale_cursor is constructed directly by cursor::decode
|
||||
// (single source of truth). classify must not pattern-match on
|
||||
// anyhow string contents — that would create two sources of
|
||||
// truth. The bare anyhow string falls through to "generic".
|
||||
assert_ne!(
|
||||
v1.code, "stale_cursor",
|
||||
"classify must not produce stale_cursor from bare anyhow string"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stale_cursor_propagates_through_structured_wrapper() {
|
||||
// p9-fb-34: positive-side contract for the structured-wrapper
|
||||
// path. cursor::decode constructs a typed ErrorV1, the call site
|
||||
// wraps it in `StructuredError`, anyhow carries it, and classify
|
||||
// short-circuits via downcast — preserving the typed code +
|
||||
// message instead of falling through to "generic".
|
||||
let original = ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "stale_cursor".to_string(),
|
||||
message: "test stale cursor".to_string(),
|
||||
details: Value::Null,
|
||||
hint: None,
|
||||
};
|
||||
let err: anyhow::Error = anyhow::Error::new(StructuredError(original));
|
||||
let v1 = classify(&err, false);
|
||||
assert_eq!(v1.code, "stale_cursor");
|
||||
assert_eq!(v1.message, "test stale cursor");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,9 +36,7 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let already = existing
|
||||
.lines()
|
||||
.any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
let already = existing.lines().any(|line| line.trim() == KEBABIGNORE_LINE);
|
||||
if already {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -50,18 +48,14 @@ pub fn ensure_kebabignore_entry(workspace_root: &Path) -> Result<()> {
|
||||
if !existing.is_empty() && !existing.ends_with('\n') {
|
||||
file.write_all(b"\n")?;
|
||||
}
|
||||
writeln!(file, "{}", KEBABIGNORE_LINE)?;
|
||||
writeln!(file, "{KEBABIGNORE_LINE}")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Copy bytes to `<external_dir>/<blake3-12>.<ext>`. Idempotent — if the
|
||||
/// destination file already exists with the expected hash, the existing
|
||||
/// file is reused (no second write). Returns the destination path.
|
||||
pub fn copy_to_external(
|
||||
external_dir: &Path,
|
||||
bytes: &[u8],
|
||||
ext: &str,
|
||||
) -> Result<PathBuf> {
|
||||
pub fn copy_to_external(external_dir: &Path, bytes: &[u8], ext: &str) -> Result<PathBuf> {
|
||||
let hash = blake3::hash(bytes);
|
||||
let hex = hash.to_hex();
|
||||
let prefix = &hex.as_str()[..12];
|
||||
@@ -82,11 +76,7 @@ pub fn copy_to_external(
|
||||
/// Internal `yaml_quote` always uses double-quoted YAML form with backslash
|
||||
/// escapes for `"` / `\` / control chars — agent-supplied titles with
|
||||
/// special characters are safe.
|
||||
pub fn inject_frontmatter(
|
||||
body: &str,
|
||||
title: &str,
|
||||
source_uri: Option<&str>,
|
||||
) -> Result<String> {
|
||||
pub fn inject_frontmatter(body: &str, title: &str, source_uri: Option<&str>) -> Result<String> {
|
||||
let head = body.trim_start();
|
||||
if head.starts_with("---\n") || head.starts_with("---\r\n") || head.starts_with("---\r") {
|
||||
anyhow::bail!(
|
||||
|
||||
449
crates/kebab-app/src/fetch.rs
Normal file
449
crates/kebab-app/src/fetch.rs
Normal file
@@ -0,0 +1,449 @@
|
||||
//! p9-fb-35 verbatim fetch implementation.
|
||||
//!
|
||||
//! [`App::fetch`] is the facade entry point. It dispatches on
|
||||
//! [`FetchQuery`] variants:
|
||||
//!
|
||||
//! - `Chunk(id)` — return the chunk row from `chunks.text`, optionally
|
||||
//! with ±N surrounding chunks (`FetchOpts::context`).
|
||||
//! - `Doc(id)` — return the entire document re-serialized to markdown.
|
||||
//! (Implemented in Task 4.)
|
||||
//! - `Span { doc_id, line_start, line_end }` — return a contiguous line
|
||||
//! slice. (Implemented in Task 5.)
|
||||
//!
|
||||
//! Errors are surfaced as [`StructuredError`] (anyhow-friendly wrapper
|
||||
//! around `ErrorV1`) so the CLI / MCP wire layer's `classify` keeps the
|
||||
//! typed `code` (`chunk_not_found` / `doc_not_found` /
|
||||
//! `span_not_supported`) instead of falling through to `code =
|
||||
//! "generic"`.
|
||||
|
||||
use anyhow::Result;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, Chunk, ChunkId, DocumentId, DocumentStore, FetchKind, FetchOpts,
|
||||
FetchQuery, FetchResult,
|
||||
};
|
||||
|
||||
use crate::App;
|
||||
use crate::error_wire::{ERROR_V1_ID, ErrorV1, StructuredError};
|
||||
use crate::staleness::compute_stale;
|
||||
|
||||
impl App {
|
||||
/// p9-fb-35: verbatim fetch facade. Returns text from
|
||||
/// `chunks.text` / `CanonicalDocument` based on the requested
|
||||
/// mode. Errors surface as `StructuredError(ErrorV1)` with one
|
||||
/// of `chunk_not_found` / `doc_not_found` / `span_not_supported`
|
||||
/// so the wire-layer classifier preserves the typed code.
|
||||
pub fn fetch(&self, query: FetchQuery, opts: FetchOpts) -> Result<FetchResult> {
|
||||
match query {
|
||||
FetchQuery::Chunk(id) => fetch_chunk(self, id, opts),
|
||||
FetchQuery::Doc(id) => fetch_doc(self, id, opts),
|
||||
FetchQuery::Span {
|
||||
doc_id,
|
||||
line_start,
|
||||
line_end,
|
||||
} => fetch_span(self, doc_id, line_start, line_end, opts),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_chunk(app: &App, id: ChunkId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let target = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "chunk_not_found".to_string(),
|
||||
message: format!("chunk_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let doc_id = target.doc_id.clone();
|
||||
let doc =
|
||||
<kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &doc_id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!(
|
||||
"doc_id '{}' (parent of chunk '{}') not found",
|
||||
doc_id.0, id.0
|
||||
),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let (context_before, context_after) = match opts.context {
|
||||
Some(n) if n > 0 => surrounding_chunks(app, &doc_id, &id, n)?,
|
||||
_ => (Vec::new(), Vec::new()),
|
||||
};
|
||||
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let stale = compute_stale(
|
||||
doc_metadata_updated_at(&doc),
|
||||
now,
|
||||
app.config.search.stale_threshold_days,
|
||||
);
|
||||
|
||||
Ok(FetchResult {
|
||||
kind: FetchKind::Chunk,
|
||||
doc_id: doc.doc_id.clone(),
|
||||
doc_path: doc.workspace_path.clone(),
|
||||
indexed_at: doc_metadata_updated_at(&doc),
|
||||
stale,
|
||||
chunk: Some(target),
|
||||
context_before,
|
||||
context_after,
|
||||
text: None,
|
||||
line_start: None,
|
||||
line_end: None,
|
||||
effective_end: None,
|
||||
truncated: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn fetch_doc(app: &App, id: DocumentId, opts: FetchOpts) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
let mut text = fmt_canonical_to_markdown(&doc);
|
||||
let mut truncated = false;
|
||||
if let Some(max_tokens) = opts.max_tokens {
|
||||
let max_chars = max_tokens.saturating_mul(4);
|
||||
if text.chars().count() > max_chars {
|
||||
text = trim_to_chars(&text, max_chars);
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let stale = compute_stale(
|
||||
doc_metadata_updated_at(&doc),
|
||||
now,
|
||||
app.config.search.stale_threshold_days,
|
||||
);
|
||||
|
||||
Ok(FetchResult {
|
||||
kind: FetchKind::Doc,
|
||||
doc_id: doc.doc_id.clone(),
|
||||
doc_path: doc.workspace_path.clone(),
|
||||
indexed_at: doc_metadata_updated_at(&doc),
|
||||
stale,
|
||||
chunk: None,
|
||||
context_before: Vec::new(),
|
||||
context_after: Vec::new(),
|
||||
text: Some(text),
|
||||
line_start: None,
|
||||
line_end: None,
|
||||
effective_end: None,
|
||||
truncated,
|
||||
})
|
||||
}
|
||||
|
||||
/// p9-fb-35: keep at most the first `n` characters of `s` (Unicode-safe:
/// the cut always falls on a `char` boundary, never inside a multi-byte
/// sequence).
///
/// Returns a copy of the whole string when it has `n` characters or
/// fewer. Behaves like fb-34's helper in `crates/kebab-app/src/app.rs`;
/// kept local to avoid re-exporting an internal helper.
fn trim_to_chars(s: &str, n: usize) -> String {
    // The byte offset of character `n` (if there is one) is exactly where
    // the prefix ends. One pass, and the copy is sized to the prefix —
    // no `n * 4` capacity guess that over-allocates or could overflow.
    match s.char_indices().nth(n) {
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}
|
||||
|
||||
fn fetch_span(
|
||||
app: &App,
|
||||
id: DocumentId,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
opts: FetchOpts,
|
||||
) -> Result<FetchResult> {
|
||||
let doc = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_document(&app.sqlite, &id)?
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "doc_not_found".to_string(),
|
||||
message: format!("doc_id '{}' not found", id.0),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
}))
|
||||
})?;
|
||||
|
||||
// Reject line-incompatible media types (PDF / audio). `SourceType`
|
||||
// (markdown / note / paper / reference / inbox) is the *user-facing*
|
||||
// category, not the rendering format — the actual byte-level format
|
||||
// lives on the source `RawAsset.media_type`. Look it up via
|
||||
// doc.source_asset_id (PRIMARY KEY) so twin files (identical content
|
||||
// at different paths) always read *this* document's own asset row,
|
||||
// not whichever twin last wrote `assets.workspace_path`.
|
||||
if let Some(asset) = <kebab_store_sqlite::SqliteStore as DocumentStore>::get_asset(
|
||||
&app.sqlite,
|
||||
&doc.source_asset_id,
|
||||
)? {
|
||||
if matches!(
|
||||
asset.media_type,
|
||||
kebab_core::MediaType::Pdf | kebab_core::MediaType::Audio(_)
|
||||
) {
|
||||
return Err(anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "span_not_supported".to_string(),
|
||||
message: format!(
|
||||
"doc '{}' has media_type {:?}; line-based span fetch unsupported. \
|
||||
Use `fetch chunk` or `fetch doc` instead.",
|
||||
id.0, asset.media_type
|
||||
),
|
||||
details: serde_json::Value::Null,
|
||||
hint: Some("kind = chunk or kind = doc instead".to_string()),
|
||||
})));
|
||||
}
|
||||
}
|
||||
|
||||
if line_start == 0 || line_end == 0 || line_end < line_start {
|
||||
return Err(anyhow::Error::new(StructuredError(ErrorV1 {
|
||||
schema_version: ERROR_V1_ID.to_string(),
|
||||
code: "invalid_input".to_string(),
|
||||
message: format!(
|
||||
"line_start ({line_start}) and line_end ({line_end}) must be 1-based with start <= end"
|
||||
),
|
||||
details: serde_json::Value::Null,
|
||||
hint: None,
|
||||
})));
|
||||
}
|
||||
|
||||
let full = fmt_canonical_to_markdown(&doc);
|
||||
let lines: Vec<&str> = full.lines().collect();
|
||||
let total = lines.len() as u32;
|
||||
|
||||
// p9-fb-35 round-1 review fix: empty / out-of-range request must
|
||||
// not slice. Returning empty text + `effective_end = line_start - 1`
|
||||
// lets the caller detect "no lines fetched" via
|
||||
// `text.is_empty() && effective_end < line_start`. `truncated`
|
||||
// stays false because line-range clamp is NOT a budget event —
|
||||
// budget-driven truncation is the only thing `truncated` signals.
|
||||
if total == 0 || line_start > total {
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let stale = compute_stale(
|
||||
doc_metadata_updated_at(&doc),
|
||||
now,
|
||||
app.config.search.stale_threshold_days,
|
||||
);
|
||||
return Ok(FetchResult {
|
||||
kind: FetchKind::Span,
|
||||
doc_id: doc.doc_id.clone(),
|
||||
doc_path: doc.workspace_path.clone(),
|
||||
indexed_at: doc_metadata_updated_at(&doc),
|
||||
stale,
|
||||
chunk: None,
|
||||
context_before: Vec::new(),
|
||||
context_after: Vec::new(),
|
||||
text: Some(String::new()),
|
||||
line_start: Some(line_start),
|
||||
line_end: Some(line_end),
|
||||
// saturating_sub: when line_start = 1 we end at 0, signaling
|
||||
// "no lines fetched" without underflowing u32.
|
||||
effective_end: Some(line_start.saturating_sub(1)),
|
||||
truncated: false,
|
||||
});
|
||||
}
|
||||
|
||||
let effective_end_raw = line_end.min(total);
|
||||
let lo = (line_start - 1) as usize;
|
||||
let hi = effective_end_raw as usize;
|
||||
let mut text = lines[lo..hi].join("\n");
|
||||
|
||||
// p9-fb-35 round-1 review fix: `truncated` is reserved for
|
||||
// budget-driven truncation only. Line-range clamp (line_end >
|
||||
// total) is signaled via `effective_end < line_end`, not via
|
||||
// `truncated`.
|
||||
let mut truncated = false;
|
||||
let mut effective_end = effective_end_raw;
|
||||
if let Some(max_tokens) = opts.max_tokens {
|
||||
let max_chars = max_tokens.saturating_mul(4);
|
||||
if text.chars().count() > max_chars {
|
||||
text = trim_to_chars(&text, max_chars);
|
||||
truncated = true;
|
||||
let kept = text.lines().count() as u32;
|
||||
effective_end = (line_start - 1) + kept;
|
||||
}
|
||||
}
|
||||
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let stale = compute_stale(
|
||||
doc_metadata_updated_at(&doc),
|
||||
now,
|
||||
app.config.search.stale_threshold_days,
|
||||
);
|
||||
|
||||
Ok(FetchResult {
|
||||
kind: FetchKind::Span,
|
||||
doc_id: doc.doc_id.clone(),
|
||||
doc_path: doc.workspace_path.clone(),
|
||||
indexed_at: doc_metadata_updated_at(&doc),
|
||||
stale,
|
||||
chunk: None,
|
||||
context_before: Vec::new(),
|
||||
context_after: Vec::new(),
|
||||
text: Some(text),
|
||||
line_start: Some(line_start),
|
||||
line_end: Some(line_end),
|
||||
effective_end: Some(effective_end),
|
||||
truncated,
|
||||
})
|
||||
}
|
||||
|
||||
/// p9-fb-35: list chunks for a document in ordinal order, return
|
||||
/// `(before, after)` slices around the target chunk_id. `n` caps each
|
||||
/// side independently — the worst case is `2n` total neighbors when
|
||||
/// the target sits in the middle of the doc.
|
||||
fn surrounding_chunks(
|
||||
app: &App,
|
||||
doc_id: &DocumentId,
|
||||
target: &ChunkId,
|
||||
n: u32,
|
||||
) -> Result<(Vec<Chunk>, Vec<Chunk>)> {
|
||||
let chunks = list_chunks_in_order(app, doc_id)?;
|
||||
let target_idx = chunks
|
||||
.iter()
|
||||
.position(|c| c.chunk_id == *target)
|
||||
.ok_or_else(|| anyhow::anyhow!("chunk not found in doc chunk list"))?;
|
||||
let n = n as usize;
|
||||
let lo = target_idx.saturating_sub(n);
|
||||
let hi = target_idx
|
||||
.saturating_add(n)
|
||||
.saturating_add(1)
|
||||
.min(chunks.len());
|
||||
let before: Vec<Chunk> = chunks[lo..target_idx].to_vec();
|
||||
let after: Vec<Chunk> = chunks[target_idx + 1..hi].to_vec();
|
||||
Ok((before, after))
|
||||
}
|
||||
|
||||
/// p9-fb-35: chunks have no explicit ordinal column, so the underlying
|
||||
/// helper sorts by `(created_at, chunk_id)` which matches insertion
|
||||
/// order produced by the chunker (deterministic). The actual SQL lives
|
||||
/// inside `kebab-store-sqlite` (`SqliteStore::list_chunk_ids_for_doc`)
|
||||
/// to keep the facade crate free of direct rusqlite usage.
|
||||
fn list_chunks_in_order(app: &App, doc_id: &DocumentId) -> Result<Vec<Chunk>> {
|
||||
let chunk_ids = app.sqlite.list_chunk_ids_for_doc(doc_id)?;
|
||||
let mut out: Vec<Chunk> = Vec::with_capacity(chunk_ids.len());
|
||||
for cid in chunk_ids {
|
||||
if let Some(chunk) =
|
||||
<kebab_store_sqlite::SqliteStore as DocumentStore>::get_chunk(&app.sqlite, &cid)?
|
||||
{
|
||||
out.push(chunk);
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Return the document's `metadata.updated_at` timestamp. The fetch paths
/// in this file report it as `indexed_at` and feed it to the staleness
/// check.
fn doc_metadata_updated_at(doc: &CanonicalDocument) -> OffsetDateTime {
    let metadata = &doc.metadata;
    metadata.updated_at
}
|
||||
|
||||
/// p9-fb-35: serialize a `CanonicalDocument` back to markdown. Best-
|
||||
/// effort round-trip — inline-styled spans (Strong/Emph children)
|
||||
/// flatten to plain text via the already-flattened `TextBlock.text`
|
||||
/// field. Good enough for an agent reading verbatim context. Used by
|
||||
/// Task 4 (doc mode) and Task 5 (span mode).
|
||||
pub(crate) fn fmt_canonical_to_markdown(doc: &CanonicalDocument) -> String {
|
||||
let mut out = String::with_capacity(1024);
|
||||
for (i, block) in doc.blocks.iter().enumerate() {
|
||||
if i > 0 {
|
||||
out.push_str("\n\n");
|
||||
}
|
||||
match block {
|
||||
Block::Heading(h) => {
|
||||
let level = h.level.clamp(1, 6) as usize;
|
||||
for _ in 0..level {
|
||||
out.push('#');
|
||||
}
|
||||
out.push(' ');
|
||||
out.push_str(&h.text);
|
||||
}
|
||||
Block::Paragraph(t) => out.push_str(&t.text),
|
||||
Block::Quote(t) => {
|
||||
// Prefix every line with `> ` so block-quote round-trips.
|
||||
for (li, line) in t.text.split('\n').enumerate() {
|
||||
if li > 0 {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str("> ");
|
||||
out.push_str(line);
|
||||
}
|
||||
}
|
||||
Block::List(l) => {
|
||||
for (idx, item) in l.items.iter().enumerate() {
|
||||
if idx > 0 {
|
||||
out.push('\n');
|
||||
}
|
||||
if l.ordered {
|
||||
out.push_str(&format!("{}. {}", idx + 1, item.text));
|
||||
} else {
|
||||
out.push_str(&format!("- {}", item.text));
|
||||
}
|
||||
}
|
||||
}
|
||||
Block::Code(c) => {
|
||||
out.push_str("```");
|
||||
if let Some(lang) = &c.lang {
|
||||
out.push_str(lang);
|
||||
}
|
||||
out.push('\n');
|
||||
out.push_str(&c.code);
|
||||
if !c.code.ends_with('\n') {
|
||||
out.push('\n');
|
||||
}
|
||||
out.push_str("```");
|
||||
}
|
||||
Block::Table(t) => {
|
||||
out.push_str(&t.headers.join(" | "));
|
||||
out.push('\n');
|
||||
// Markdown table separator — N copies of `---|` is
|
||||
// acceptable for a verbatim re-serialization (renderer
|
||||
// tolerates trailing pipe).
|
||||
out.push_str(&"---|".repeat(t.headers.len()));
|
||||
for row in &t.rows {
|
||||
out.push('\n');
|
||||
out.push_str(&row.join(" | "));
|
||||
}
|
||||
}
|
||||
Block::ImageRef(img) => {
|
||||
out.push_str(&format!("", img.alt, img.src));
|
||||
}
|
||||
Block::AudioRef(_a) => {
|
||||
// Canonical doc carries the transcript on AudioRefBlock,
|
||||
// but markdown has no native audio embed. Emit a stub
|
||||
// marker so the agent sees something ran here.
|
||||
out.push_str("(audio reference)");
|
||||
}
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// p9-fb-35: free-function entry for CLI / MCP. Mirrors the
|
||||
/// `*_with_config` pattern documented in the kebab-app crate root —
|
||||
/// `kebab-cli` calls this so a `--config <path>` flag is honored.
|
||||
#[doc(hidden)]
|
||||
pub fn fetch_with_config(
|
||||
config: kebab_config::Config,
|
||||
query: FetchQuery,
|
||||
opts: FetchOpts,
|
||||
) -> Result<FetchResult> {
|
||||
App::open_with_config(config)?.fetch(query, opts)
|
||||
}
|
||||
446
crates/kebab-app/src/ingest_log.rs
Normal file
446
crates/kebab-app/src/ingest_log.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
//! Per-ingest-run structured ndjson log writer (v0.20.x ingest log feature).
|
||||
//!
|
||||
//! Each `kebab ingest` run produces one `ingest-{run_id}.ndjson` file in
|
||||
//! `config.logging.ingest_log_dir`. Records are appended line by line; the
|
||||
//! last record is always `kind="summary"`. `IngestLogWriter::open` returns
|
||||
//! `Ok(None)` when `ingest_log_enabled = false` so callers need not branch.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
pub struct IngestLogWriter {
|
||||
file: BufWriter<File>,
|
||||
path: PathBuf,
|
||||
run_id: String,
|
||||
started_at: SystemTime,
|
||||
}
|
||||
|
||||
impl IngestLogWriter {
|
||||
/// Open a new log file. Returns `Ok(None)` when `cfg.ingest_log_enabled == false` (AC-6).
|
||||
pub fn open(cfg: &kebab_config::LoggingCfg) -> anyhow::Result<Option<Self>> {
|
||||
if !cfg.ingest_log_enabled {
|
||||
return Ok(None);
|
||||
}
|
||||
let run_id = generate_run_id();
|
||||
let log_dir = expand_log_dir(&cfg.ingest_log_dir);
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
// Cleanup before creating the new file (non-critical: warn on error).
|
||||
if let Err(e) = cleanup_old_logs(&log_dir, cfg.keep_recent_runs, cfg.retention_days) {
|
||||
tracing::warn!(target: "kebab-app", "ingest log cleanup failed: {e}");
|
||||
}
|
||||
let path = log_dir.join(format!("ingest-{run_id}.ndjson"));
|
||||
let file = BufWriter::new(File::create(&path)?);
|
||||
Ok(Some(Self {
|
||||
file,
|
||||
path,
|
||||
run_id,
|
||||
started_at: SystemTime::now(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn write_event(&mut self, event: &LogEvent<'_>) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, event)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_summary(&mut self, summary: &IngestSummary) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(&mut self.file, summary)?;
|
||||
writeln!(self.file)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
self.file.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_id(&self) -> &str {
|
||||
&self.run_id
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
pub fn started_at(&self) -> SystemTime {
|
||||
self.started_at
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IngestLogWriter {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.file.flush();
|
||||
}
|
||||
}
|
||||
|
||||
/// ISO 8601 compact timestamp + uuid v7 suffix: `20260528T013000Z-abc123de`.
|
||||
/// uuid v7 is the workspace dep (Cargo.toml); `rand` is not added (spec §6 R-5).
|
||||
fn generate_run_id() -> String {
|
||||
use time::macros::format_description;
|
||||
let now = time::OffsetDateTime::now_utc();
|
||||
let ts = now
|
||||
.format(format_description!(
|
||||
"[year][month][day]T[hour][minute][second]Z"
|
||||
))
|
||||
.unwrap_or_else(|_| "19700101T000000Z".to_string());
|
||||
let uid = uuid::Uuid::now_v7().simple().to_string();
|
||||
let suffix = &uid[uid.len() - 8..];
|
||||
format!("{ts}-{suffix}")
|
||||
}
|
||||
|
||||
/// Expand `{state_dir}` placeholder → XDG state dir (spec §6 R-3).
|
||||
/// Other tilde/env expansion is delegated to `kebab_config::expand_path`.
|
||||
fn expand_log_dir(path: &Path) -> PathBuf {
|
||||
let path_str = path.to_string_lossy();
|
||||
if path_str.contains("{state_dir}") {
|
||||
let state_dir = kebab_config::Config::xdg_state_dir();
|
||||
PathBuf::from(path_str.replace("{state_dir}", &state_dir.to_string_lossy()))
|
||||
} else {
|
||||
path.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// RFC 3339 UTC timestamp for log records.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn now_ts() -> String {
|
||||
time::OffsetDateTime::now_utc()
|
||||
.format(&Rfc3339)
|
||||
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
|
||||
}
|
||||
|
||||
/// Ingest event record (ndjson line). `kind` is the discriminator.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||
pub enum LogEvent<'a> {
|
||||
Ocr {
|
||||
ts: String,
|
||||
/// v0.20.x r2: additive field — doc_id for dual-write SQLite correlation.
|
||||
/// Round 1 ndjson logs deserialize with doc_id=None (Serde Option default).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
doc_id: Option<&'a str>,
|
||||
doc_path: &'a str,
|
||||
page: u32,
|
||||
image_byte_size: Option<u64>,
|
||||
image_width: Option<u32>,
|
||||
image_height: Option<u32>,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
success: bool,
|
||||
reason: Option<&'a str>,
|
||||
ocr_engine: &'a str,
|
||||
},
|
||||
ParseError {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
Skip {
|
||||
ts: String,
|
||||
doc_path: &'a str,
|
||||
reason: &'a str,
|
||||
detail: Option<&'a str>,
|
||||
},
|
||||
Error {
|
||||
ts: String,
|
||||
code: &'a str,
|
||||
message: &'a str,
|
||||
},
|
||||
}
|
||||
|
||||
/// Final summary record — always the last line of the log file.
|
||||
/// Explicit `kind` field serializes to `"kind": "summary"`.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct IngestSummary {
|
||||
pub kind: String,
|
||||
pub ts: String,
|
||||
pub run_id: String,
|
||||
pub scanned: u32,
|
||||
pub new: u32,
|
||||
pub errors: u32,
|
||||
pub ocr_pages: u32,
|
||||
pub ocr_failures: u32,
|
||||
pub ocr_p50_ms: Option<u64>,
|
||||
pub ocr_p90_ms: Option<u64>,
|
||||
pub ocr_max_ms: Option<u64>,
|
||||
pub duration_ms: u64,
|
||||
}
|
||||
|
||||
impl IngestSummary {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
ts: String,
|
||||
run_id: String,
|
||||
scanned: u32,
|
||||
new: u32,
|
||||
errors: u32,
|
||||
ocr_pages: u32,
|
||||
ocr_failures: u32,
|
||||
ocr_ms_samples: &[u64],
|
||||
duration_ms: u64,
|
||||
) -> Self {
|
||||
let (p50, p90, _p99, max) = percentiles(ocr_ms_samples);
|
||||
Self {
|
||||
kind: "summary".to_string(),
|
||||
ts,
|
||||
run_id,
|
||||
scanned,
|
||||
new,
|
||||
errors,
|
||||
ocr_pages,
|
||||
ocr_failures,
|
||||
ocr_p50_ms: p50,
|
||||
ocr_p90_ms: p90,
|
||||
ocr_max_ms: max,
|
||||
duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pick percentiles out of `samples` by index into a sorted copy.
///
/// Returns `(p50, p90, p99, max)`; every element is `None` when `samples`
/// is empty. The percentile `p` is the element at index
/// `(len - 1) * p / 100` (integer division), so no interpolation happens
/// and small inputs round toward the lower sample.
/// p99 surfaces via `inspect ocr-stats`; `IngestSummary` uses p50/p90/max only.
pub(crate) fn percentiles(samples: &[u64]) -> (Option<u64>, Option<u64>, Option<u64>, Option<u64>) {
    let mut ordered = samples.to_vec();
    ordered.sort_unstable();

    // Index of the largest sample; absent exactly when there are no samples.
    let top = match ordered.len().checked_sub(1) {
        Some(last_index) => last_index,
        None => return (None, None, None, None),
    };

    let pick = |pct: usize| ordered[top * pct / 100];
    (
        Some(pick(50)),
        Some(pick(90)),
        Some(pick(99)),
        Some(ordered[top]),
    )
}
|
||||
|
||||
/// Delete old ingest log files from `log_dir`.
|
||||
///
|
||||
/// **Retention rule (§3.4 OR-on-stale semantics):**
|
||||
/// Keep a file iff BOTH conditions hold: (idx < keep_recent) AND (modified > cutoff).
|
||||
/// Delete iff (idx >= keep_recent) OR (modified <= cutoff) — either stale condition
|
||||
/// triggers deletion. Files are indexed newest-first so `idx=0` is the most recent.
|
||||
pub(crate) fn cleanup_old_logs(
|
||||
log_dir: &Path,
|
||||
keep_recent: u32,
|
||||
retention_days: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entries: Vec<_> = std::fs::read_dir(log_dir)?
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
e.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.is_some_and(|s| s.starts_with("ingest-") && s.ends_with(".ndjson"))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort newest-first by mtime (files without mtime go to the end).
|
||||
entries.sort_by_key(|e| std::cmp::Reverse(e.metadata().ok().and_then(|m| m.modified().ok())));
|
||||
|
||||
let cutoff = SystemTime::now()
|
||||
.checked_sub(std::time::Duration::from_secs(
|
||||
u64::from(retention_days) * 86400,
|
||||
))
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
|
||||
for (idx, entry) in entries.into_iter().enumerate() {
|
||||
let modified = entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.unwrap_or(SystemTime::UNIX_EPOCH);
|
||||
// Keep iff (idx < keep_recent) AND (modified > cutoff).
|
||||
if (idx as u32) < keep_recent && modified > cutoff {
|
||||
continue;
|
||||
}
|
||||
if let Err(e) = std::fs::remove_file(entry.path()) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
"failed to remove old log {}: {e}",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_config::LoggingCfg;
    use std::time::SystemTime;
    use tempfile::TempDir;

    /// Names of all entries left in `dir`, sorted, so assertions can check
    /// which files survived a cleanup rather than only how many.
    fn remaining_names(dir: &Path) -> Vec<String> {
        let mut names: Vec<String> = std::fs::read_dir(dir)
            .unwrap()
            .filter_map(Result::ok)
            .map(|e| e.file_name().to_string_lossy().into_owned())
            .collect();
        names.sort();
        names
    }

    #[test]
    fn generate_run_id_has_iso_prefix_and_8_hex_suffix() {
        let id = generate_run_id();
        // Format: YYYYMMDDTHHmmssZ-xxxxxxxx (total len = 16+1+8 = 25)
        assert_eq!(id.len(), 25, "run_id len should be 25: {id}");
        let (prefix, suffix) = id.split_once('-').expect("run_id should contain '-'");
        assert_eq!(prefix.len(), 16, "prefix should be 16 chars: {prefix}");
        assert!(prefix.contains('T'), "prefix should contain T: {prefix}");
        assert!(prefix.ends_with('Z'), "prefix should end with Z: {prefix}");
        assert_eq!(suffix.len(), 8, "suffix should be 8 chars: {suffix}");
        assert!(
            suffix.chars().all(|c| c.is_ascii_hexdigit()),
            "suffix should be hex: {suffix}"
        );
    }

    #[test]
    fn expand_log_dir_substitutes_state_dir_placeholder() {
        let input = PathBuf::from("{state_dir}/logs");
        let expanded = expand_log_dir(&input);
        let expected = kebab_config::Config::xdg_state_dir().join("logs");
        assert_eq!(expanded, expected);
        assert!(!expanded.to_string_lossy().contains("{state_dir}"));
    }

    #[test]
    fn writer_disabled_returns_none() {
        let cfg = LoggingCfg {
            ingest_log_enabled: false,
            ingest_log_dir: PathBuf::from("/tmp/should-not-exist"),
            ..Default::default()
        };
        let result = IngestLogWriter::open(&cfg).expect("open should not error");
        assert!(result.is_none(), "disabled writer should return None");
    }

    #[test]
    fn writer_writes_one_event_per_line_with_kind_discriminator() {
        let tmp = TempDir::new().unwrap();
        let cfg = LoggingCfg {
            ingest_log_enabled: true,
            ingest_log_dir: tmp.path().to_path_buf(),
            ..Default::default()
        };
        let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
        let path = writer.path().to_path_buf();

        writer
            .write_event(&LogEvent::Skip {
                ts: now_ts(),
                doc_path: "a.zip",
                reason: "builtin_blacklist",
                detail: Some(".zip extension"),
            })
            .unwrap();
        writer
            .write_event(&LogEvent::Error {
                ts: now_ts(),
                code: "ingest_fatal",
                message: "something bad",
            })
            .unwrap();
        writer
            .write_event(&LogEvent::ParseError {
                ts: now_ts(),
                doc_path: "weird.pdf",
                reason: "lopdf_error",
                message: "unexpected EOF",
            })
            .unwrap();
        writer.flush().unwrap();

        let contents = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = contents.lines().collect();
        assert_eq!(lines.len(), 3, "expected 3 lines, got: {}", lines.len());
        for line in &lines {
            assert!(
                line.starts_with('{'),
                "each line should be JSON object: {line}"
            );
            assert!(
                line.contains("\"kind\""),
                "each line should have 'kind': {line}"
            );
        }

        // Parse every line and pin the discriminator values, in write order,
        // rather than only checking that the substring `"kind"` appears.
        let kinds: Vec<String> = lines
            .iter()
            .map(|line| {
                let value: serde_json::Value =
                    serde_json::from_str(line).expect("each line should be valid JSON");
                value
                    .get("kind")
                    .and_then(|k| k.as_str())
                    .expect("each line should have a string 'kind'")
                    .to_string()
            })
            .collect();
        assert_eq!(kinds, ["skip", "error", "parse_error"]);
    }

    #[test]
    fn drop_flushes_pending_buffer() {
        let tmp = TempDir::new().unwrap();
        let cfg = LoggingCfg {
            ingest_log_enabled: true,
            ingest_log_dir: tmp.path().to_path_buf(),
            ..Default::default()
        };
        let mut writer = IngestLogWriter::open(&cfg).unwrap().unwrap();
        let path = writer.path().to_path_buf();
        writer
            .write_event(&LogEvent::Error {
                ts: now_ts(),
                code: "test",
                message: "drop flush test",
            })
            .unwrap();
        // Drop without explicit flush — Drop impl should flush BufWriter.
        drop(writer);
        let contents = std::fs::read_to_string(&path).unwrap();
        assert!(
            contents.lines().count() >= 1,
            "file should have at least 1 line after drop"
        );
    }

    /// AC-7: keep_recent=3 with 5 files, oldest 2 should be deleted.
    #[test]
    fn cleanup_keeps_recent_n_drops_old() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();
        // Create 5 files with mtime spread across 60 days
        for i in 0..5u64 {
            let path = dir.join(format!("ingest-file{i}.ndjson"));
            std::fs::write(&path, b"x").unwrap();
            // Set mtime: file 0 = newest, file 4 = 60 days old
            let age_days = i * 15; // 0, 15, 30, 45, 60 days old
            let mtime = SystemTime::now()
                .checked_sub(std::time::Duration::from_secs(age_days * 86400))
                .unwrap();
            filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap();
        }
        // keep_recent=3, retention_days=90 (no time-based deletion)
        cleanup_old_logs(dir, 3, 90).unwrap();
        // Checking names (not just the count) proves the three NEWEST files
        // were kept; a bare count would also pass if the wrong ones survived.
        assert_eq!(
            remaining_names(dir),
            [
                "ingest-file0.ndjson",
                "ingest-file1.ndjson",
                "ingest-file2.ndjson"
            ],
            "expected the 3 newest files after cleanup"
        );
    }

    /// F5 OR-on-stale: files within keep_recent count but older than retention_days
    /// must still be deleted.
    #[test]
    fn cleanup_drops_stale_even_within_count() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();
        // 2 files, both 90 days old — well past retention_days=30
        for i in 0..2u64 {
            let path = dir.join(format!("ingest-old{i}.ndjson"));
            std::fs::write(&path, b"x").unwrap();
            let mtime = SystemTime::now()
                .checked_sub(std::time::Duration::from_secs(90 * 86400))
                .unwrap();
            filetime::set_file_mtime(&path, filetime::FileTime::from_system_time(mtime)).unwrap();
        }
        // keep_recent=10 (both within count) but retention_days=30 → both stale
        cleanup_old_logs(dir, 10, 30).unwrap();
        let remaining = remaining_names(dir);
        assert_eq!(
            remaining.len(),
            0,
            "stale files must be deleted even within keep_recent"
        );
    }
}
|
||||
@@ -46,10 +46,21 @@ pub struct AggregateCounts {
|
||||
/// Ordering invariant per design §2.4a:
|
||||
///
|
||||
/// ```text
|
||||
/// ScanStarted < ScanCompleted < (AssetStarted < AssetFinished)*
|
||||
/// < (Completed | Aborted)
|
||||
/// ScanStarted < ScanCompleted
|
||||
/// < ( AssetStarted
|
||||
/// [< (PdfOcrStarted < PdfOcrFinished)*]
|
||||
/// [< AssetChunked]
|
||||
/// [< AssetTimings]
|
||||
/// < AssetFinished )*
|
||||
/// < (Completed | Aborted)
|
||||
/// ```
|
||||
///
|
||||
/// `[]` = optional. `PdfOcr*` is per-PDF asset only (v0.20.0 sub-item 1).
|
||||
/// `AssetChunked` / `AssetTimings` are the v0.24.0 asset-internal phase
|
||||
/// events: `AssetChunked` fires once right after chunking (markdown /
|
||||
/// image / PDF); `AssetTimings` reports per-phase wall-clock once
|
||||
/// (markdown only).
|
||||
///
|
||||
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
|
||||
/// in §2.4a) are reserved for a future iteration and are not emitted
|
||||
/// by this task; the spec calls them out as "임의 위치" (optional).
|
||||
@@ -79,12 +90,82 @@ pub enum IngestEvent {
|
||||
result: IngestItemKind,
|
||||
chunks: u32,
|
||||
},
|
||||
/// v0.24.0 (additive): emitted right after an asset is chunked, before
|
||||
/// expansion / embed / store. Surfaces "this document is N chunks"
|
||||
/// immediately so a single large document no longer looks frozen at
|
||||
/// `idx/total` while its per-chunk phases churn. `chunks` is the chunk
|
||||
/// count for asset `idx`.
|
||||
AssetChunked { idx: u32, total: u32, chunks: u32 },
|
||||
/// v0.26.1 (additive): emitted when an asset enters a *slow* internal
|
||||
/// phase, so the interactive progress bar can show **which** phase
|
||||
/// (and which model) is currently running instead of looking frozen.
|
||||
/// `phase` ∈ {`"ocr"`, `"caption"`, `"embed"`}; short phases
|
||||
/// (parse / chunk / store) are intentionally *not* emitted to avoid
|
||||
/// noise. `model` is the model performing the phase — the vision LLM
|
||||
/// id for `ocr` / `caption`, the embedder `model_id` for `embed`
|
||||
/// (`None` when the phase runs without a configured model, e.g. embed
|
||||
/// with no embedder wired). Emitted once per (asset, phase); no
|
||||
/// throttle needed (low frequency). Wire v1 consumers that predate
|
||||
/// this variant simply ignore the unknown `asset_phase` kind.
|
||||
AssetPhase {
|
||||
idx: u32,
|
||||
total: u32,
|
||||
phase: String,
|
||||
model: Option<String>,
|
||||
},
|
||||
/// v0.24.0 (additive): per-phase wall-clock (milliseconds) for asset
|
||||
/// `idx`, emitted once the asset's pipeline finishes. Lets a user see
|
||||
/// *where* the time went (parse / chunk / ocr / caption / embed /
|
||||
/// store) without parsing logs. The markdown path leaves `ocr_ms` /
|
||||
/// `caption_ms` at 0 (no image analysis); the image / PDF paths fill
|
||||
/// them so the slowest-asset summary attributes vision-model time
|
||||
/// correctly. `expansion_ms` is retained for wire compatibility but is
|
||||
/// always 0 since doc-side expansion was removed (HOTFIXES 2026-06-03).
|
||||
/// `ocr_ms` / `caption_ms` (v0.26.1) are additive with serde default 0
|
||||
/// so pre-v0.26.1 consumers deserialize cleanly.
|
||||
AssetTimings {
|
||||
idx: u32,
|
||||
total: u32,
|
||||
parse_ms: u64,
|
||||
chunk_ms: u64,
|
||||
expansion_ms: u64,
|
||||
embed_ms: u64,
|
||||
store_ms: u64,
|
||||
#[serde(default)]
|
||||
ocr_ms: u64,
|
||||
#[serde(default)]
|
||||
caption_ms: u64,
|
||||
},
|
||||
/// Run finished normally. `counts` is the final aggregate.
|
||||
Completed { counts: AggregateCounts },
|
||||
/// Run finished by user cancellation. `counts` is the partial
|
||||
/// aggregate at the cancel boundary. Emitted by `p9-fb-04`; this
|
||||
/// task never produces `Aborted`.
|
||||
Aborted { counts: AggregateCounts },
|
||||
/// Emitted when OCR starts for a PDF page (once per page). v0.20.0 sub-item 1.
|
||||
PdfOcrStarted { page: u32 },
|
||||
/// Emitted when OCR finishes for a PDF page (once per page). v0.20.0 sub-item 1.
|
||||
/// `skipped` = `true` means OCR was not performed (no DCTDecode stream, or the engine failed).
|
||||
/// `chars = 0` alone cannot distinguish a skip from a 0-char OCR result, so the `skipped` field makes it explicit.
|
||||
PdfOcrFinished {
|
||||
page: u32,
|
||||
ms: u64,
|
||||
chars: u32,
|
||||
ocr_engine: String,
|
||||
skipped: bool,
|
||||
/// v0.20.x ingest log: raster image byte size (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_byte_size: Option<u64>,
|
||||
/// v0.20.x ingest log: raster image width in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_width: Option<u32>,
|
||||
/// v0.20.x ingest log: raster image height in pixels (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
image_height: Option<u32>,
|
||||
/// v0.20.x ingest log: OCR failure reason (additive minor, optional).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
failure_reason: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Map a `MediaType` to the short label used by `IngestEvent::AssetStarted`.
|
||||
@@ -96,6 +177,7 @@ pub fn media_label(media: &kebab_core::MediaType) -> &'static str {
|
||||
kebab_core::MediaType::Pdf => "pdf",
|
||||
kebab_core::MediaType::Image(_) => "image",
|
||||
kebab_core::MediaType::Audio(_) => "audio",
|
||||
kebab_core::MediaType::Code(_) => "code",
|
||||
kebab_core::MediaType::Other(_) => "other",
|
||||
}
|
||||
}
|
||||
@@ -117,10 +199,7 @@ pub fn render_skipped_breakdown(map: &std::collections::BTreeMap<String, u32>) -
|
||||
/// Best-effort send into an optional `mpsc::Sender`. A dropped receiver
|
||||
/// is silently absorbed — the ingest hot path must not stall on a slow
|
||||
/// consumer. Logged at `trace` for diagnostics.
|
||||
pub(crate) fn emit(
|
||||
progress: Option<&std::sync::mpsc::Sender<IngestEvent>>,
|
||||
event: IngestEvent,
|
||||
) {
|
||||
pub(crate) fn emit(progress: Option<&std::sync::mpsc::Sender<IngestEvent>>, event: IngestEvent) {
|
||||
if let Some(tx) = progress {
|
||||
if tx.send(event).is_err() {
|
||||
tracing::trace!(
|
||||
@@ -148,6 +227,7 @@ mod tests {
|
||||
media_label(&MediaType::Audio(kebab_core::AudioType::Wav)),
|
||||
"audio"
|
||||
);
|
||||
assert_eq!(media_label(&MediaType::Code("rust".into())), "code");
|
||||
assert_eq!(media_label(&MediaType::Other("x".into())), "other");
|
||||
}
|
||||
|
||||
@@ -163,13 +243,131 @@ mod tests {
|
||||
media: "markdown".into(),
|
||||
};
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("asset_started"));
|
||||
assert_eq!(v.get("idx").and_then(|n| n.as_u64()), Some(1));
|
||||
assert_eq!(v.get("total").and_then(|n| n.as_u64()), Some(10));
|
||||
assert_eq!(
|
||||
v.get("kind").and_then(|s| s.as_str()),
|
||||
Some("asset_started")
|
||||
);
|
||||
assert_eq!(v.get("idx").and_then(serde_json::Value::as_u64), Some(1));
|
||||
assert_eq!(v.get("total").and_then(serde_json::Value::as_u64), Some(10));
|
||||
assert_eq!(v.get("path").and_then(|s| s.as_str()), Some("notes/foo.md"));
|
||||
assert_eq!(v.get("media").and_then(|s| s.as_str()), Some("markdown"));
|
||||
}
|
||||
|
||||
#[test]
fn asset_chunked_serializes_with_discriminator() {
    // v0.24.0 additive variant: the serialized `kind` tag must be the
    // snake_case string `asset_chunked` so wire v1 consumers can branch on it.
    let json = serde_json::to_value(&IngestEvent::AssetChunked {
        idx: 3,
        total: 10,
        chunks: 142,
    })
    .unwrap();

    let tag = json.get("kind").and_then(serde_json::Value::as_str);
    assert_eq!(tag, Some("asset_chunked"));

    // Numeric payload fields are read back as unsigned integers.
    let uint = |key: &str| json.get(key).and_then(serde_json::Value::as_u64);
    assert_eq!(uint("idx"), Some(3));
    assert_eq!(uint("chunks"), Some(142));
}
|
||||
|
||||
#[test]
fn asset_timings_serializes_all_phase_fields() {
    let json = serde_json::to_value(&IngestEvent::AssetTimings {
        idx: 2,
        total: 7,
        parse_ms: 12,
        chunk_ms: 3,
        expansion_ms: 45_000,
        embed_ms: 800,
        store_ms: 20,
        ocr_ms: 1_200,
        caption_ms: 3_400,
    })
    .unwrap();

    let tag = json.get("kind").and_then(serde_json::Value::as_str);
    assert_eq!(tag, Some("asset_timings"));

    // Every phase field must be present in the output and carry the value
    // that was put into the event.
    let expected: [(&str, u64); 7] = [
        ("parse_ms", 12),
        ("chunk_ms", 3),
        ("expansion_ms", 45_000),
        ("embed_ms", 800),
        ("store_ms", 20),
        ("ocr_ms", 1_200),
        ("caption_ms", 3_400),
    ];
    for (field, want) in expected {
        let got = json.get(field).and_then(serde_json::Value::as_u64);
        assert_eq!(got, Some(want), "field {field}");
    }
}
|
||||
|
||||
#[test]
fn asset_timings_ocr_caption_default_to_zero_for_legacy_wire() {
    // v0.26.1 additive fields: a payload written before v0.26.1 carries no
    // `ocr_ms` / `caption_ms`. Deserialization must still succeed and fill
    // both with 0 so older producers stay compatible.
    let legacy = serde_json::json!({
        "kind": "asset_timings",
        "idx": 1, "total": 1,
        "parse_ms": 5, "chunk_ms": 2, "expansion_ms": 0,
        "embed_ms": 10, "store_ms": 3
    });
    let decoded: IngestEvent = serde_json::from_value(legacy).unwrap();

    let (ocr, caption, embed) = match decoded {
        IngestEvent::AssetTimings {
            ocr_ms,
            caption_ms,
            embed_ms,
            ..
        } => (ocr_ms, caption_ms, embed_ms),
        other => panic!("unexpected event: {other:?}"),
    };

    assert_eq!(ocr, 0);
    assert_eq!(caption, 0);
    // A field that *was* present in the payload keeps its value.
    assert_eq!(embed, 10);
}
|
||||
|
||||
#[test]
fn asset_phase_serializes_with_discriminator() {
    // v0.26.1 additive variant: `kind` must be the snake_case tag
    // `asset_phase`; `phase` carries the slow-phase label and `model` the
    // (nullable) model id.
    let json = serde_json::to_value(&IngestEvent::AssetPhase {
        idx: 4,
        total: 12,
        phase: "ocr".into(),
        model: Some("gemma4:e4b".into()),
    })
    .unwrap();

    let text = |key: &str| json.get(key).and_then(serde_json::Value::as_str);

    assert_eq!(text("kind"), Some("asset_phase"));
    assert_eq!(
        json.get("idx").and_then(serde_json::Value::as_u64),
        Some(4)
    );
    assert_eq!(text("phase"), Some("ocr"));
    assert_eq!(text("model"), Some("gemma4:e4b"));
}
|
||||
|
||||
#[test]
fn asset_phase_model_none_serializes_as_null() {
    let json = serde_json::to_value(&IngestEvent::AssetPhase {
        idx: 1,
        total: 1,
        phase: "embed".into(),
        model: None,
    })
    .unwrap();

    let phase = json.get("phase").and_then(serde_json::Value::as_str);
    assert_eq!(phase, Some("embed"));

    // The `model` key must be present and explicitly `null`, not omitted.
    assert!(matches!(json.get("model"), Some(serde_json::Value::Null)));
}
|
||||
|
||||
#[test]
|
||||
fn ingest_event_completed_has_counts() {
|
||||
let ev = IngestEvent::Completed {
|
||||
@@ -182,8 +380,14 @@ mod tests {
|
||||
let v = serde_json::to_value(&ev).unwrap();
|
||||
assert_eq!(v.get("kind").and_then(|s| s.as_str()), Some("completed"));
|
||||
let counts = v.get("counts").unwrap();
|
||||
assert_eq!(counts.get("scanned").and_then(|n| n.as_u64()), Some(5));
|
||||
assert_eq!(counts.get("new").and_then(|n| n.as_u64()), Some(2));
|
||||
assert_eq!(
|
||||
counts.get("scanned").and_then(serde_json::Value::as_u64),
|
||||
Some(5)
|
||||
);
|
||||
assert_eq!(
|
||||
counts.get("new").and_then(serde_json::Value::as_u64),
|
||||
Some(2)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,7 +26,9 @@ pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Default => {
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn"))
|
||||
}
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
362
crates/kebab-app/src/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,362 @@
|
||||
// crates/kebab-app/src/pdf_ocr_apply.rs
|
||||
//
|
||||
// PDF post-extract OCR enrichment. parser isolation 보존 — kebab-parse-pdf 가
|
||||
// kebab-parse-image::OcrEngine 을 import 하지 않도록, helper 는 kebab-app 에 둠.
|
||||
// image path 의 apply_ocr (kebab-parse-image::ocr::apply_ocr) 의
|
||||
// PDF page 변형 — image 는 ImageRefBlock.ocr 를 mutate, PDF 는
|
||||
// Block::Paragraph.text / inlines 를 in-place mutate (단일 OCR fallback) 또는
|
||||
// 새 Block::Paragraph 를 push (always_on dual-block).
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use kebab_core::{
|
||||
Block, CanonicalDocument, CommonBlock, Inline, Lang, ProvenanceEvent, ProvenanceKind,
|
||||
SourceSpan, TextBlock, id_for_block,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::{compute_valid_char_ratio, extract_dctdecode_page_image};
|
||||
use lopdf::Document as LopdfDocument;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::warn;
|
||||
|
||||
/// Extract width/height from a JPEG (or any image format) byte slice.
|
||||
/// Returns `None` on corrupt / unsupported data — callers fall back to
|
||||
/// `(None, None)` so OCR results remain valid (R-4 mitigation).
|
||||
fn extract_image_dimensions(bytes: &[u8]) -> Option<(u32, u32)> {
|
||||
use image::ImageReader;
|
||||
ImageReader::new(std::io::Cursor::new(bytes))
|
||||
.with_guessed_format()
|
||||
.ok()?
|
||||
.into_dimensions()
|
||||
.ok()
|
||||
}
|
||||
|
||||
/// Per-page OCR knobs threaded through [`apply_ocr_to_pdf_pages`].
|
||||
/// Mirrors the `[pdf.ocr]` config block (spec §4.5); the facade
|
||||
/// (`kebab_app::ingest_one_pdf_asset`) fills these from
|
||||
/// `kebab_config::Config::pdf::ocr` plus runtime flags (CLI / SIGINT).
|
||||
pub struct PdfOcrOpts {
|
||||
/// Master switch. `false` short-circuits to
|
||||
/// `PdfOcrSummary { pages_ocrd: 0, ms_total: 0 }` without lopdf reparse.
|
||||
pub enabled: bool,
|
||||
/// `true` → 모든 page OCR (dual-block path, new `Block::Paragraph` push).
|
||||
/// `false` → text-detect block 의 `min_char_count` 또는
|
||||
/// `valid_ratio_threshold` 미달인 page 만 OCR (in-place mutate).
|
||||
pub always_on: bool,
|
||||
/// 0.0..=1.0. text-detect block 의 `compute_valid_char_ratio` 가
|
||||
/// 본 임계 미만이면 OCR fallback. Default `0.5`.
|
||||
pub valid_ratio_threshold: f32,
|
||||
/// text-detect block 의 char count 가 본 임계 미만이면 OCR fallback.
|
||||
/// empty page (cover, blank separator) 자동 skip. Default `20`.
|
||||
pub min_char_count: u32,
|
||||
/// OCR engine 에 전달할 언어 힌트 (예: `Lang("kor".into())`).
|
||||
/// `None` → no hint passed to engine.
|
||||
pub lang_hint: Option<Lang>,
|
||||
/// Optional per-page cancellation handle. checked at start of each page
|
||||
/// loop iteration; set→true 시 `cancelled mid-PDF` error 반환. plan §6 E4
|
||||
/// + verifier LOW L-1 resolution + spec §4.8 line 1159 명시.
|
||||
pub cancel: Option<Arc<AtomicBool>>,
|
||||
}
|
||||
|
||||
/// Summary of one OCR run, returned by [`apply_ocr_to_pdf_pages`] so the
/// caller can fill the `IngestItem.pdf_ocr_pages` and `pdf_ocr_ms_total`
/// wire fields (§4.6.2).
#[derive(Debug)]
pub struct PdfOcrSummary {
    /// Number of pages for which the OCR engine call succeeded. Skipped
    /// pages and pages whose engine call failed are not counted.
    pub pages_ocrd: u32,
    /// Cumulative wall-clock duration of the successful OCR engine calls, in
    /// milliseconds. Accumulated with `saturating_add`, so it clamps at
    /// `u64::MAX` instead of wrapping.
    pub ms_total: u64,
}
|
||||
|
||||
/// Post-extract OCR enrichment for PDF. Walks `canonical.blocks` page-by-page,
|
||||
/// classifies each page via `text_quality::compute_valid_char_ratio` +
|
||||
/// `min_char_count`, and either:
|
||||
/// - skips (vector PDF + sufficient text + `always_on=false`),
|
||||
/// - mutates the text-detect `Block::Paragraph` in-place with OCR output
|
||||
/// (scanned/mojibake page), or
|
||||
/// - pushes a new `Block::Paragraph` with dual ordinal (`always_on=true` +
|
||||
/// vector page).
|
||||
///
|
||||
/// Errors:
|
||||
/// - cancel handle (`opts.cancel = Some(true)`) → `Err("PDF OCR cancelled mid-PDF at page N")`.
|
||||
/// - lopdf re-parse failure → `Err(...)`.
|
||||
/// - per-page OCR engine failure 또는 DCTDecode 부재 → `ProvenanceKind::Warning`
|
||||
/// event push + `emit_progress(Finished { skipped: true })` + continue
|
||||
/// (no `Err` propagation).
|
||||
///
|
||||
/// See spec §4.1 + §4.4 for the full pipeline.
|
||||
pub fn apply_ocr_to_pdf_pages<F>(
|
||||
canonical: &mut CanonicalDocument,
|
||||
engine: &dyn OcrEngine,
|
||||
pdf_bytes: &[u8],
|
||||
opts: &PdfOcrOpts,
|
||||
mut emit_progress: F,
|
||||
) -> Result<PdfOcrSummary>
|
||||
where
|
||||
F: FnMut(PdfOcrProgress),
|
||||
{
|
||||
if !opts.enabled {
|
||||
return Ok(PdfOcrSummary {
|
||||
pages_ocrd: 0,
|
||||
ms_total: 0,
|
||||
});
|
||||
}
|
||||
let pdf_doc = LopdfDocument::load_mem(pdf_bytes)
|
||||
.context("kb-app::pdf_ocr_apply: re-parse PDF for image extract")?;
|
||||
let page_count = pdf_doc.get_pages().len() as u32;
|
||||
|
||||
let mut new_events: Vec<ProvenanceEvent> = Vec::new();
|
||||
let mut ocr_blocks: Vec<Block> = Vec::new();
|
||||
let mut pages_ocrd: u32 = 0;
|
||||
let mut ms_total: u64 = 0;
|
||||
|
||||
// canonical.blocks 의 page → block index map (text-detect block 의 in-place
|
||||
// mutate 또는 dual-block push 결정용).
|
||||
// PdfTextExtractor 가 page 마다 1 Block::Paragraph + SourceSpan::Page 를
|
||||
// 생성 (§1.4) — 그 invariant 사용.
|
||||
for page_num in 1..=page_count {
|
||||
if let Some(cancel) = &opts.cancel {
|
||||
if cancel.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
anyhow::bail!("PDF OCR cancelled mid-PDF at page {page_num}");
|
||||
}
|
||||
}
|
||||
|
||||
let text_block_idx = find_paragraph_block_idx(&canonical.blocks, page_num);
|
||||
let text = match &canonical.blocks[text_block_idx] {
|
||||
Block::Paragraph(tb) => tb.text.clone(),
|
||||
_ => String::new(),
|
||||
};
|
||||
let chars = text.chars().count() as u32;
|
||||
let valid_ratio = compute_valid_char_ratio(&text);
|
||||
let needs_ocr = chars < opts.min_char_count || valid_ratio < opts.valid_ratio_threshold;
|
||||
|
||||
// 결정 matrix:
|
||||
// always_on=true → 모든 page OCR (dual-block).
|
||||
// always_on=false + needs_ocr → in-place OCR (text-detect block mutate).
|
||||
// needs_ocr=false → skip.
|
||||
let do_ocr = opts.always_on || needs_ocr;
|
||||
if !do_ocr {
|
||||
continue;
|
||||
}
|
||||
|
||||
emit_progress(PdfOcrProgress::Started { page: page_num });
|
||||
|
||||
let page_image_bytes = if let Some(b) = extract_dctdecode_page_image(&pdf_doc, page_num)? {
|
||||
b
|
||||
} else {
|
||||
let note = format!(
|
||||
"page={page_num} skipped: no DCTDecode image XObject (vector PDF page or unsupported /Filter — v1 supports DCTDecode passthrough only; see release notes for normalization guidance)"
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: 0,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: None,
|
||||
image_width: None,
|
||||
image_height: None,
|
||||
failure_reason: None,
|
||||
});
|
||||
continue;
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
let ocr = match engine.recognize(&page_image_bytes, opts.lang_hint.as_ref()) {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
// OCR failure: warning event + skip (text-detect block 그대로).
|
||||
let note = format!(
|
||||
"page={} OCR failed engine={} version={} err={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
e
|
||||
);
|
||||
warn!(target: "kebab-app", "{}", note);
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(note),
|
||||
});
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: start.elapsed().as_millis() as u64,
|
||||
chars: 0,
|
||||
skipped: true,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: Some("ocr_error".to_string()),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed_ms = start.elapsed().as_millis() as u64;
|
||||
let chars_ocr = ocr.joined.chars().count() as u32;
|
||||
|
||||
pages_ocrd = pages_ocrd.saturating_add(1);
|
||||
ms_total = ms_total.saturating_add(elapsed_ms);
|
||||
|
||||
if opts.always_on && !needs_ocr {
|
||||
// dual-block path: 새 Block::Paragraph push, ordinal = page-1 + page_count.
|
||||
let ocr_ordinal = (page_num - 1) + page_count;
|
||||
let span_ocr = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(0),
|
||||
char_end: Some(chars_ocr),
|
||||
};
|
||||
let block_id =
|
||||
id_for_block(&canonical.doc_id, "paragraph", &[], ocr_ordinal, &span_ocr);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: Vec::new(),
|
||||
source_span: span_ocr,
|
||||
};
|
||||
ocr_blocks.push(Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text: ocr.joined.clone(),
|
||||
inlines: if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
},
|
||||
}));
|
||||
} else {
|
||||
// in-place mutate: text-detect block (빈 또는 low-valid) 의 text/inlines 교체.
|
||||
// block_id / ordinal 보존 — span 의 char_end 만 갱신.
|
||||
if let Block::Paragraph(tb) = &mut canonical.blocks[text_block_idx] {
|
||||
tb.text = ocr.joined.clone();
|
||||
tb.inlines = if ocr.joined.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
vec![Inline::Text {
|
||||
text: ocr.joined.clone(),
|
||||
}]
|
||||
};
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(chars_ocr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
new_events.push(ProvenanceEvent {
|
||||
at: OffsetDateTime::now_utc(),
|
||||
agent: "kb-parse-pdf".to_string(),
|
||||
kind: ProvenanceKind::OcrApplied,
|
||||
note: Some(format!(
|
||||
"page={} engine={} version={} regions={} ms={} chars={}",
|
||||
page_num,
|
||||
engine.engine_name(),
|
||||
engine.engine_version(),
|
||||
ocr.regions.len(),
|
||||
elapsed_ms,
|
||||
chars_ocr
|
||||
)),
|
||||
});
|
||||
|
||||
let (image_width, image_height) = extract_image_dimensions(&page_image_bytes)
|
||||
.map_or((None, None), |(w, h)| (Some(w), Some(h)));
|
||||
emit_progress(PdfOcrProgress::Finished {
|
||||
page: page_num,
|
||||
ms: elapsed_ms,
|
||||
chars: chars_ocr,
|
||||
skipped: false,
|
||||
image_byte_size: Some(page_image_bytes.len() as u64),
|
||||
image_width,
|
||||
image_height,
|
||||
failure_reason: None,
|
||||
});
|
||||
}
|
||||
|
||||
canonical.blocks.extend(ocr_blocks);
|
||||
canonical.provenance.events.extend(new_events);
|
||||
Ok(PdfOcrSummary {
|
||||
pages_ocrd,
|
||||
ms_total,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_paragraph_block_idx(blocks: &[Block], page_num: u32) -> usize {
|
||||
blocks
|
||||
.iter()
|
||||
.position(|b| match b {
|
||||
Block::Paragraph(tb) => matches!(
|
||||
tb.common.source_span,
|
||||
SourceSpan::Page { page, .. } if page == page_num
|
||||
),
|
||||
_ => false,
|
||||
})
|
||||
.expect("PdfTextExtractor emits 1 Block::Paragraph per page (invariant)")
|
||||
}
|
||||
|
||||
/// Per-page OCR progress event, delivered through the caller's
/// `emit_progress` closure.
///
/// `ingest_one_pdf_asset` (step 6) carries these as
/// `IngestEvent::PdfOcrStarted` / `PdfOcrFinished` (spec §4.6.1 wire schema).
pub enum PdfOcrProgress {
    /// Emitted when a page enters the OCR path, before its page image is
    /// extracted and `engine.recognize` is called.
    Started {
        /// 1-based PDF page number.
        page: u32,
    },
    /// Emitted when a page's OCR attempt ends: success, skip, or engine
    /// failure.
    Finished {
        /// 1-based PDF page number.
        page: u32,
        /// Wall-clock milliseconds measured from just before the
        /// `engine.recognize` call. On skip paths the meaning is mixed: `0`
        /// when the page has no DCTDecode image, otherwise the time elapsed
        /// before the engine failure was reported.
        ms: u64,
        /// Char count of the OCR result text. `0` when skipped.
        chars: u32,
        /// `true`: skipped because no DCTDecode image was found or the OCR
        /// engine failed. `false`: OCR completed normally.
        skipped: bool,
        /// v0.20.x ingest log: raster image byte size (additive, optional).
        image_byte_size: Option<u64>,
        /// v0.20.x ingest log: raster image width in pixels (additive, optional).
        image_width: Option<u32>,
        /// v0.20.x ingest log: raster image height in pixels (additive, optional).
        image_height: Option<u32>,
        /// v0.20.x ingest log: failure reason string when OCR failed (additive, optional).
        /// Values: "timeout" | "ocr_error" | "network_error" | None (success,
        /// or a skip that never reached the engine).
        failure_reason: Option<String>,
    },
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly encoded 16x12 JPEG must report `(width, height)` = (16, 12).
    #[test]
    fn extract_image_dimensions_valid_jpeg() {
        let mut encoded = Vec::new();
        let pixels = image::DynamicImage::from(image::RgbImage::new(16, 12));
        pixels
            .write_to(
                &mut std::io::Cursor::new(&mut encoded),
                image::ImageFormat::Jpeg,
            )
            .expect("encode jpeg");

        let dims = extract_image_dimensions(&encoded);
        assert_eq!(dims, Some((16, 12)));
    }

    /// Bytes that are not an image yield `None` rather than an error.
    #[test]
    fn extract_image_dimensions_corrupt_returns_none() {
        let dims = extract_image_dimensions(b"not a jpeg");
        assert_eq!(dims, None);
    }
}
|
||||
@@ -9,13 +9,19 @@
|
||||
//!
|
||||
//! `--vector-only` additionally truncates `embedding_records` in SQLite
|
||||
//! so the next `kebab ingest` re-embeds cleanly without orphan rows.
|
||||
//!
|
||||
//! `--orphans-only` purges stored docs that are outside the current walker
|
||||
//! scope (config narrowing / removed sub-directory). No filesystem paths are
|
||||
//! removed — this is purely a store-level reconciliation.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kebab_config::{Config, expand_path};
|
||||
use kebab_core::WorkspacePath;
|
||||
|
||||
/// What the user asked to remove. Mutually exclusive — picked by the CLI
|
||||
/// from a clap `ArgGroup`.
|
||||
@@ -32,6 +38,13 @@ pub enum ResetScope {
|
||||
VectorOnly,
|
||||
/// Wipe only the config dir.
|
||||
ConfigOnly,
|
||||
/// Purge stored docs that are outside the current walker scope (no
|
||||
/// filesystem paths are removed). Filesystem existence is NOT checked —
|
||||
/// anything the current walker would not visit is considered an orphan.
|
||||
/// The explicit complement to the conservative `sweep_deleted_files`
|
||||
/// that runs during ingest (which leaves on-disk-but-out-of-scope docs
|
||||
/// alone for data safety).
|
||||
OrphansOnly,
|
||||
}
|
||||
|
||||
/// Result of a successful wipe — emitted as `reset_report.v1` by the
|
||||
@@ -41,6 +54,16 @@ pub struct ResetReport {
|
||||
pub scope: ResetScope,
|
||||
pub removed_paths: Vec<PathBuf>,
|
||||
pub embedding_rows_truncated: u64,
|
||||
/// Number of stored docs purged because they are outside the current
|
||||
/// walker scope. Non-zero only when `scope == OrphansOnly`.
|
||||
/// `#[serde(default)]` preserves back-compat with older callers that
|
||||
/// do not include this field.
|
||||
#[serde(default)]
|
||||
pub orphans_purged: u32,
|
||||
/// Paths of the orphaned docs that were purged. Sorted for deterministic
|
||||
/// output. Non-empty only when `scope == OrphansOnly`.
|
||||
#[serde(default)]
|
||||
pub purged_paths: Vec<WorkspacePath>,
|
||||
}
|
||||
|
||||
/// Compute the absolute on-disk paths a given scope will wipe, given a
|
||||
@@ -62,11 +85,14 @@ pub fn enumerate_paths(scope: ResetScope, cfg: &Config) -> Vec<PathBuf> {
|
||||
ResetScope::All => vec![cfg_dir, data_dir, cache_dir, state_dir],
|
||||
ResetScope::DataOnly => vec![data_dir, cache_dir, state_dir],
|
||||
ResetScope::VectorOnly => {
|
||||
let vector_dir =
|
||||
expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
let vector_dir = expand_path(&cfg.storage.vector_dir, &data_dir.to_string_lossy());
|
||||
vec![vector_dir]
|
||||
}
|
||||
ResetScope::ConfigOnly => vec![cfg_dir],
|
||||
// OrphansOnly operates purely at the store level — no filesystem paths
|
||||
// are removed. Return empty so `estimate_size_bytes` stays zero and
|
||||
// the existing confirm UI path for directory wipes is skipped.
|
||||
ResetScope::OrphansOnly => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,16 +122,79 @@ pub fn estimate_size_bytes(paths: &[PathBuf]) -> u64 {
|
||||
paths.iter().map(|p| walk(p)).sum()
|
||||
}
|
||||
|
||||
/// Compute the workspace paths stored in SQLite that are NOT visited by
|
||||
/// the current walker scope (i.e. they are "orphans" — on disk but
|
||||
/// outside the configured include/exclude rules, or from a sub-directory
|
||||
/// that has since been removed from the workspace).
|
||||
///
|
||||
/// Does NOT check filesystem existence — `OrphansOnly` is the explicit
|
||||
/// "I know what I'm doing" variant; callers that want the conservative
|
||||
/// fs-aware sweep should use `sweep_deleted_files` inside ingest.
|
||||
///
|
||||
/// Returns the list sorted for deterministic output. Called twice by the
|
||||
/// CLI path (once for the confirm UI preview, once inside `execute`);
|
||||
/// the double scan is acceptable for a rare destructive operation.
|
||||
pub fn enumerate_orphans(cfg: &Config) -> Result<Vec<WorkspacePath>> {
|
||||
use kebab_core::DocumentStore as _;
|
||||
use kebab_core::SourceScope;
|
||||
use kebab_source_fs::FsSourceConnector;
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("enumerate_orphans: open SqliteStore")?;
|
||||
|
||||
let stored = store
|
||||
.all_workspace_paths()
|
||||
.context("enumerate_orphans: all_workspace_paths")?;
|
||||
|
||||
if stored.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
// Build the same SourceScope the CLI's ingest path uses: root from
|
||||
// config, exclude list from config, no include override (full scope).
|
||||
let root = cfg.resolve_workspace_root();
|
||||
let scope = SourceScope {
|
||||
root: root.clone(),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let connector =
|
||||
FsSourceConnector::new(cfg).context("enumerate_orphans: build FsSourceConnector")?;
|
||||
let (assets, _skips) = connector
|
||||
.scan_with_skips(&scope)
|
||||
.context("enumerate_orphans: scan workspace")?;
|
||||
|
||||
let scanned: HashSet<WorkspacePath> = assets.into_iter().map(|a| a.workspace_path).collect();
|
||||
|
||||
let mut orphans: Vec<WorkspacePath> = stored
|
||||
.into_iter()
|
||||
.filter(|p| !scanned.contains(p))
|
||||
.collect();
|
||||
orphans.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(orphans)
|
||||
}
|
||||
|
||||
/// Wipe every path from `enumerate_paths(scope, cfg)`. For
|
||||
/// `ResetScope::VectorOnly`, also truncates the SQLite
|
||||
/// `embedding_records` table so the store doesn't point at the Lance
|
||||
/// rows we just removed off-disk.
|
||||
///
|
||||
/// For `ResetScope::OrphansOnly`, no filesystem directories are removed.
|
||||
/// Instead the store is reconciled: stored docs outside the current walker
|
||||
/// scope are purged from SQLite (+ vector store when configured). The
|
||||
/// caller is expected to have already shown the confirm UI using
|
||||
/// `enumerate_orphans`.
|
||||
///
|
||||
/// Idempotent: a missing path is treated as already-removed (success).
|
||||
/// Returns a `ResetReport` listing exactly what was removed (paths that
|
||||
/// existed before the call) so `--json` callers see the truth, not the
|
||||
/// request.
|
||||
pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if matches!(scope, ResetScope::OrphansOnly) {
|
||||
return execute_orphans_only(cfg);
|
||||
}
|
||||
|
||||
let paths = enumerate_paths(scope, cfg);
|
||||
let mut removed = Vec::new();
|
||||
|
||||
@@ -113,8 +202,7 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
if !p.exists() {
|
||||
continue;
|
||||
}
|
||||
std::fs::remove_dir_all(p)
|
||||
.with_context(|| format!("remove {}", p.display()))?;
|
||||
std::fs::remove_dir_all(p).with_context(|| format!("remove {}", p.display()))?;
|
||||
removed.push(p.clone());
|
||||
}
|
||||
|
||||
@@ -128,9 +216,99 @@ pub fn execute(scope: ResetScope, cfg: &Config) -> Result<ResetReport> {
|
||||
scope,
|
||||
removed_paths: removed,
|
||||
embedding_rows_truncated,
|
||||
orphans_purged: 0,
|
||||
purged_paths: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Execute the `OrphansOnly` variant: reconcile stored docs against the
|
||||
/// current walker scope without touching any filesystem directory.
|
||||
fn execute_orphans_only(cfg: &Config) -> Result<ResetReport> {
|
||||
let orphans = enumerate_orphans(cfg).context("execute_orphans_only: enumerate orphans")?;
|
||||
|
||||
if orphans.is_empty() {
|
||||
return Ok(ResetReport {
|
||||
scope: ResetScope::OrphansOnly,
|
||||
removed_paths: Vec::new(),
|
||||
embedding_rows_truncated: 0,
|
||||
orphans_purged: 0,
|
||||
purged_paths: Vec::new(),
|
||||
});
|
||||
}
|
||||
|
||||
let store = std::sync::Arc::new(
|
||||
kebab_store_sqlite::SqliteStore::open(cfg)
|
||||
.context("execute_orphans_only: open SqliteStore")?,
|
||||
);
|
||||
|
||||
// Open vector store if configured. Mirror the same guard the ingest
|
||||
// path uses: only construct when the provider is not "none" / dims > 0.
|
||||
let vector_store: Option<kebab_store_vector::LanceVectorStore> =
|
||||
open_vector_store_if_configured(cfg, store.clone())?;
|
||||
|
||||
let mut purged_paths: Vec<WorkspacePath> = Vec::new();
|
||||
|
||||
for path in &orphans {
|
||||
let chunk_ids = kebab_store_sqlite::purge_deleted_workspace_path(&store, path)
|
||||
.with_context(|| format!("execute_orphans_only: purge {}", path.0))?;
|
||||
|
||||
if let Some(ref vs) = vector_store {
|
||||
if !chunk_ids.is_empty() {
|
||||
use kebab_core::VectorStore as _;
|
||||
if let Err(e) = vs.delete_by_chunk_ids(&chunk_ids) {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
path = %path.0,
|
||||
count = chunk_ids.len(),
|
||||
error = %e,
|
||||
"reset --orphans-only: vector delete failed; SQLite side already cleaned"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
target: "kebab-app",
|
||||
path = %path.0,
|
||||
"reset --orphans-only: purged orphan document"
|
||||
);
|
||||
purged_paths.push(path.clone());
|
||||
}
|
||||
|
||||
let orphans_purged = u32::try_from(purged_paths.len()).unwrap_or(u32::MAX);
|
||||
|
||||
Ok(ResetReport {
|
||||
scope: ResetScope::OrphansOnly,
|
||||
removed_paths: Vec::new(),
|
||||
embedding_rows_truncated: 0,
|
||||
orphans_purged,
|
||||
purged_paths,
|
||||
})
|
||||
}
|
||||
|
||||
/// Open the Lance vector store if the configured embedding provider is
|
||||
/// active (non-"none", dimensions > 0). Returns `None` for lexical-only
|
||||
/// configs. Mirrors the guard in `App::vector`.
|
||||
fn open_vector_store_if_configured(
|
||||
cfg: &Config,
|
||||
store: std::sync::Arc<kebab_store_sqlite::SqliteStore>,
|
||||
) -> Result<Option<kebab_store_vector::LanceVectorStore>> {
|
||||
if cfg.models.embedding.provider == "none" || cfg.models.embedding.dimensions == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
match kebab_store_vector::LanceVectorStore::new(cfg, store) {
|
||||
Ok(vs) => Ok(Some(vs)),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
target: "kebab-app",
|
||||
error = %e,
|
||||
"reset --orphans-only: could not open vector store; skipping vector delete"
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Open the SQLite store at the configured path and run
|
||||
/// `truncate_embedding_records`. Returns the count of truncated rows
|
||||
/// (the helper itself reports `DELETE` rowcount). If the SQLite file
|
||||
@@ -200,4 +378,14 @@ mod tests {
|
||||
let bytes = estimate_size_bytes(&[dir.path().to_path_buf()]);
|
||||
assert_eq!(bytes, 5 + 6);
|
||||
}
|
||||
|
||||
#[test]
fn enumerate_orphans_only_returns_empty_paths() {
    // `OrphansOnly` works at the store level, so it must list no filesystem
    // paths to wipe.
    let listed = enumerate_paths(ResetScope::OrphansOnly, &Config::defaults());
    assert!(
        listed.is_empty(),
        "OrphansOnly must return empty vec from enumerate_paths"
    );
}
|
||||
}
|
||||
|
||||
@@ -32,24 +32,65 @@ pub struct Capabilities {
|
||||
pub http_daemon: bool,
|
||||
pub mcp_server: bool,
|
||||
pub single_file_ingest: bool,
|
||||
pub bulk_search: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Models {
|
||||
pub parser_version: String,
|
||||
pub chunker_version: String,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 parser version 전체.
|
||||
/// 빈 corpus → empty Vec. backward compat: `parser_version` field 보존.
|
||||
#[serde(default)]
|
||||
pub active_parsers: Vec<String>,
|
||||
/// v0.20.1+ (Bug #13). Corpus 안 활성 chunker version 전체.
|
||||
/// 빈 corpus → empty Vec.
|
||||
#[serde(default)]
|
||||
pub active_chunkers: Vec<String>,
|
||||
pub embedding_version: String,
|
||||
pub prompt_template_version: String,
|
||||
pub index_version: String,
|
||||
pub corpus_revision: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
pub doc_count: u64,
|
||||
pub chunk_count: u64,
|
||||
pub asset_count: u64,
|
||||
pub last_ingest_at: Option<String>,
|
||||
/// p9-fb-37: per-media-kind doc count (5 keys, zero-padded).
|
||||
#[serde(default)]
|
||||
pub media_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: per-language doc count, NULL keyed as `"null"`.
|
||||
#[serde(default)]
|
||||
pub lang_breakdown: std::collections::BTreeMap<String, u64>,
|
||||
/// p9-fb-37: on-disk byte sums.
|
||||
#[serde(default)]
|
||||
pub index_bytes: kebab_core::IndexBytes,
|
||||
/// p9-fb-37: docs whose `updated_at` exceeds the staleness threshold.
|
||||
#[serde(default)]
|
||||
pub stale_doc_count: u64,
|
||||
/// p10-1A-1: code language breakdown (**doc** counts by canonical
|
||||
/// lowercase language identifier). Empty until 1A-2 produces code
|
||||
/// docs. v0.17.0 PR-C: doc-count semantics corrected here (the
|
||||
/// previous "chunk counts" wording was a longstanding mis-label —
|
||||
/// implementation has always been `COUNT(*) FROM documents
|
||||
/// GROUP BY code_lang`). Use `code_lang_chunk_breakdown` for the
|
||||
/// chunk-level companion.
|
||||
#[serde(default)]
|
||||
pub code_lang_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// p10-1A-1: repo breakdown (**doc** counts by `metadata.repo`
|
||||
/// value). Empty until 1A-2 produces code docs. v0.17.0 PR-C:
|
||||
/// doc-count wording corrected (mirror of code_lang_breakdown).
|
||||
#[serde(default)]
|
||||
pub repo_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
/// v0.17.0 PR-C: sister of [`Self::code_lang_breakdown`] returning
|
||||
/// chunk counts instead of doc counts. Indexing-pressure metric —
|
||||
/// one PDF spec → 200 chunks vs one Rust file → 5 chunks shows up
|
||||
/// here in a way `code_lang_breakdown` (doc count) hides.
|
||||
#[serde(default)]
|
||||
pub code_lang_chunk_breakdown: std::collections::BTreeMap<String, u32>,
|
||||
}
|
||||
|
||||
/// Crate version string, captured at compile time from Cargo's
/// `CARGO_PKG_VERSION` environment variable.
const KEBAB_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
@@ -63,15 +104,22 @@ pub const SCHEMA_V1_ID: &str = "schema.v1";
|
||||
/// Identifiers of the versioned wire schemas known to this module, in
/// listing order.
// NOTE(review): presumably surfaced through the SchemaV1 introspection
// report — the consumer is outside this view, confirm before reordering.
const WIRE_SCHEMAS: &[&str] = &[
    "answer.v1",
    "search_hit.v1",
    "search_response.v1",
    "doc_summary.v1",
    "chunk_inspection.v1",
    "doctor.v1",
    "config_migration.v1",
    "ingest_report.v1",
    "ingest_progress.v1",
    "reset_report.v1",
    "citation.v1",
    "schema.v1",
    "error.v1",
    "bulk_search_item.v1",
    "bulk_search_response.v1",
    // v0.20.x r2 Enhancement 3: OCR statistics + failures introspection.
    "ocr_stats.v1",
    "ocr_failures.v1",
];
|
||||
|
||||
/// Build a [`SchemaV1`] introspection report for the given config.
|
||||
@@ -84,7 +132,7 @@ const WIRE_SCHEMAS: &[&str] = &[
|
||||
#[doc(hidden)]
|
||||
pub fn schema_with_config(cfg: &Config) -> anyhow::Result<SchemaV1> {
|
||||
let store = open_store_for_stats(cfg)?;
|
||||
let stats = collect_stats(&store)?;
|
||||
let stats = collect_stats(cfg, &store)?;
|
||||
let models = collect_models(cfg, &store);
|
||||
Ok(SchemaV1 {
|
||||
schema_version: SCHEMA_V1_ID.to_string(),
|
||||
@@ -106,10 +154,11 @@ fn capabilities_snapshot() -> Capabilities {
|
||||
rag_multi_turn: true,
|
||||
search_cache: true,
|
||||
incremental_ingest: true,
|
||||
streaming_ask: false,
|
||||
streaming_ask: true,
|
||||
http_daemon: false,
|
||||
mcp_server: true,
|
||||
single_file_ingest: false,
|
||||
single_file_ingest: true,
|
||||
bulk_search: true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,23 +172,42 @@ fn open_store_for_stats(cfg: &Config) -> anyhow::Result<kebab_store_sqlite::Sqli
|
||||
kebab_store_sqlite::SqliteStore::open_existing(&db_path)
|
||||
}
|
||||
|
||||
fn collect_stats(store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary()?;
|
||||
fn collect_stats(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> anyhow::Result<Stats> {
|
||||
let counts = store.count_summary_with_threshold(u64::from(cfg.search.stale_threshold_days))?;
|
||||
let data_dir = kebab_config::expand_path(&cfg.storage.data_dir, "");
|
||||
let index_bytes = kebab_store_sqlite::stats_ext::index_bytes(&data_dir)
|
||||
.map_err(|e| anyhow::anyhow!("index_bytes: {e}"))?;
|
||||
Ok(Stats {
|
||||
doc_count: counts.doc_count,
|
||||
chunk_count: counts.chunk_count,
|
||||
asset_count: counts.asset_count,
|
||||
last_ingest_at: counts.last_ingest_at,
|
||||
media_breakdown: counts.media_breakdown,
|
||||
lang_breakdown: counts.lang_breakdown,
|
||||
index_bytes,
|
||||
stale_doc_count: counts.stale_doc_count,
|
||||
// p10-1A-2: populated by the store query added in this task.
|
||||
code_lang_breakdown: store.code_lang_breakdown()?,
|
||||
// p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a
|
||||
// placeholder — mirror of code_lang_breakdown for the repo field.
|
||||
repo_breakdown: store.repo_breakdown()?,
|
||||
// v0.17.0 PR-C: chunk-level companion (closes HOTFIXES
|
||||
// 2026-05-22 "code_lang_breakdown chunk granularity" LOW).
|
||||
code_lang_chunk_breakdown: store.code_lang_chunk_breakdown()?,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Models {
|
||||
let active_parsers = store.fetch_distinct_parser_versions().unwrap_or_default();
|
||||
let active_chunkers = store.fetch_distinct_chunker_versions().unwrap_or_default();
|
||||
Models {
|
||||
// markdown parser only — pdf-page-v1 (P7) / image extractors (P6)
|
||||
// maintain their own versions; surface those when SchemaV1.models
|
||||
// becomes a multi-medium map (P+).
|
||||
parser_version: kebab_parse_md::PARSER_VERSION.to_string(),
|
||||
chunker_version: cfg.chunking.chunker_version.clone(),
|
||||
chunker_version: cfg.ingest.chunking.chunker_version.clone(),
|
||||
active_parsers,
|
||||
active_chunkers,
|
||||
// EmbeddingModelCfg uses `.model` (not `.id`) — adapt from plan.
|
||||
embedding_version: cfg.models.embedding.model.clone(),
|
||||
prompt_template_version: cfg.rag.prompt_template_version.clone(),
|
||||
@@ -149,3 +217,90 @@ fn collect_models(cfg: &Config, store: &kebab_store_sqlite::SqliteStore) -> Mode
|
||||
corpus_revision: store.corpus_revision(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests_stats_ext {
    use super::*;

    /// p10-1A-1: a serialized `Stats` must carry `code_lang_breakdown` and
    /// `repo_breakdown` so downstream consumers (MCP skill, Claude Code)
    /// can branch on whether they are present.
    #[test]
    fn stats_includes_code_lang_and_repo_breakdown_fields() {
        let v = serde_json::to_value(&Stats::default()).unwrap();
        let present = |key: &str| v.get(key).is_some();
        let is_object = |key: &str| v[key].is_object();

        assert!(
            present("code_lang_breakdown"),
            "Stats JSON must include code_lang_breakdown: {v}"
        );
        assert!(
            present("repo_breakdown"),
            "Stats JSON must include repo_breakdown: {v}"
        );
        // v0.17.0 PR-C: the chunk-level companion field.
        assert!(
            present("code_lang_chunk_breakdown"),
            "Stats JSON must include code_lang_chunk_breakdown (v0.17.0 PR-C): {v}"
        );

        // An empty BTreeMap serializes as `{}`; make sure each is an object
        // rather than null.
        assert!(
            is_object("code_lang_breakdown"),
            "code_lang_breakdown must be an object: {v}"
        );
        assert!(
            is_object("repo_breakdown"),
            "repo_breakdown must be an object: {v}"
        );
        assert!(
            is_object("code_lang_chunk_breakdown"),
            "code_lang_chunk_breakdown must be an object: {v}"
        );
    }

    /// A freshly migrated, empty corpus still reports the zero-filled media
    /// map, an empty language map, a non-empty sqlite file and no stale docs.
    #[test]
    fn stats_includes_breakdowns_and_bytes_on_fresh_corpus() {
        let dir = tempfile::tempdir().unwrap();
        let mut cfg = kebab_config::Config::defaults();
        cfg.storage.data_dir = dir.path().to_string_lossy().into_owned();

        // Run migrations so the sqlite file exists; the store handle is
        // released at the end of this scope, before the schema call.
        {
            let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
            store.run_migrations().unwrap();
        }

        let s = schema_with_config(&cfg).unwrap();
        let stats = &s.stats;

        // Media map always has its 5 keys, each zero on an empty corpus.
        assert_eq!(stats.media_breakdown.len(), 5);
        assert_eq!(stats.media_breakdown.get("markdown"), Some(&0));
        assert_eq!(stats.media_breakdown.get("pdf"), Some(&0));
        // Language map is empty when there are no documents.
        assert!(stats.lang_breakdown.is_empty());
        // sqlite bytes are positive after migrations; lancedb is 0.
        assert!(stats.index_bytes.sqlite > 0);
        assert_eq!(stats.index_bytes.lancedb, 0);
        assert_eq!(stats.stale_doc_count, 0);
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests_capabilities {
    use super::*;

    /// Bug #9: `kebab ask --stream` correctly emits answer_event.v1 ndjson
    /// (191 events), so `capabilities.streaming_ask` has to be true.
    #[test]
    fn capabilities_streaming_ask_matches_cli_surface() {
        assert!(
            capabilities_snapshot().streaming_ask,
            "streaming_ask must be true (Bug #9)"
        );
    }

    /// Bug #9: both `kebab ingest-file <path>` and
    /// `kebab ingest-stdin --title <T>` correctly emit ingest_report.v1, so
    /// `capabilities.single_file_ingest` has to be true.
    #[test]
    fn capabilities_single_file_ingest_matches_cli_surface() {
        assert!(
            capabilities_snapshot().single_file_ingest,
            "single_file_ingest must be true (Bug #9)"
        );
    }
}
|
||||
|
||||
69
crates/kebab-app/src/staleness.rs
Normal file
69
crates/kebab-app/src/staleness.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
//! p9-fb-32 staleness helpers.
|
||||
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
use kebab_core::SearchHit;
|
||||
|
||||
/// Returns `true` iff `now - indexed_at > threshold_days * 24h`.
|
||||
/// `threshold_days = 0` always returns `false` (feature disabled).
|
||||
/// Strict `>` so that exactly `threshold_days` old returns `false`.
|
||||
///
|
||||
/// p9-fb-32: mirrored in `kebab_rag::pipeline::compute_stale` (dep-boundary
|
||||
/// rule prevents `kebab-rag → kebab-app`). Update both together.
|
||||
pub fn compute_stale(indexed_at: OffsetDateTime, now: OffsetDateTime, threshold_days: u32) -> bool {
|
||||
if threshold_days == 0 {
|
||||
return false;
|
||||
}
|
||||
let threshold = Duration::days(i64::from(threshold_days));
|
||||
(now - indexed_at) > threshold
|
||||
}
|
||||
|
||||
/// Sets `stale` on each hit in place using `compute_stale`.
|
||||
pub fn mark_stale_in_place(hits: &mut [SearchHit], now: OffsetDateTime, threshold_days: u32) {
|
||||
for h in hits {
|
||||
h.stale = compute_stale(h.indexed_at, now, threshold_days);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use time::macros::datetime;

    /// Fixed reference instant shared by every case in this module.
    fn now() -> OffsetDateTime {
        datetime!(2026-05-09 12:00:00 UTC)
    }

    #[test]
    fn threshold_zero_always_fresh() {
        // A zero threshold disables staleness, however old the timestamp.
        let stale = compute_stale(datetime!(2020-01-01 00:00:00 UTC), now(), 0);
        assert!(!stale);
    }

    #[test]
    fn just_under_threshold_is_fresh() {
        // One minute short of 30 days (29d 23h 59m old).
        let age = Duration::days(30) - Duration::minutes(1);
        assert!(!compute_stale(now() - age, now(), 30));
    }

    #[test]
    fn exactly_threshold_is_fresh() {
        // The comparison is a strict `>`, so exactly 30 days old is fresh.
        let age = Duration::days(30);
        assert!(!compute_stale(now() - age, now(), 30));
    }

    #[test]
    fn one_minute_past_threshold_is_stale() {
        let age = Duration::days(30) + Duration::minutes(1);
        assert!(compute_stale(now() - age, now(), 30));
    }

    #[test]
    fn future_indexed_at_is_fresh() {
        // Clock-skew safety: a timestamp ahead of `now` must never be stale.
        let ahead = now() + Duration::hours(1);
        assert!(!compute_stale(ahead, now(), 30));
    }
}
|
||||
@@ -33,6 +33,7 @@ fn ask_lexical_smoke() {
|
||||
history: Vec::new(),
|
||||
conversation_id: None,
|
||||
turn_index: None,
|
||||
multi_hop: false,
|
||||
};
|
||||
// The fixture workspace contains "ownership" content; the model's
|
||||
// citation behavior depends on its training, so we don't assert on
|
||||
|
||||
1421
crates/kebab-app/tests/code_ingest_smoke.rs
Normal file
1421
crates/kebab-app/tests/code_ingest_smoke.rs
Normal file
File diff suppressed because it is too large
Load Diff
65
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
65
crates/kebab-app/tests/common/mock_ocr.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Lang, OcrText};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
|
||||
/// Scripted OCR engine for tests: hands back pre-seeded texts instead of
/// running recognition.
pub struct MockOcrEngine {
    /// Texts returned by successive `recognize` calls; once exhausted, the
    /// last entry is repeated (or an empty string if there are none).
    expected_texts: Vec<String>,
    /// Index of the next text to hand out; advanced on every non-failing
    /// `recognize` call.
    call_index: Mutex<usize>,
    /// When set, every `recognize` call fails with "mock failure".
    fail: bool,
}
|
||||
|
||||
impl MockOcrEngine {
|
||||
/// Single text (backward-compat ctor for pdf_ocr_apply.rs 10 sites).
|
||||
pub fn single(text: impl Into<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: vec![text.into()],
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page texts (cursor advances per recognize call).
|
||||
pub fn per_page(texts: Vec<String>, fail: bool) -> Self {
|
||||
Self {
|
||||
expected_texts: texts,
|
||||
call_index: Mutex::new(0),
|
||||
fail,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OcrEngine for MockOcrEngine {
|
||||
fn engine_name(&self) -> &'static str {
|
||||
"mock-ocr"
|
||||
}
|
||||
|
||||
fn engine_version(&self) -> String {
|
||||
"mock-v1".to_string()
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_literal_bound)]
|
||||
fn model(&self) -> &str {
|
||||
"mock-model"
|
||||
}
|
||||
|
||||
fn recognize(&self, _img: &[u8], _hint: Option<&Lang>) -> Result<OcrText> {
|
||||
if self.fail {
|
||||
anyhow::bail!("mock failure");
|
||||
}
|
||||
let mut idx = self.call_index.lock().unwrap();
|
||||
let text = self
|
||||
.expected_texts
|
||||
.get(*idx)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| self.expected_texts.last().cloned().unwrap_or_default());
|
||||
*idx += 1;
|
||||
Ok(OcrText {
|
||||
joined: text,
|
||||
regions: vec![],
|
||||
engine: "mock-ocr".to_string(),
|
||||
engine_version: "mock-v1".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -62,8 +62,8 @@ impl TestEnv {
|
||||
// Drop in a small chunk policy so the fixture's small files
|
||||
// emit at least a couple of chunks even with overlap_tokens
|
||||
// honored.
|
||||
config.chunking.target_tokens = 80;
|
||||
config.chunking.overlap_tokens = 20;
|
||||
config.ingest.chunking.target_tokens = 80;
|
||||
config.ingest.chunking.overlap_tokens = 20;
|
||||
|
||||
Self {
|
||||
temp,
|
||||
@@ -79,6 +79,36 @@ impl TestEnv {
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-34 alias — tests added in fb-34 invoke `TestEnv::new()`
|
||||
/// per the plan; route to the existing lexical-only constructor
|
||||
/// so the lane stays AVX-free without churning all the existing
|
||||
/// callers.
|
||||
pub fn new() -> Self {
|
||||
Self::lexical_only()
|
||||
}
|
||||
|
||||
/// p9-fb-34: open a fresh `App` against this env's config. Used
|
||||
/// by integration tests that need to call `App::search_with_opts`
|
||||
/// directly. Caller can invoke this multiple times to simulate
|
||||
/// re-opening the binary after a corpus revision bump.
|
||||
pub fn app(&self) -> kebab_app::App {
|
||||
kebab_app::App::open_with_config(self.config.clone()).expect("App::open_with_config")
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-34: write `content` into the env's workspace at
|
||||
/// `relative_path`, then run a full ingest so the document is
|
||||
/// searchable. Mirrors the convenience helpers used by other
|
||||
/// `TestEnv`-driven crates.
|
||||
pub fn ingest_md(env: &TestEnv, relative_path: &str, content: &str) {
|
||||
let path = env.workspace_root.join(relative_path);
|
||||
if let Some(parent) = path.parent() {
|
||||
std::fs::create_dir_all(parent).expect("create parent dirs");
|
||||
}
|
||||
std::fs::write(&path, content).expect("write workspace file");
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest_with_config");
|
||||
}
|
||||
|
||||
/// Test helper: build a `SearchQuery` for lexical mode at k=10. Used
|
||||
@@ -94,6 +124,29 @@ pub fn lexical_query(text: &str) -> kebab_core::SearchQuery {
|
||||
}
|
||||
}
|
||||
|
||||
/// p9-fb-32: rewrite `documents.updated_at` for one workspace path
|
||||
/// to `now - days_ago` (RFC3339 UTC). Used by staleness integration
|
||||
/// tests to simulate aged-out docs without faking system time. Caller
|
||||
/// is responsible for ingesting the doc *before* calling this — the
|
||||
/// row must already exist.
|
||||
pub fn backdate_document_updated_at(env: &TestEnv, workspace_path: &str, days_ago: i64) {
|
||||
let backdated = (time::OffsetDateTime::now_utc() - time::Duration::days(days_ago))
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
.expect("format backdated updated_at");
|
||||
let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
|
||||
let updated = conn
|
||||
.execute(
|
||||
"UPDATE documents SET updated_at = ?1 WHERE workspace_path = ?2",
|
||||
rusqlite::params![backdated, workspace_path],
|
||||
)
|
||||
.expect("UPDATE documents.updated_at");
|
||||
assert_eq!(
|
||||
updated, 1,
|
||||
"backdate_document_updated_at: expected to update exactly 1 row for {workspace_path}, got {updated}"
|
||||
);
|
||||
}
|
||||
|
||||
fn copy_fixture_workspace(dest: &Path) {
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
@@ -115,3 +168,5 @@ fn copy_dir_recursive(src: &Path, dest: &Path) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock_ocr;
|
||||
|
||||
169
crates/kebab-app/tests/config_invalidation.rs
Normal file
169
crates/kebab-app/tests/config_invalidation.rs
Normal file
@@ -0,0 +1,169 @@
|
||||
//! v0.26.2: ingest-config invalidation — changing a setting that affects
|
||||
//! ingest output auto-re-indexes the affected assets on the next ingest
|
||||
//! (no `--force-reingest`), while changing an unrelated setting does not.
|
||||
//!
|
||||
//! These end-to-end tests exercise the model-free signals (chunking +
|
||||
//! `[ingest.code]` options vs `search` settings). The exhaustive per-setting
|
||||
//! mapping (image OCR / caption, pdf.ocr, code options, search/rag/ui
|
||||
//! invariance) is unit-tested in
|
||||
//! `kebab-app/src/lib.rs::ingest_config_signature_tests` — those toggles
|
||||
//! (OCR/caption) require a live vision endpoint to ingest, so the wiring is
|
||||
//! verified here via the signature-driven chunking path that shares the same
|
||||
//! `effective_parser_version` plumbing.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config, ingest_with_config_opts};
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
/// Seed a workspace with a markdown + a rust file so both the markdown and
|
||||
/// the code ingest paths are exercised. Returns the first-ingest report.
|
||||
fn seed_and_first_ingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
std::fs::write(
|
||||
env.workspace_root.join("demo.rs"),
|
||||
"/// adds two integers\npub fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n",
|
||||
)
|
||||
.unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).expect("first ingest");
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest creates docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest has no unchanged: {first:?}");
|
||||
first
|
||||
}
|
||||
|
||||
fn reingest(env: &TestEnv) -> kebab_core::IngestReport {
|
||||
ingest_with_config_opts(env.config.clone(), env.scope(), false, IngestOpts::default())
|
||||
.expect("re-ingest")
|
||||
}
|
||||
|
||||
/// Ingesting twice with an unchanged config must leave every asset
/// Unchanged — nothing is spuriously re-indexed. Regression guard against
/// over-invalidation.
#[test]
fn identical_config_skips_all_assets() {
    let env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.unchanged, scanned, "every doc Unchanged: {second:?}");
    assert_eq!(second.errors, 0);
}
|
||||
|
||||
/// A change to a shared chunking parameter must re-index EVERY media type
/// (markdown and code in this workspace) with no `--force-reingest`.
#[test]
fn chunking_change_reindexes_all_types() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    // target_tokens is folded into the signature of every type, so bumping
    // it invalidates all of them.
    env.config.ingest.chunking.target_tokens += 100;
    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(
        second.unchanged, 0,
        "chunking change must re-index all: {second:?}"
    );
    assert_eq!(
        second.updated, scanned,
        "every doc re-indexed as Updated: {second:?}"
    );
    assert_eq!(second.errors, 0);
}
|
||||
|
||||
/// A change to an `[ingest.code]` option must re-index the code asset
/// alone, leaving every markdown asset Unchanged.
#[test]
fn code_option_change_reindexes_code_only() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    // Raising max_file_lines keeps the tiny demo.rs in scope, so the only
    // effect is a different code signature.
    env.config.ingest.code.max_file_lines += 1000;
    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(second.new, 0, "no new docs: {second:?}");
    assert_eq!(second.errors, 0);
    assert_eq!(
        second.updated, 1,
        "exactly the code asset re-indexed: {second:?}"
    );
    assert_eq!(
        second.unchanged,
        scanned - 1,
        "all markdown assets stay Unchanged: {second:?}"
    );

    // Per-item view: demo.rs is Updated, every `.md` item is Unchanged.
    let items = second.items.as_ref().expect("items present");
    let code = items
        .iter()
        .find(|item| item.doc_path.0.ends_with("demo.rs"))
        .expect("demo.rs item");
    assert_eq!(
        code.kind,
        IngestItemKind::Updated,
        "demo.rs must be re-indexed: {code:?}"
    );

    items
        .iter()
        .filter(|item| item.doc_path.0.ends_with(".md"))
        .for_each(|i| {
            assert_eq!(
                i.kind,
                IngestItemKind::Unchanged,
                "markdown must be Unchanged: {i:?}"
            );
        });
}
|
||||
|
||||
/// Regression guard: touching settings that do not affect ingest output
/// (`search.default_k`, `search.snippet_chars`, `rag.score_gate`) must NOT
/// re-index anything.
#[test]
fn search_setting_change_reindexes_nothing() {
    let mut env = TestEnv::lexical_only();
    let scanned = seed_and_first_ingest(&env).scanned;

    {
        let cfg = &mut env.config;
        cfg.search.default_k += 5;
        cfg.search.snippet_chars += 50;
        cfg.rag.score_gate = 0.5;
    }
    let second = reingest(&env);

    assert_eq!(second.scanned, scanned);
    assert_eq!(
        second.unchanged, scanned,
        "search/rag changes must not re-index: {second:?}"
    );
    assert_eq!(second.updated, 0, "nothing re-indexed: {second:?}");
    assert_eq!(second.new, 0);
    assert_eq!(second.errors, 0);
}
|
||||
|
||||
/// v3 invariant #1: the `ingest_config_signature` output string is
/// value-based, so it must stay **byte-identical** to v2 even after the
/// struct-path reorganization (media ingest consolidation). If it drifts,
/// an upgrade triggers a full re-index. This is the golden check for the
/// paddle-onnx image branch format.
#[test]
fn ingest_signature_image_paddle_byte_stable() {
    let mut cfg = kebab_config::Config::defaults();
    {
        let ocr = &mut cfg.ingest.image.ocr;
        ocr.enabled = true;
        ocr.engine = "paddle-onnx".into();
    }

    let media = kebab_core::MediaType::Image(kebab_core::ImageType::Png);
    let sig = kebab_app::test_ingest_config_signature(&cfg, &media);

    // Golden: chunk:... |ocr:1:paddle-onnx:<engine_version> |cap:0
    assert!(
        sig.starts_with("chunk:500:80:true:md-heading-v1"),
        "chunk prefix drift: {sig}"
    );
    assert!(sig.contains("|ocr:1:paddle-onnx:"), "ocr token drift: {sig}");
    assert!(sig.ends_with("|cap:0"), "cap token drift: {sig}");
}
|
||||
85
crates/kebab-app/tests/config_migrate.rs
Normal file
85
crates/kebab-app/tests/config_migrate.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::fs;
|
||||
|
||||
/// Walks one config file through dry-run, real migration and a re-run:
/// dry-run leaves the file and backup untouched, the real run writes a
/// `.bak` backup and rewrites the file, and a second real run is a no-op.
#[test]
fn migrate_writes_backup_and_atomic_with_dry_run_noop() {
    let dir = tempfile::tempdir().unwrap();
    let cfg = dir.path().join("config.toml");
    let backup = dir.path().join("config.toml.bak");
    fs::write(
        &cfg,
        "schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude = [\"*.md\"]\n",
    )
    .unwrap();

    // Dry-run: neither the file nor a backup is written.
    let dry = kebab_app::config_migrate_with_config_path(Some(&cfg), true).unwrap();
    assert!(dry.changed);
    assert!(dry.dry_run);
    assert!(dry.backup_path.is_none());
    assert!(!backup.exists());
    assert!(
        fs::read_to_string(&cfg).unwrap().contains("include"),
        "dry-run modified file"
    );

    // Real apply: backup is created and the file is rewritten.
    let applied = kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
    assert!(applied.changed);
    assert!(!applied.dry_run);
    assert!(applied.backup_path.is_some());
    assert!(backup.exists());
    let migrated = fs::read_to_string(&cfg).unwrap();
    assert!(!migrated.contains("include"));
    assert!(migrated.contains("[ingest.code]"));

    // Idempotent: running again reports changed=false.
    let rerun = kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
    assert!(!rerun.changed);
}
|
||||
|
||||
/// Migrating a config path that does not exist must return an error.
#[test]
fn migrate_missing_file_errors() {
    let dir = tempfile::tempdir().unwrap();
    let missing = dir.path().join("nope.toml");
    let outcome = kebab_app::config_migrate_with_config_path(Some(&missing), false);
    assert!(outcome.is_err());
}
|
||||
|
||||
/// The annotated default document must serialize with its section comment
/// and the `[ingest.code]` table header.
#[test]
fn annotated_default_serialization_contains_section_comments() {
    let text = kebab_config::migrate::annotated_default_document().to_string();
    assert!(
        text.contains("code ingest skip 정책"),
        "section comment missing:\n{text}"
    );
    assert!(text.contains("[ingest.code]"));
}
|
||||
|
||||
/// Doctor must fail its `config_migration` check (and the overall report)
/// for an outdated config, point the user at `config migrate`, and pass the
/// same check once the migration has been applied.
#[test]
fn doctor_flags_outdated_config() {
    let dir = tempfile::tempdir().unwrap();
    let cfg = dir.path().join("config.toml");
    fs::write(
        &cfg,
        "schema_version = 1\n\n[workspace]\nroot = \"/n\"\ninclude=[\"*.md\"]\n",
    )
    .unwrap();

    let before = kebab_app::doctor_with_config_path(Some(&cfg)).unwrap();
    let stale_check = before
        .checks
        .iter()
        .find(|c| c.name == "config_migration")
        .unwrap();
    assert!(!stale_check.ok, "outdated config should fail check");
    assert!(stale_check.hint.as_deref().unwrap().contains("config migrate"));
    assert!(!before.ok, "overall doctor should be false");

    // After migrating, the same check passes.
    kebab_app::config_migrate_with_config_path(Some(&cfg), false).unwrap();
    let after = kebab_app::doctor_with_config_path(Some(&cfg)).unwrap();
    let fresh_check = after
        .checks
        .iter()
        .find(|c| c.name == "config_migration")
        .unwrap();
    assert!(fresh_check.ok, "after migrate should pass");
}
|
||||
24
crates/kebab-app/tests/cursor.rs
Normal file
24
crates/kebab-app/tests/cursor.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
//! p9-fb-34: cursor encode/decode round-trip + corpus_revision mismatch.
|
||||
|
||||
use kebab_app::cursor;
|
||||
|
||||
/// Encoding an offset and decoding it against the same revision returns
/// the original offset.
#[test]
fn cursor_roundtrip_preserves_offset() {
    let token = cursor::encode(5, "rev-abc");
    assert_eq!(cursor::decode(&token, "rev-abc").unwrap(), 5);
}
|
||||
|
||||
/// Decoding against a different revision fails with `stale_cursor`, and the
/// message names at least one of the two revisions.
#[test]
fn cursor_decode_rejects_mismatched_revision() {
    let token = cursor::encode(7, "rev-old");
    let err = cursor::decode(&token, "rev-new").unwrap_err();
    assert_eq!(err.code, "stale_cursor");

    let names_a_revision = ["rev-old", "rev-new"]
        .iter()
        .any(|rev| err.message.contains(*rev));
    assert!(names_a_revision);
}
|
||||
|
||||
/// Input that is not a valid cursor token is also reported as
/// `stale_cursor`.
#[test]
fn cursor_decode_rejects_garbage_input() {
    let outcome = cursor::decode("not-base64!!!", "any");
    assert_eq!(outcome.unwrap_err().code, "stale_cursor");
}
|
||||
344
crates/kebab-app/tests/fetch_integration.rs
Normal file
344
crates/kebab-app/tests/fetch_integration.rs
Normal file
@@ -0,0 +1,344 @@
|
||||
//! p9-fb-35 App::fetch integration tests.
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_app::App;
|
||||
use kebab_core::{FetchKind, FetchOpts, FetchQuery};
|
||||
|
||||
/// Open an `App` for `env`; thin wrapper over `TestEnv::app`.
fn open(env: &common::TestEnv) -> App {
    common::TestEnv::app(env)
}
|
||||
|
||||
/// Fetching a chunk with default options returns only the target chunk:
/// no context on either side and no truncation.
#[test]
fn fetch_chunk_returns_target_only_when_no_context() {
    use kebab_core::{SearchFilters, SearchMode, SearchQuery};

    let env = common::TestEnv::new();
    common::ingest_md(
        &env,
        "a.md",
        "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond.\n",
    );
    let app = open(&env);

    // A lexical search is the way this test learns a real chunk id.
    let hits = app
        .search(SearchQuery {
            text: "First".to_string(),
            mode: SearchMode::Lexical,
            k: 1,
            filters: SearchFilters::default(),
        })
        .unwrap();
    let target = FetchQuery::Chunk(hits[0].chunk_id.clone());

    let result = app.fetch(target, FetchOpts::default()).unwrap();

    assert_eq!(result.kind, FetchKind::Chunk);
    assert!(result.chunk.is_some(), "target chunk populated");
    assert!(result.context_before.is_empty());
    assert!(result.context_after.is_empty());
    assert!(!result.truncated);
}
|
||||
|
||||
/// Fetching a chunk with `context: Some(2)` returns the target plus between
/// one and four neighboring chunks.
#[test]
fn fetch_chunk_with_context_returns_neighbors() {
    use kebab_core::{SearchFilters, SearchMode, SearchQuery};

    let env = common::TestEnv::new();
    // v0.17.0 trigram tokenizer: a term needs ≥3 Unicode chars to match.
    // The earlier fixture used 2-char section bodies such as `A1`/`A3`,
    // which get zero hits under trigram. Each section now holds its own
    // unique word of 5+ chars, so the query pins one chunk deterministically.
    let body =
        "# H1\n\napples\n\n# H2\n\nbanana\n\n# H3\n\ncherry\n\n# H4\n\ndurian\n\n# H5\n\nelder\n";
    common::ingest_md(&env, "multi.md", body);
    let app = env.app();

    let hits = app
        .search(SearchQuery {
            text: "cherry".to_string(),
            mode: SearchMode::Lexical,
            k: 1,
            filters: SearchFilters::default(),
        })
        .unwrap();
    let target = FetchQuery::Chunk(hits[0].chunk_id.clone());
    let opts = FetchOpts {
        context: Some(2),
        max_tokens: None,
    };

    let result = app.fetch(target, opts).unwrap();

    assert_eq!(result.kind, FetchKind::Chunk);
    assert!(result.chunk.is_some());
    let total = result.context_before.len() + result.context_after.len();
    assert!(total >= 1, "at least one neighbor expected");
    assert!(total <= 4, "context capped at +-2 ⇒ max 4 neighbors");
}
|
||||
|
||||
/// Fetching a chunk id that does not exist fails, and the error text
/// mentions either `chunk_not_found` or the requested id.
#[test]
fn fetch_chunk_unknown_id_returns_chunk_not_found() {
    let env = common::TestEnv::new();
    let app = env.app();

    let missing = kebab_core::ChunkId("nonexistent-id".to_string());
    let outcome = app.fetch(FetchQuery::Chunk(missing), FetchOpts::default());
    let msg = outcome.unwrap_err().to_string();

    assert!(
        msg.contains("chunk_not_found") || msg.contains("nonexistent-id"),
        "expected chunk_not_found error, got: {msg}"
    );
}
|
||||
|
||||
/// Fetching a whole document returns its text, including the heading and
/// body, without truncation.
#[test]
fn fetch_doc_returns_serialized_markdown() {
    use kebab_core::{SearchFilters, SearchMode, SearchQuery};

    let env = common::TestEnv::new();
    let body = "# Heading One\n\nFirst paragraph.\n\n## Sub\n\nSecond.\n";
    common::ingest_md(&env, "doc.md", body);
    let app = env.app();

    // The doc id is taken from a search hit, which avoids depending on the
    // shape of the list_docs API.
    let hits = app
        .search(SearchQuery {
            text: "First".to_string(),
            mode: SearchMode::Lexical,
            k: 1,
            filters: SearchFilters::default(),
        })
        .unwrap();
    let target = FetchQuery::Doc(hits[0].doc_id.clone());

    let result = app.fetch(target, FetchOpts::default()).unwrap();

    assert_eq!(result.kind, FetchKind::Doc);
    let text = result.text.expect("doc text");
    assert!(
        text.contains("Heading One"),
        "doc text contains heading: {text:?}"
    );
    assert!(text.contains("First paragraph"), "doc text contains body");
    assert!(!result.truncated);
}
|
||||
|
||||
/// Fetching a document by an id that was never ingested must fail with an
/// error whose rendered text contains `doc_not_found`.
#[test]
fn fetch_doc_unknown_id_returns_doc_not_found() {
    let fixture = common::TestEnv::new();
    let app = fixture.app();

    let missing = kebab_core::DocumentId(String::from("nonexistent-doc"));
    let err = app
        .fetch(FetchQuery::Doc(missing), FetchOpts::default())
        .unwrap_err();

    let rendered = err.to_string();
    assert!(rendered.contains("doc_not_found"), "got: {err}");
}
|
||||
|
||||
/// Ingests a document with a long repeated paragraph and fetches it with a
/// `max_tokens` budget of 20. The result must be flagged as truncated and the
/// returned text must be at most 100 characters long.
#[test]
fn fetch_doc_with_max_tokens_truncates() {
    let fixture = common::TestEnv::new();

    // Twenty copies of the sentence give a body well beyond the budget.
    let p = "Lorem ipsum dolor sit amet consectetur adipiscing elit. ".repeat(20);
    let body = format!("# Big\n\n{p}\n");
    common::ingest_md(&fixture, "big.md", &body);

    let app = fixture.app();
    let hits = app
        .search(kebab_core::SearchQuery {
            text: String::from("Lorem"),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let target = hits[0].doc_id.clone();

    let budget = FetchOpts {
        context: None,
        // 20-token budget; the assertion below allows up to 100 characters.
        max_tokens: Some(20),
    };
    let fetched = app.fetch(FetchQuery::Doc(target), budget).unwrap();
    assert!(fetched.truncated);

    let text = fetched.text.expect("doc text");
    let char_len = text.chars().count();
    assert!(char_len <= 100, "trimmed text len {}", char_len);
}
|
||||
|
||||
/// Fetches lines 2 through 4 of a five-item list document and checks that
/// exactly three lines come back, that the echoed `line_start` / `line_end` /
/// `effective_end` are 2 / 4 / 4, and that the result is not truncated.
#[test]
fn fetch_span_returns_line_range() {
    let fixture = common::TestEnv::new();

    // A list is used so the canonical-to-markdown roundtrip emits five
    // single-line entries joined by `\n`. Paragraphs would be joined by
    // `\n\n`, and CommonMark soft breaks inside one paragraph collapse to
    // spaces (see crates/kebab-parse-md/src/blocks.rs `Event::SoftBreak`).
    common::ingest_md(
        &fixture,
        "lines.md",
        "- Line one.\n- Line two.\n- Line three.\n- Line four.\n- Line five.\n",
    );
    let app = fixture.app();

    let hits = app
        .search(kebab_core::SearchQuery {
            text: String::from("Line"),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let span = FetchQuery::Span {
        doc_id,
        line_start: 2,
        line_end: 4,
    };
    let fetched = app.fetch(span, FetchOpts::default()).unwrap();
    assert_eq!(fetched.kind, FetchKind::Span);

    let text = fetched.text.expect("span text");
    assert_eq!(text.lines().count(), 3, "span should be 3 lines: {text:?}");
    assert_eq!(fetched.line_start, Some(2));
    assert_eq!(fetched.line_end, Some(4));
    assert_eq!(fetched.effective_end, Some(4));
    assert!(!fetched.truncated);
}
|
||||
|
||||
/// A span whose `line_end` (999) lies far past the end of the document must
/// be clamped: `effective_end` has to equal the number of lines actually
/// returned, and that number must be non-zero and below the requested end.
#[test]
fn fetch_span_clamps_line_end_when_out_of_range() {
    let env = common::TestEnv::new();
    common::ingest_md(&env, "short.md", "Line one.\nLine two.\n");
    let app = env.app();
    let q = kebab_core::SearchQuery {
        text: "Line".to_string(),
        mode: kebab_core::SearchMode::Lexical,
        k: 1,
        filters: kebab_core::SearchFilters::default(),
    };
    let hits = app.search(q).unwrap();
    let doc_id = hits[0].doc_id.clone();

    let result = app
        .fetch(
            FetchQuery::Span {
                doc_id,
                line_start: 1,
                line_end: 999,
            },
            FetchOpts::default(),
        )
        .unwrap();
    // Same kind check the in-range span test performs.
    assert_eq!(result.kind, FetchKind::Span);

    let text = result.text.expect("span text");
    let actual_lines = text.lines().count();

    // Guard against a vacuous pass: with an empty span, `effective_end ==
    // Some(0)` would satisfy the equality below without any clamping having
    // been exercised.
    assert!(
        actual_lines >= 1,
        "span starting at line 1 must return text: {text:?}"
    );
    // Bound check comes before the cast so `as u32` is provably lossless.
    assert!(actual_lines < 999);
    assert_eq!(result.effective_end, Some(actual_lines as u32));
}
|
||||
|
||||
/// A span request with `line_start == 0` and `line_end == 0` must be
/// rejected with an error whose rendered text contains `invalid_input`.
#[test]
fn fetch_span_invalid_input_when_zero_lines() {
    let fixture = common::TestEnv::new();
    common::ingest_md(&fixture, "a.md", "Line one.\n");
    let app = fixture.app();

    // Resolve a real doc id so the only invalid part of the request is the
    // line range itself.
    let hits = app
        .search(kebab_core::SearchQuery {
            text: String::from("Line"),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let zero_span = FetchQuery::Span {
        doc_id,
        line_start: 0,
        line_end: 0,
    };
    let err = app.fetch(zero_span, FetchOpts::default()).unwrap_err();

    let rendered = err.to_string();
    assert!(rendered.contains("invalid_input"), "got: {err}");
}
|
||||
|
||||
/// Requesting lines 100..200 of a two-item list document must succeed with
/// an empty (but present) `text` and must not set the `truncated` flag.
#[test]
fn fetch_span_line_start_beyond_total_returns_empty_text() {
    let fixture = common::TestEnv::new();
    common::ingest_md(&fixture, "two_lines.md", "- Line one.\n- Line two.\n");
    let app = fixture.app();

    let hits = app
        .search(kebab_core::SearchQuery {
            text: String::from("Line"),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let doc_id = hits[0].doc_id.clone();

    let beyond_end = FetchQuery::Span {
        doc_id,
        line_start: 100,
        line_end: 200,
    };
    let fetched = app.fetch(beyond_end, FetchOpts::default()).unwrap();

    let text = fetched.text.expect("text field");
    assert!(text.is_empty(), "out-of-range request returns empty text");

    let was_truncated = fetched.truncated;
    assert!(
        !was_truncated,
        "out-of-range is NOT truncated (budget-only flag)"
    );
}
|
||||
|
||||
/// Fetches a chunk from a three-section document with `context: Some(2)` and
/// checks that the combined number of neighbors before and after the target
/// never exceeds 2.
#[test]
fn fetch_chunk_context_at_first_chunk_clamps_lower_bound() {
    let fixture = common::TestEnv::new();

    // Three headed sections so that a ±N context window has neighbors.
    common::ingest_md(
        &fixture,
        "boundary.md",
        "# H1\n\nFirst chunk text body.\n\n# H2\n\nSecond chunk.\n\n# H3\n\nThird chunk.\n",
    );
    let app = fixture.app();

    let hits = app
        .search(kebab_core::SearchQuery {
            text: String::from("First"),
            mode: kebab_core::SearchMode::Lexical,
            k: 1,
            filters: kebab_core::SearchFilters::default(),
        })
        .unwrap();
    let target = hits[0].chunk_id.clone();

    let window = FetchOpts {
        context: Some(2),
        max_tokens: None,
    };
    let fetched = app.fetch(FetchQuery::Chunk(target), window).unwrap();

    // p9-fb-35 R2: the doc has 3 chunks, so a ±2 window can contain at most
    // 2 neighbors once the target itself is excluded.
    //
    // A strict "first chunk → context_before is empty" assertion is not made
    // here yet: there is no chunks.ordinal column, and
    // `list_chunk_ids_for_doc` orders by `(created_at, chunk_id)` where
    // chunk_id is a blake3 hash, so the "First chunk" content may land at any
    // hash-order position within the doc. Only the clamp bound is checked.
    // Follow-up: V007 chunks.ordinal migration.
    let before = fetched.context_before.len();
    let after = fetched.context_after.len();
    let total = before + after;
    assert!(
        total <= 2,
        "doc with 3 chunks ±2 → at most 2 neighbors (excludes target), got {total}"
    );
}
|
||||
171
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
171
crates/kebab-app/tests/file_deletion_auto_purge.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
//! Dogfood: auto-purge stored docs for filesystem-deleted files.
|
||||
//!
|
||||
//! Two tests:
|
||||
//!
|
||||
//! 1. `file_deletion_auto_purge` — ingest 2 files, delete one, re-ingest.
|
||||
//! The re-ingest must report `purged_deleted_files = 1`, the deleted
|
||||
//! file must no longer appear in `list_docs`, and lexical search for
|
||||
//! its unique content must return no hits.
|
||||
//!
|
||||
//! 2. `include_scope_narrowing_does_not_purge` — ingest 2 files under a
|
||||
//! wide glob, narrow the walker scope to only one file, re-ingest.
|
||||
//! The narrowed ingest must NOT purge the out-of-scope file because
|
||||
//! the file is still on disk (just excluded from this run). Protects
|
||||
//! users against accidental data loss via config edits.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::ingest_with_config_opts;
|
||||
use kebab_core::{DocFilter, DocumentStore, SearchMode, SearchQuery, SourceScope};
|
||||
|
||||
/// Helper: open the store via `TestEnv` and run `list_documents`.
|
||||
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
let store = SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
store
|
||||
.list_documents(&DocFilter::default())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|d| d.doc_path.0)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Ingests two `.rs` files, deletes one from disk, and re-ingests. The
/// second run must report exactly one purged file and no new, updated or
/// errored docs. The deleted file must disappear from the document listing
/// while the surviving one stays, and a lexical search for the deleted
/// file's unique identifier must return nothing.
#[test]
fn file_deletion_auto_purge() {
    let env = TestEnv::lexical_only();

    // Both ingest passes use the same config, scope and options.
    let run_ingest = || {
        ingest_with_config_opts(
            env.config.clone(),
            env.scope(),
            false,
            IngestOpts::default(),
        )
    };

    // Two .rs files in the workspace.
    let a_path = env.workspace_root.join("a.rs");
    let b_path = env.workspace_root.join("b.rs");
    std::fs::write(&a_path, "// file a\nfn alpha() {}\n").unwrap();
    std::fs::write(&b_path, "// file b\nfn bravo() {}\n").unwrap();

    // Pass 1: both files are new. The bound is `>=` because the workspace
    // may contain fixture files as well.
    let first = run_ingest().expect("first ingest must succeed");
    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );
    assert_eq!(first.errors, 0, "no errors on first ingest: {first:?}");

    // Remove b.rs from the filesystem.
    std::fs::remove_file(&b_path).expect("remove b.rs");

    // Pass 2: b.rs is gone from disk and must be purged.
    let second = run_ingest().expect("second ingest must succeed");
    assert_eq!(
        second.purged_deleted_files, 1,
        "exactly 1 file should be purged: {second:?}"
    );
    assert_eq!(second.new, 0, "no new docs after deletion: {second:?}");
    assert_eq!(second.updated, 0, "no updated docs: {second:?}");
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The listing must have dropped b.rs and kept a.rs.
    let doc_paths = list_doc_paths(&env);
    let b_listed = doc_paths.iter().any(|p| p.as_str() == "b.rs");
    assert!(
        !b_listed,
        "b.rs must be gone from list_docs; got: {doc_paths:?}"
    );
    let a_listed = doc_paths.iter().any(|p| p.as_str() == "a.rs");
    assert!(
        a_listed,
        "a.rs must still be in list_docs; got: {doc_paths:?}"
    );

    // "bravo" only occurred in b.rs, so a lexical search must come up empty.
    let app = env.app();
    let hits = app
        .search(SearchQuery {
            text: String::from("bravo"),
            mode: SearchMode::Lexical,
            k: 10,
            filters: kebab_core::SearchFilters::default(),
        })
        .expect("search must not error");
    assert!(
        hits.is_empty(),
        "search for deleted file's content must return no hits; got: {hits:?}"
    );
}
|
||||
|
||||
/// Ingests two `.rs` files under a wide include glob, then re-ingests with
/// the include list narrowed to just one of them. The narrowed run must not
/// purge the other file: it is still on disk, so it must stay in the
/// document listing and `purged_deleted_files` must remain 0.
#[test]
fn include_scope_narrowing_does_not_purge() {
    let env = TestEnv::lexical_only();

    // Both scopes share root and exclude list; only `include` differs.
    let scope_with = |include: Vec<String>| SourceScope {
        root: env.workspace_root.clone(),
        include,
        exclude: env.config.workspace.exclude.clone(),
    };

    // Two .rs files in the workspace.
    let a_path = env.workspace_root.join("a_narrow.rs");
    let b_path = env.workspace_root.join("b_narrow.rs");
    std::fs::write(&a_path, "// narrow a\nfn alpha_narrow() {}\n").unwrap();
    std::fs::write(&b_path, "// narrow b\nfn bravo_narrow() {}\n").unwrap();

    // Wide pass: every .rs file is in scope, so both are new.
    let first = ingest_with_config_opts(
        env.config.clone(),
        scope_with(vec![String::from("**/*.rs")]),
        false,
        IngestOpts::default(),
    )
    .expect("first ingest (wide) must succeed");
    assert!(first.new >= 2, "expected at least 2 new docs: {first:?}");
    assert_eq!(
        first.purged_deleted_files, 0,
        "no purges on first ingest: {first:?}"
    );

    // Narrow pass: only a_narrow.rs is included. b_narrow.rs is untouched on
    // disk but outside the walker scope for this run.
    let second = ingest_with_config_opts(
        env.config.clone(),
        scope_with(vec![String::from("a_narrow.rs")]),
        false,
        IngestOpts::default(),
    )
    .expect("second ingest (narrow) must succeed");

    // CRITICAL: an out-of-scope file that still exists must NOT be purged.
    assert_eq!(
        second.purged_deleted_files, 0,
        "scope-narrowing must NOT purge on-disk files; got: {second:?}"
    );
    assert_eq!(second.errors, 0, "no errors: {second:?}");

    // The store must still list b_narrow.rs.
    let doc_paths = list_doc_paths(&env);
    let b_listed = doc_paths.iter().any(|p| p.as_str() == "b_narrow.rs");
    assert!(
        b_listed,
        "b_narrow.rs must still be in list_docs after scope narrowing; got: {doc_paths:?}"
    );

    // The file itself must also still be present on disk.
    let still_on_disk = b_path.exists();
    assert!(
        still_on_disk,
        "b_narrow.rs must still be on disk (we didn't delete it)"
    );
}
|
||||
@@ -24,8 +24,7 @@ use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
/// inspectable in stored DB rows.
|
||||
fn write_red_png(root: &Path, name: &str) -> std::path::PathBuf {
|
||||
use image::{ImageBuffer, Rgb};
|
||||
let img: ImageBuffer<Rgb<u8>, _> =
|
||||
ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let img: ImageBuffer<Rgb<u8>, _> = ImageBuffer::from_fn(100, 50, |_, _| Rgb([255, 0, 0]));
|
||||
let path = root.join(name);
|
||||
img.save(&path).expect("write PNG fixture");
|
||||
path
|
||||
@@ -35,11 +34,11 @@ fn cfg_with_image_pipeline(env: &TestEnv, mock_endpoint: &str) -> Config {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = true;
|
||||
cfg.image.ocr.endpoint = Some(mock_endpoint.to_string());
|
||||
cfg.image.ocr.model = "vision-mock:1b".to_string();
|
||||
cfg.image.ocr.max_pixels = 512;
|
||||
cfg.image.caption.enabled = false; // tested separately below
|
||||
cfg.ingest.image.ocr.enabled = true;
|
||||
cfg.ingest.image.ocr.endpoint = Some(mock_endpoint.to_string());
|
||||
cfg.ingest.image.ocr.model = "vision-mock:1b".to_string();
|
||||
cfg.ingest.image.ocr.max_pixels = 512;
|
||||
cfg.ingest.image.caption.enabled = false; // tested separately below
|
||||
cfg.models.llm.endpoint = mock_endpoint.to_string();
|
||||
cfg.models.llm.model = "vision-mock:1b".to_string();
|
||||
cfg
|
||||
@@ -80,7 +79,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Counters: scanned should include the PNG; new ≥ 1 (markdown
|
||||
// fixtures from the workspace tree may also count).
|
||||
assert!(report.scanned >= 1, "scanned={}, items={:?}", report.scanned, report.items);
|
||||
assert!(
|
||||
report.scanned >= 1,
|
||||
"scanned={}, items={:?}",
|
||||
report.scanned,
|
||||
report.items
|
||||
);
|
||||
assert_eq!(report.errors, 0, "no errors on lenient OCR path");
|
||||
|
||||
// Locate the image doc in the report items.
|
||||
@@ -94,7 +98,11 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
kebab_core::IngestItemKind::New,
|
||||
"image asset must be classified New on first ingest"
|
||||
);
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits exactly one chunk");
|
||||
assert_eq!(
|
||||
img_item.chunk_count,
|
||||
Some(1),
|
||||
"image emits exactly one chunk"
|
||||
);
|
||||
|
||||
// Inspect the stored chunk text via kb-app's inspect_chunk facade.
|
||||
let doc_id = img_item.doc_id.clone().expect("image doc id");
|
||||
@@ -117,10 +125,12 @@ async fn ingest_image_with_ocr_produces_chunk_containing_ocr_text() {
|
||||
|
||||
// Sanity: the doc was actually persisted into SQLite (kb-app's
|
||||
// list_docs facade reads the same store the chunker writes to).
|
||||
let summaries = kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.expect("list_docs");
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).expect("list_docs");
|
||||
assert!(
|
||||
summaries.iter().any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
summaries
|
||||
.iter()
|
||||
.any(|s| s.doc_path.0.ends_with("diagram.png")),
|
||||
"image doc must appear in list_docs"
|
||||
);
|
||||
|
||||
@@ -151,8 +161,8 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
let env = TestEnv::lexical_only();
|
||||
write_red_png(&env.workspace_root, "diagram.png");
|
||||
let mut cfg = cfg_with_image_pipeline(&env, &server.uri());
|
||||
cfg.image.caption.enabled = true;
|
||||
cfg.image.caption.max_pixels = 384;
|
||||
cfg.ingest.image.caption.enabled = true;
|
||||
cfg.ingest.image.caption.max_pixels = 384;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
@@ -171,8 +181,7 @@ async fn ingest_image_with_ocr_and_caption_populates_both_fields() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("diagram.png"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -261,14 +270,13 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
let report = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false)
|
||||
.expect("ingest with no OCR/caption")
|
||||
kebab_app::ingest_with_config(cfg_clone, scope, false).expect("ingest with no OCR/caption")
|
||||
})
|
||||
.await
|
||||
.expect("task");
|
||||
@@ -282,8 +290,7 @@ async fn image_indexed_with_filename_when_ocr_and_caption_disabled() {
|
||||
.find(|i| i.doc_path.0.ends_with("raw.png"))
|
||||
.unwrap();
|
||||
assert_eq!(img_item.chunk_count, Some(1), "image emits one chunk");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap())
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, img_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let block = match &doc.blocks[0] {
|
||||
kebab_core::Block::ImageRef(b) => b,
|
||||
_ => unreachable!(),
|
||||
@@ -327,8 +334,8 @@ async fn garbage_png_increments_errors_counter_exactly_once() {
|
||||
let mut cfg = env.config.clone();
|
||||
// p9-fb-25: workspace.include removed; extension routing is now
|
||||
// handled by extractor matching alone (no config knob).
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
|
||||
let cfg_clone = cfg.clone();
|
||||
let scope = env.scope();
|
||||
@@ -392,16 +399,12 @@ async fn re_ingest_image_produces_unchanged_with_same_doc_id() {
|
||||
let scope1 = scope.clone();
|
||||
let scope2 = scope.clone();
|
||||
|
||||
let r1 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg1, scope1, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || {
|
||||
kebab_app::ingest_with_config(cfg2, scope2, false).unwrap()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let r1 = spawn_blocking(move || kebab_app::ingest_with_config(cfg1, scope1, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
let r2 = spawn_blocking(move || kebab_app::ingest_with_config(cfg2, scope2, false).unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let id1 = r1
|
||||
.items
|
||||
|
||||
@@ -21,11 +21,16 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
// First ingest — populates the DB. Use the legacy entry so the
|
||||
// assertions cover the "previously ingested" set without needing
|
||||
// IngestOpts::default() to behave identically.
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert_eq!(first.unchanged, 0, "first ingest cannot have unchanged: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
first.unchanged, 0,
|
||||
"first ingest cannot have unchanged: {first:?}"
|
||||
);
|
||||
|
||||
let scanned = first.scanned;
|
||||
|
||||
@@ -38,9 +43,15 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(second.scanned, scanned, "second scanned matches first: {second:?}");
|
||||
assert_eq!(
|
||||
second.scanned, scanned,
|
||||
"second scanned matches first: {second:?}"
|
||||
);
|
||||
assert_eq!(second.new, 0, "no new docs on re-ingest: {second:?}");
|
||||
assert_eq!(second.updated, 0, "nothing should be marked updated: {second:?}");
|
||||
assert_eq!(
|
||||
second.updated, 0,
|
||||
"nothing should be marked updated: {second:?}"
|
||||
);
|
||||
assert_eq!(
|
||||
second.unchanged, scanned,
|
||||
"every doc must be Unchanged: {second:?}"
|
||||
@@ -52,10 +63,12 @@ fn second_ingest_of_unchanged_corpus_marks_all_unchanged() {
|
||||
fn force_reingest_bypasses_skip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let first =
|
||||
ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let first = ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(first.errors, 0, "first ingest must not error: {first:?}");
|
||||
assert!(first.new >= 1, "first ingest must create new docs: {first:?}");
|
||||
assert!(
|
||||
first.new >= 1,
|
||||
"first ingest must create new docs: {first:?}"
|
||||
);
|
||||
let scanned = first.scanned;
|
||||
|
||||
let second = ingest_with_config_opts(
|
||||
|
||||
@@ -107,13 +107,9 @@ fn cancel_none_is_uncancellable_default() {
|
||||
// ingest_with_config_progress (no cancel) runs to completion.
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ fn ingest_file_copies_external_md_and_reports_new() {
|
||||
assert!(ext_dir.is_dir());
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1, "exactly one file in _external/");
|
||||
let name = entries[0].file_name().to_string_lossy().into_owned();
|
||||
@@ -107,5 +107,8 @@ fn ingest_file_errors_on_unsupported_extension() {
|
||||
|
||||
let err = kebab_app::ingest_file_with_config(cfg, &docx).unwrap_err();
|
||||
assert!(err.to_string().contains("unsupported extension"), "{err}");
|
||||
assert!(err.to_string().contains(".docx") || err.to_string().contains("docx"), "{err}");
|
||||
assert!(
|
||||
err.to_string().contains(".docx") || err.to_string().contains("docx"),
|
||||
"{err}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -8,8 +8,7 @@ use common::TestEnv;
|
||||
#[test]
|
||||
fn ingest_then_list_inspects_round_trip() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
// The fixture has 3 markdown files; first ingest should label them
|
||||
// all as New.
|
||||
@@ -27,17 +26,14 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
}
|
||||
|
||||
// list_docs returns the 3 docs.
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3, "docs: {docs:?}");
|
||||
|
||||
// inspect_doc round-trips one of them.
|
||||
let any_doc_id = docs[0].doc_id.clone();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id)
|
||||
.unwrap();
|
||||
let canonical = kebab_app::inspect_doc_with_config(env.config.clone(), &any_doc_id).unwrap();
|
||||
assert_eq!(canonical.doc_id, any_doc_id);
|
||||
assert!(!canonical.blocks.is_empty(), "blocks empty");
|
||||
}
|
||||
@@ -46,12 +42,10 @@ fn ingest_then_list_inspects_round_trip() {
|
||||
fn ingest_idempotent_on_second_run() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
let r1 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r1 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(r1.new, 3);
|
||||
|
||||
let r2 =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let r2 = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
// Same files re-ingested — p9-fb-23 task 7 introduced the early-skip
|
||||
// path: when checksum + parser/chunker/embedding versions all match,
|
||||
// the second run reports `Unchanged` rather than `Updated`. Pre-p9-fb-23
|
||||
@@ -63,19 +57,16 @@ fn ingest_idempotent_on_second_run() {
|
||||
assert_eq!(r2.unchanged, 3, "second run unchanged: {r2:?}");
|
||||
|
||||
// list_docs still has 3 docs (no duplicates).
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert_eq!(docs.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ingest_summary_only_drops_items() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert!(report.items.is_none(), "summary-only should null items");
|
||||
}
|
||||
@@ -87,12 +78,10 @@ fn ingest_records_ingest_runs_row_with_aggregate_counts() {
|
||||
// of every run. `summary_only=true` writes `items_json=NULL`; the
|
||||
// counts MUST still be present.
|
||||
let env = TestEnv::lexical_only();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("kebab.sqlite");
|
||||
let db_path = std::path::PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open kebab.sqlite");
|
||||
let (scanned, new_c, updated, skipped, errors, items_json): (
|
||||
i64,
|
||||
@@ -141,25 +130,18 @@ fn ingest_provider_none_skips_lance() {
|
||||
// tree shape (no `<data_dir>/lancedb` directory, or no `*.lance`
|
||||
// tables under it).
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "lexical-only run must not error");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir)
|
||||
.join("lancedb");
|
||||
let lance_dir = std::path::PathBuf::from(&env.config.storage.data_dir).join("lancedb");
|
||||
if lance_dir.exists() {
|
||||
// If the dir was created (e.g., by an earlier consumer touching
|
||||
// the path), it MUST contain no `.lance` tables.
|
||||
let mut had_lance_table = false;
|
||||
for entry in std::fs::read_dir(&lance_dir).expect("read lance_dir") {
|
||||
let entry = entry.unwrap();
|
||||
if entry
|
||||
.path()
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
== Some("lance")
|
||||
{
|
||||
if entry.path().extension().and_then(|s| s.to_str()) == Some("lance") {
|
||||
had_lance_table = true;
|
||||
break;
|
||||
}
|
||||
@@ -189,8 +171,7 @@ fn list_docs_filters_by_tags_any() {
|
||||
tags_any: vec!["rust".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let rust_docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
let rust_docs = kebab_app::list_docs_with_config(env.config.clone(), rust_filter).unwrap();
|
||||
// intro.md and notes/cargo.md both tag "rust".
|
||||
assert_eq!(rust_docs.len(), 2, "expected 2 rust docs: {rust_docs:?}");
|
||||
}
|
||||
@@ -198,8 +179,9 @@ fn list_docs_filters_by_tags_any() {
|
||||
#[test]
|
||||
fn inspect_doc_not_found_returns_actionable_error() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bogus =
|
||||
kebab_core::DocumentId("0000000000000000000000000000000000000000000000000000000000000000".to_string());
|
||||
let bogus = kebab_core::DocumentId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_doc_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(
|
||||
@@ -218,8 +200,7 @@ fn inspect_chunk_not_found_returns_actionable_error() {
|
||||
let bogus = kebab_core::ChunkId(
|
||||
"0000000000000000000000000000000000000000000000000000000000000000".to_string(),
|
||||
);
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus)
|
||||
.unwrap_err();
|
||||
let err = kebab_app::inspect_chunk_with_config(env.config.clone(), &bogus).unwrap_err();
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("not found"), "got: {msg}");
|
||||
}
|
||||
@@ -251,22 +232,18 @@ fn ingest_with_config_opts_default_matches_legacy_behaviour() {
|
||||
#[test]
|
||||
fn ingest_stamps_chunker_version_on_document() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
assert!(report.new >= 1, "expected at least one new doc: {report:?}");
|
||||
assert_eq!(report.errors, 0, "no errors expected: {report:?}");
|
||||
|
||||
let docs = kebab_app::list_docs_with_config(
|
||||
env.config.clone(),
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let docs =
|
||||
kebab_app::list_docs_with_config(env.config.clone(), kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
assert!(!docs.is_empty(), "no docs after ingest");
|
||||
|
||||
for doc_entry in &docs {
|
||||
let canonical =
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id)
|
||||
.unwrap();
|
||||
kebab_app::inspect_doc_with_config(env.config.clone(), &doc_entry.doc_id).unwrap();
|
||||
assert!(
|
||||
canonical.last_chunker_version.is_some(),
|
||||
"last_chunker_version must be stamped for doc {}: got {:?}",
|
||||
|
||||
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
171
crates/kebab-app/tests/ingest_log_smoke.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
// crates/kebab-app/tests/ingest_log_smoke.rs
|
||||
//
|
||||
// Integration tests for ingest_log feature (v0.20.x). Spec §5 AC-9 + AC-6.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_app::{IngestOpts, ingest_with_config_opts};
|
||||
use kebab_config::{Config, LoggingCfg};
|
||||
use kebab_core::SourceScope;
|
||||
use serde_json::Value;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn minimal_config(workspace: &std::path::Path, log_dir: &std::path::Path) -> Config {
|
||||
let data_dir = workspace.parent().unwrap().join("data");
|
||||
std::fs::create_dir_all(&data_dir).unwrap();
|
||||
let model_dir = workspace.parent().unwrap().join("models");
|
||||
std::fs::create_dir_all(&model_dir).unwrap();
|
||||
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = model_dir.to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.ingest.chunking.target_tokens = 80;
|
||||
cfg.ingest.chunking.overlap_tokens = 20;
|
||||
cfg.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.to_path_buf(),
|
||||
..Default::default()
|
||||
};
|
||||
cfg
|
||||
}
|
||||
|
||||
/// AC-9: after an ingest with logging enabled, exactly one `ingest-*.ndjson`
/// file exists, every line is valid JSON carrying a known `kind`, and the last
/// line is the `summary` record with `scanned > 0`.
#[test]
fn ingest_log_smoke() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    // 1. Minimal corpus: one markdown file, plus the scanned PDF fixture when
    //    it is reachable from the current directory. The copy is best-effort;
    //    the test still runs with the markdown file alone.
    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nThis is a smoke test.\n",
    )
    .unwrap();
    let pdf_src = PathBuf::from("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    if pdf_src.exists() {
        std::fs::copy(&pdf_src, workspace.join("scanned.pdf")).unwrap();
    }

    // 2. Configuration with ingest logging enabled, and the scope to ingest.
    let cfg = minimal_config(&workspace, &log_dir);
    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    // 3. Run the ingest.
    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // 4. Collect the `ingest-*.ndjson` entries of the log directory;
    //    unreadable directory entries are skipped.
    let mut log_files = Vec::new();
    for dir_entry in std::fs::read_dir(&log_dir).unwrap() {
        let Ok(dir_entry) = dir_entry else { continue };
        let os_name = dir_entry.file_name();
        let file_name = os_name.to_string_lossy();
        if file_name.starts_with("ingest-") && file_name.ends_with(".ndjson") {
            log_files.push(dir_entry);
        }
    }
    assert_eq!(
        log_files.len(),
        1,
        "expected exactly 1 ingest-*.ndjson file, found: {log_files:?}"
    );

    // 5. Every line must parse as JSON and carry one of the known kinds.
    let body = std::fs::read_to_string(log_files[0].path()).unwrap();
    let lines: Vec<&str> = body.lines().collect();
    assert!(!lines.is_empty(), "log file should not be empty");

    let valid_kinds = ["ocr", "parse_error", "skip", "error", "summary"];
    for line in &lines {
        let record: Value = match serde_json::from_str(line) {
            Ok(record) => record,
            Err(e) => panic!("line is not valid JSON: {e}\nline: {line}"),
        };
        let Some(kind) = record.get("kind").and_then(|k| k.as_str()) else {
            panic!("line missing 'kind' field: {line}");
        };
        assert!(
            valid_kinds.contains(&kind),
            "unexpected kind '{kind}' in line: {line}"
        );
    }

    // 6. The final record is the summary and reports at least one scanned
    //    asset. A missing or non-integer `scanned` counts as 0.
    let last = lines.last().unwrap();
    let summary: Value = serde_json::from_str(last).unwrap();
    let summary_kind = summary.get("kind").and_then(|k| k.as_str());
    assert_eq!(
        summary_kind,
        Some("summary"),
        "last line must be kind=summary, got: {last}"
    );
    let scanned = match summary.get("scanned").and_then(Value::as_u64) {
        Some(count) => count,
        None => 0,
    };
    assert!(scanned > 0, "summary.scanned should be > 0, got: {last}");
}
|
||||
|
||||
/// AC-6: with `ingest_log_enabled = false`, an ingest run must not create any
/// `ingest-*.ndjson` file.
///
/// The configuration is built by `minimal_config` (the same helper the
/// enabled-log smoke test uses) with only the logging flag flipped, so the two
/// tests cannot drift apart. As a consequence this test also runs with that
/// helper's chunking settings rather than the defaults, which has no bearing
/// on the assertion below.
#[test]
fn ingest_log_disabled_emits_no_file() {
    let tmp = TempDir::new().unwrap();
    let workspace = tmp.path().join("kb");
    std::fs::create_dir_all(&workspace).unwrap();
    let log_dir = tmp.path().join("logs");

    std::fs::write(
        workspace.join("hello.md"),
        "# Hello\n\nDisabled log test.\n",
    )
    .unwrap();

    // Same setup as the enabled case (data/ and models/ are created next to
    // the workspace, embedding is off), but with ingest logging turned off.
    let mut cfg = minimal_config(&workspace, &log_dir);
    cfg.logging.ingest_log_enabled = false;

    let scope = SourceScope {
        root: workspace.clone(),
        exclude: vec![],
        ..Default::default()
    };

    ingest_with_config_opts(cfg, scope, false, IngestOpts::default())
        .expect("ingest should succeed");

    // log_dir should either not exist or contain 0 ingest-*.ndjson files.
    let log_file_count = if log_dir.exists() {
        std::fs::read_dir(&log_dir)
            .unwrap()
            .filter_map(Result::ok)
            .filter(|entry| {
                let os_name = entry.file_name();
                let file_name = os_name.to_string_lossy();
                file_name.starts_with("ingest-") && file_name.ends_with(".ndjson")
            })
            .count()
    } else {
        0
    };
    assert_eq!(
        log_file_count, 0,
        "no ingest-*.ndjson file should be created when disabled"
    );
}
|
||||
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
117
crates/kebab-app/tests/ingest_pdf_ocr_smoke.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Integration smoke tests for the PDF OCR pipeline (§ Acceptance §9 #1 + #2).
|
||||
//!
|
||||
//! Tests 1 and 2 require a live Ollama endpoint — `#[ignore]` by default.
|
||||
//! Manual invoke:
|
||||
//! KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
|
||||
//! cargo test -p kebab-app --test ingest_pdf_ocr_smoke --ignored -j 4
|
||||
//!
|
||||
//! Test 3 (cancel) uses a dummy endpoint + pre-set cancel — runs by default
|
||||
//! to verify the cancel wiring doesn't panic/deadlock.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
/// Returns the OCR endpoint taken from the `KEBAB_PDF_OCR_ENDPOINT`
/// environment variable, falling back to `http://localhost:11434` when the
/// variable is unset or not valid Unicode.
fn ollama_endpoint() -> String {
    match std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        Ok(endpoint) => endpoint,
        Err(_) => String::from("http://localhost:11434"),
    }
}
|
||||
|
||||
fn make_ocr_env_real() -> TestEnv {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.ingest.pdf.ocr.enabled = true;
|
||||
env.config.ingest.pdf.ocr.endpoint = Some(ollama_endpoint());
|
||||
env.config.models.embedding.provider = "none".to_string();
|
||||
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
|
||||
let dest = env.workspace_root.join("scanned_page1.pdf");
|
||||
std::fs::copy(&src, &dest).expect("copy scanned_page1.pdf to workspace");
|
||||
|
||||
env
|
||||
}
|
||||
|
||||
/// § Acceptance §9 #1 — ingesting the scanned fixture with OCR enabled must
/// report a PDF item whose `pdf_ocr_pages` is `Some(1)`.
///
/// Despite the name, this test uses the environment from
/// `make_ocr_env_real()` (a configured Ollama endpoint), hence `#[ignore]`.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ingest_with_mock_ocr_yields_pdf_ocr_summary() {
    let env = make_ocr_env_real();

    let report =
        kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
    assert!(report.new >= 1, "at least one PDF ingested: {report:?}");

    // Locate the report entry for the PDF by its path suffix.
    let items = report.items.unwrap_or_default();
    let Some(pdf_item) = items.iter().find(|item| item.doc_path.0.ends_with(".pdf")) else {
        panic!("PDF item must appear in ingest report items: {items:?}");
    };

    let Some(ocr_pages) = pdf_item.pdf_ocr_pages else {
        panic!("pdf_ocr_pages must be set for scanned PDF: {pdf_item:?}");
    };
    assert_eq!(ocr_pages, 1, "scanned_page1.pdf has exactly 1 page");
}
|
||||
|
||||
/// § Acceptance §9 #2 — text produced by OCR is indexed and can be found
/// through lexical search.
#[test]
#[ignore = "real Ollama qwen2.5vl:3b dependency"]
fn ocr_text_indexed_and_searchable() {
    let env = make_ocr_env_real();

    kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");

    // "다음" is a high-frequency token in the page1.txt ground-truth file of
    // the PoC page, so it is expected to appear in the qwen2.5vl:3b OCR
    // output as well.
    let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("다음"))
        .expect("search");

    let found_any = !hits.is_empty();
    assert!(
        found_any,
        "OCR-indexed text must surface in lexical search results"
    );
}
|
||||
|
||||
/// Smoke test for the production cancel wiring: the cancel flag is already
/// set when ingest starts.
///
/// The OCR endpoint is a dummy (`127.0.0.1:1`), so any OCR HTTP call would
/// fail; with the flag pre-set the ingest is expected to stop before reaching
/// OCR. The test asserts nothing about the outcome — it only has to return,
/// i.e. neither panic nor deadlock.
#[test]
fn ingest_with_cancel_aborts_mid_pdf() {
    let mut env = TestEnv::lexical_only();
    env.config.ingest.pdf.ocr.enabled = true;
    env.config.ingest.pdf.ocr.endpoint = Some(String::from("http://127.0.0.1:1"));

    // Place the scanned fixture from the sibling crate into the workspace.
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let fixture = manifest_dir
        .parent()
        .unwrap()
        .join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let target = env.workspace_root.join("scanned_page1.pdf");
    std::fs::copy(&fixture, &target).expect("copy scanned_page1.pdf to workspace");

    // Pre-set flag: cancellation is requested before ingest begins.
    let cancel_flag = Arc::new(AtomicBool::new(true));

    // Both Ok (pre-cancel exit) and Err (eager OCR engine failure) are
    // acceptable, so the result is deliberately discarded.
    let _ = kebab_app::ingest_with_config_cancellable(
        env.config.clone(),
        env.scope(),
        false,
        None,
        Some(cancel_flag),
    );
}
|
||||
@@ -13,13 +13,9 @@ use kebab_core::IngestItemKind;
|
||||
fn run_with_progress() -> Vec<IngestEvent> {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), false, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -73,40 +69,74 @@ fn progress_event_sequence_matches_design_section_2_4a() {
|
||||
other => panic!("expected Completed last, got {other:?}"),
|
||||
}
|
||||
|
||||
// Middle: 3 AssetStarted/AssetFinished pairs in monotonic idx order.
|
||||
let asset_events: Vec<&IngestEvent> = events[2..events.len() - 1].iter().collect();
|
||||
assert_eq!(
|
||||
asset_events.len(),
|
||||
6,
|
||||
"expected 3 (Started + Finished) pairs, got {asset_events:?}"
|
||||
);
|
||||
for (chunk_idx, pair) in asset_events.chunks(2).enumerate() {
|
||||
let expected_idx = chunk_idx as u32 + 1;
|
||||
match (pair[0], pair[1]) {
|
||||
(
|
||||
IngestEvent::AssetStarted {
|
||||
idx: si,
|
||||
total: st,
|
||||
media,
|
||||
..
|
||||
},
|
||||
IngestEvent::AssetFinished {
|
||||
idx: fi,
|
||||
total: ft,
|
||||
result,
|
||||
chunks,
|
||||
},
|
||||
) => {
|
||||
assert_eq!(*si, expected_idx, "Started idx mismatch: {pair:?}");
|
||||
assert_eq!(*fi, expected_idx, "Finished idx mismatch: {pair:?}");
|
||||
assert_eq!(*st, 3, "Started total mismatch");
|
||||
assert_eq!(*ft, 3, "Finished total mismatch");
|
||||
assert_eq!(media, "markdown", "fixture is markdown only");
|
||||
assert_eq!(*result, IngestItemKind::New, "first ingest → New");
|
||||
assert!(*chunks >= 1, "chunks: {pair:?}");
|
||||
// Middle (v0.24.0 ordering invariant §2.4a): per asset the stream is
|
||||
// AssetStarted < AssetChunked < [ExpansionProgress*] < AssetTimings
|
||||
// < AssetFinished
|
||||
// Expansion is disabled in the lexical fixture, so no ExpansionProgress
|
||||
// frames appear here — but AssetChunked + AssetTimings are emitted for
|
||||
// every markdown asset.
|
||||
let middle = &events[2..events.len() - 1];
|
||||
|
||||
// 3 AssetStarted events, monotonic idx 1..=3, all markdown, total = 3.
|
||||
let started: Vec<u32> = middle
|
||||
.iter()
|
||||
.filter_map(|e| match e {
|
||||
IngestEvent::AssetStarted {
|
||||
idx, total, media, ..
|
||||
} => {
|
||||
assert_eq!(*total, 3, "Started total mismatch: {e:?}");
|
||||
assert_eq!(media, "markdown", "fixture is markdown only: {e:?}");
|
||||
Some(*idx)
|
||||
}
|
||||
other => panic!("expected Started+Finished pair, got {other:?}"),
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(started, vec![1, 2, 3], "AssetStarted idx order: {middle:?}");
|
||||
|
||||
// 3 AssetFinished events, monotonic idx 1..=3, each New with ≥1 chunk.
|
||||
let finished: Vec<u32> = middle
|
||||
.iter()
|
||||
.filter_map(|e| match e {
|
||||
IngestEvent::AssetFinished {
|
||||
idx,
|
||||
total,
|
||||
result,
|
||||
chunks,
|
||||
} => {
|
||||
assert_eq!(*total, 3, "Finished total mismatch: {e:?}");
|
||||
assert_eq!(*result, IngestItemKind::New, "first ingest → New: {e:?}");
|
||||
assert!(*chunks >= 1, "chunks: {e:?}");
|
||||
Some(*idx)
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(finished, vec![1, 2, 3], "AssetFinished idx order: {middle:?}");
|
||||
|
||||
// v0.24.0 additive events: exactly one AssetChunked + one AssetTimings
|
||||
// per asset, each strictly bracketed by that asset's Started / Finished.
|
||||
for target in 1u32..=3 {
|
||||
let started_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetStarted { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetStarted for idx {target}: {middle:?}"));
|
||||
let finished_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetFinished { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetFinished for idx {target}: {middle:?}"));
|
||||
let chunked_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetChunked { idx, chunks, .. } if *idx == target && *chunks >= 1))
|
||||
.unwrap_or_else(|| panic!("missing AssetChunked for idx {target}: {middle:?}"));
|
||||
let timings_at = middle
|
||||
.iter()
|
||||
.position(|e| matches!(e, IngestEvent::AssetTimings { idx, .. } if *idx == target))
|
||||
.unwrap_or_else(|| panic!("missing AssetTimings for idx {target}: {middle:?}"));
|
||||
assert!(
|
||||
started_at < chunked_at && chunked_at < timings_at && timings_at < finished_at,
|
||||
"idx {target} ordering: started={started_at} chunked={chunked_at} \
|
||||
timings={timings_at} finished={finished_at}: {middle:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,13 +146,9 @@ fn ingest_with_config_progress_none_matches_ingest_with_config() {
|
||||
// `ingest_with_config_progress(..., None)` must produce identical
|
||||
// reports modulo wall-clock duration.
|
||||
let env = TestEnv::lexical_only();
|
||||
let r_none = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
let r_none =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, None)
|
||||
.unwrap();
|
||||
assert_eq!(r_none.scanned, 3);
|
||||
assert_eq!(r_none.new, 3);
|
||||
}
|
||||
@@ -134,12 +160,77 @@ fn dropped_receiver_does_not_panic_or_fail_ingest() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let (tx, rx) = mpsc::channel::<IngestEvent>();
|
||||
drop(rx);
|
||||
let report = kebab_app::ingest_with_config_progress(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
true,
|
||||
Some(tx),
|
||||
)
|
||||
.unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_with_config_progress(env.config.clone(), env.scope(), true, Some(tx))
|
||||
.unwrap();
|
||||
assert_eq!(report.scanned, 3);
|
||||
}
|
||||
|
||||
/// v0.20.0 sub-item 1: verifies that `PdfOcrStarted` and `PdfOcrFinished`
/// events are emitted when a PDF asset is ingested with OCR enabled.
/// Depends on a real Ollama instance, so it is `#[ignore]` by default.
///
/// Manual invoke:
/// ```text
/// KEBAB_PDF_OCR_ENABLED=true \
/// KEBAB_PDF_OCR_ENDPOINT=http://192.168.0.47:11434 \
/// cargo test -p kebab-app --test ingest_progress \
///   --ignored pdf_ocr_progress_emits_started_finished_events
/// ```
#[test]
#[ignore = "real Ollama dependency — manual invoke via KEBAB_PDF_OCR_ENABLED=true"]
fn pdf_ocr_progress_emits_started_finished_events() {
    // Copy the F1 fixture (DCTDecode JPEG passthrough) into a workspace
    // inside a temporary directory.
    let tmpdir = tempfile::tempdir().expect("create tmpdir");
    let workspace = tmpdir.path().join("workspace");
    std::fs::create_dir_all(&workspace).expect("create workspace dir");

    let fixture_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf");
    let fixture_bytes = std::fs::read(&fixture_path).expect("F1 fixture present");
    std::fs::write(workspace.join("page1.pdf"), &fixture_bytes).expect("copy F1");

    let data_dir = tmpdir.path().join("data");
    std::fs::create_dir_all(&data_dir).expect("create data dir");

    // Default configuration with embedding off and PDF OCR on; the endpoint
    // is overridden only when KEBAB_PDF_OCR_ENDPOINT is set.
    let mut config = kebab_config::Config::defaults();
    config.workspace.root = workspace.to_string_lossy().into_owned();
    config.storage.data_dir = data_dir.to_string_lossy().into_owned();
    config.models.embedding.provider = String::from("none");
    config.models.embedding.dimensions = 0;
    config.ingest.pdf.ocr.enabled = true;
    if let Ok(endpoint) = std::env::var("KEBAB_PDF_OCR_ENDPOINT") {
        config.ingest.pdf.ocr.endpoint = Some(endpoint);
    }

    let scope = kebab_core::SourceScope {
        root: workspace.clone(),
        ..Default::default()
    };

    let (tx, rx) = mpsc::channel::<IngestEvent>();
    let _report = kebab_app::ingest_with_config_progress(config, scope, false, Some(tx))
        .expect("ingest_with_config_progress");

    // The sender was moved into the ingest call, so draining the receiver
    // ends once every sender has been dropped. Tally the two OCR event kinds.
    let mut started_count = 0usize;
    let mut finished_count = 0usize;
    for event in rx.iter() {
        match event {
            IngestEvent::PdfOcrStarted { .. } => started_count += 1,
            IngestEvent::PdfOcrFinished { .. } => finished_count += 1,
            _ => {}
        }
    }

    assert!(
        started_count >= 1,
        "PdfOcrStarted 가 ≥ 1 emit 됨 (got {started_count})"
    );
    assert!(
        finished_count >= 1,
        "PdfOcrFinished 가 ≥ 1 emit 됨 (got {finished_count})"
    );
    assert_eq!(
        started_count, finished_count,
        "Started 와 Finished 의 count 일치"
    );
}
|
||||
|
||||
@@ -29,13 +29,15 @@ fn ingest_stdin_writes_frontmatter_and_reports_new() {
|
||||
"## Body content\n\nMore.",
|
||||
"Article X",
|
||||
Some("https://example.com/x"),
|
||||
).unwrap();
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(report.new, 1, "{report:?}");
|
||||
|
||||
// _external/ contains exactly one .md file with frontmatter.
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
assert_eq!(entries.len(), 1);
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
@@ -50,17 +52,14 @@ fn ingest_stdin_without_source_uri() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let cfg = fresh_cfg(dir.path());
|
||||
|
||||
let report = kebab_app::ingest_stdin_with_config(
|
||||
cfg.clone(),
|
||||
"## Body",
|
||||
"Title",
|
||||
None,
|
||||
).unwrap();
|
||||
let report =
|
||||
kebab_app::ingest_stdin_with_config(cfg.clone(), "## Body", "Title", None).unwrap();
|
||||
assert_eq!(report.new, 1);
|
||||
|
||||
let ext_dir = std::path::PathBuf::from(&cfg.workspace.root).join("_external");
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir).unwrap()
|
||||
.filter_map(|e| e.ok())
|
||||
let entries: Vec<_> = fs::read_dir(&ext_dir)
|
||||
.unwrap()
|
||||
.filter_map(std::result::Result::ok)
|
||||
.collect();
|
||||
let content = fs::read_to_string(entries[0].path()).unwrap();
|
||||
assert!(content.contains("title: \"Title\""));
|
||||
|
||||
@@ -17,9 +17,8 @@ fn init_workspace_header_lists_supported_extensions() {
|
||||
}
|
||||
kebab_app::init_workspace(true).expect("init_workspace");
|
||||
let cfg_path = kebab_config::Config::xdg_config_path();
|
||||
let body = std::fs::read_to_string(&cfg_path).unwrap_or_else(|e| {
|
||||
panic!("read config at {}: {e}", cfg_path.display())
|
||||
});
|
||||
let body = std::fs::read_to_string(&cfg_path)
|
||||
.unwrap_or_else(|e| panic!("read config at {}: {e}", cfg_path.display()));
|
||||
assert!(
|
||||
body.contains("처리 가능한 형식"),
|
||||
"header lists supported types section: body=\n{body}"
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
//! Bug #3 regression: multi-scanned PDF ingest must produce globally unique chunk_ids.
|
||||
//! v0.20.0 sub-item 1 bugfix.
|
||||
//!
|
||||
//! Strategy: helper-level chain test (apply_ocr_to_pdf_pages → PdfPageV1Chunker).
|
||||
//! Facade mock injection is unavailable (kebab-app hardcodes OllamaVisionOcr), so
|
||||
//! this test covers the full OCR→chunk pipeline with real PDF fixtures + MockOcrEngine,
|
||||
//! adding value beyond kebab-chunk unit test B5 (which tests PdfPageV1Chunker alone).
|
||||
|
||||
mod common;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_chunk::PdfPageV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetStorage, Checksum, ChunkPolicy, Chunker, ExtractConfig, ExtractContext, Extractor,
|
||||
MediaType, RawAsset, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_image::OcrEngine;
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn make_pdf_asset(path: &str, hash_char: char, byte_len: u64) -> RawAsset {
|
||||
let fake_hash: String = hash_char.to_string().repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type: MediaType::Pdf,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_and_ocr(
|
||||
bytes: &[u8],
|
||||
path: &str,
|
||||
hash_char: char,
|
||||
engine: &dyn OcrEngine,
|
||||
) -> kebab_core::CanonicalDocument {
|
||||
let asset = make_pdf_asset(path, hash_char, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
let mut canonical = PdfTextExtractor::new().extract(&ctx, bytes).unwrap();
|
||||
let opts = PdfOcrOpts {
|
||||
enabled: true,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
};
|
||||
apply_ocr_to_pdf_pages(&mut canonical, engine, bytes, &opts, |_| {}).unwrap();
|
||||
canonical
|
||||
}
|
||||
|
||||
/// Bug #3 regression: chunking two OCR'd scanned PDFs must yield chunk ids
/// that are unique across both documents.
#[test]
fn multi_scanned_pdf_ingest_no_chunk_id_collision() {
    let f1_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
        .expect("F1 fixture missing");
    let f2_bytes = std::fs::read("../kebab-parse-pdf/tests/fixtures/scanned_page2.pdf")
        .expect("F2 fixture missing");

    // Bug #3 trigger shape: 10-char early segment + ". " + 500-char tail.
    // byte_len = 10*3 + 2 + 500*3 = 1532 > target_bytes=1500 → multi-chunk.
    // overlap_bytes = min(240, 750) = 240 / chars=80 → without the fix the
    // second chunk's actual_start collapses to prev_min=0, giving the same
    // #c0 suffix and therefore a chunk_id collision.
    let early_segment = "가".repeat(10);
    let long_tail = "나".repeat(500);
    let trigger_text = format!("{}. {}", early_segment, long_tail);

    let f1_engine = MockOcrEngine::single("F1 mock OCR page text", false);
    let f2_engine = MockOcrEngine::single(&trigger_text, false);

    let f1_canonical = extract_and_ocr(&f1_bytes, "page1.pdf", '1', &f1_engine);
    let f2_canonical = extract_and_ocr(&f2_bytes, "page2.pdf", '2', &f2_engine);

    let chunk_policy = ChunkPolicy {
        target_tokens: 500,
        overlap_tokens: 80,
        respect_markdown_headings: false,
        chunker_version: PdfPageV1Chunker.chunker_version(),
    };
    let chunk_document = |document: &kebab_core::CanonicalDocument| {
        PdfPageV1Chunker.chunk(document, &chunk_policy).unwrap()
    };
    let f1_chunks = chunk_document(&f1_canonical);
    let f2_chunks = chunk_document(&f2_canonical);

    assert!(
        f2_chunks.len() >= 2,
        "F2 trigger text must produce ≥2 chunks for the collision to be possible; got {}",
        f2_chunks.len()
    );

    // Count every chunk id while collecting the distinct ones; the two
    // numbers differ exactly when some id occurs more than once.
    let mut unique: HashSet<&str> = HashSet::new();
    let mut total = 0usize;
    for chunk in f1_chunks.iter().chain(f2_chunks.iter()) {
        unique.insert(chunk.chunk_id.0.as_str());
        total += 1;
    }
    assert_eq!(
        unique.len(),
        total,
        "all chunk_ids must be globally unique across F1 + F2 ({} unique vs {} total — collision detected)",
        unique.len(),
        total,
    );
}
|
||||
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
156
crates/kebab-app/tests/ocr_inspect_smoke.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
//! Integration smoke tests for `kebab inspect ocr-stats / ocr-failures`.
|
||||
//! AC-4, AC-5, AC-6, AC-11 (ocr_inspect_smoke binary), AC-13.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::App;
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
|
||||
/// Insert synthetic pdf_ocr_events rows directly so the test runs without
|
||||
/// a live Ollama endpoint.
|
||||
fn seed_ocr_events(env: &TestEnv, store: &SqliteStore) {
|
||||
// Success rows
|
||||
for i in 0..3u32 {
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-aaa",
|
||||
&format!("2026-05-28T0{i}:00:00Z"),
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
i + 1,
|
||||
Some(50_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
100 + u64::from(i) * 20,
|
||||
42,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed success row");
|
||||
}
|
||||
// Failure row
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-bbb",
|
||||
"2026-05-28T10:00:00Z",
|
||||
Some("doc-abc"),
|
||||
"path/scanned.pdf",
|
||||
4,
|
||||
Some(30_000),
|
||||
Some(200),
|
||||
Some(150),
|
||||
9999,
|
||||
0,
|
||||
false,
|
||||
Some("ocr_error"),
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed failure row");
|
||||
// Row for different doc
|
||||
store
|
||||
.record_pdf_ocr_event(
|
||||
"run-ccc",
|
||||
"2026-05-28T11:00:00Z",
|
||||
Some("doc-xyz"),
|
||||
"path/other.pdf",
|
||||
1,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
200,
|
||||
10,
|
||||
true,
|
||||
None,
|
||||
"qwen2.5vl",
|
||||
)
|
||||
.expect("seed doc-xyz row");
|
||||
// Trigger migration (no-op if already done via App::open_with_config)
|
||||
let _ = env;
|
||||
}
|
||||
|
||||
fn open_app_with_seeded_events(env: &TestEnv) -> App {
|
||||
let app = env.app();
|
||||
let store = SqliteStore::open(&env.config).expect("open store for seed");
|
||||
store.run_migrations().expect("run migrations for seed");
|
||||
seed_ocr_events(env, &store);
|
||||
app
|
||||
}
|
||||
|
||||
/// AC-4: with seeded events, `inspect_ocr_stats` reports
/// `schema_version = "ocr_stats.v1"`, at least one event and one run, a
/// `success_rate` within `[0, 1]`, and a non-empty `by_engine` breakdown.
#[test]
fn ocr_stats_after_seeded_events() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let stats = app.inspect_ocr_stats().expect("inspect_ocr_stats");

    assert_eq!(stats.schema_version, "ocr_stats.v1");

    // Counters: the seed data guarantees at least one event and one run.
    assert!(stats.total_events >= 1, "total_events should be >= 1");
    assert!(stats.total_runs >= 1, "total_runs should be >= 1");

    // The rate is a fraction; a NaN fails this check as well.
    let rate_in_unit_interval = stats.success_rate >= 0.0 && stats.success_rate <= 1.0;
    assert!(
        rate_in_unit_interval,
        "success_rate must be in [0, 1]: {}",
        stats.success_rate
    );

    // At least one engine must be listed.
    let has_engine_entries = !stats.by_engine.is_empty();
    assert!(has_engine_entries, "by_engine must be non-empty");
}
|
||||
|
||||
/// AC-6: without a doc_id filter, `inspect_ocr_failures` returns the failures
/// recorded across the whole corpus.
#[test]
fn ocr_failures_corpus_wide() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    // `None` = no document filter; 10 is the second argument the original
    // call passes (presumably a row limit — confirm against the API).
    let report = app
        .inspect_ocr_failures(None, 10)
        .expect("inspect_ocr_failures");

    assert_eq!(report.schema_version, "ocr_failures.v1");
    assert!(report.failure_count >= 1, "expected at least 1 failure");

    let has_rows = !report.failures.is_empty();
    assert!(has_rows, "failures list must be non-empty");
}
|
||||
|
||||
/// AC-5: `inspect_ocr_failures` with a doc_id filter echoes the filter back
/// and only returns rows whose reason is `ocr_error`.
#[test]
fn ocr_failures_filter_by_doc_id() {
    let env = TestEnv::lexical_only();
    let app = open_app_with_seeded_events(&env);

    let report = app
        .inspect_ocr_failures(Some("doc-abc"), 10)
        .expect("inspect_ocr_failures by doc_id");

    assert_eq!(report.schema_version, "ocr_failures.v1");

    let echoed = report.doc_id.as_deref();
    assert_eq!(echoed, Some("doc-abc"), "doc_id must be echoed back");

    // Every returned row is expected to be the seeded doc-abc failure
    // (reason = ocr_error).
    // NOTE(review): only `reason` is inspected here, and the check passes
    // trivially when `failures` is empty — confirm whether the row exposes a
    // doc id that could be asserted directly to prove there is no cross-doc leak.
    report
        .failures
        .iter()
        .for_each(|failure| assert_eq!(failure.reason, "ocr_error"));
}
|
||||
|
||||
/// AC-13: the Claude Code integration's SKILL.md names both new wire schemas.
#[test]
fn skill_md_lists_new_schemas() {
    // Path is relative to the working directory the test binary runs in.
    let skill_md = std::fs::read_to_string("../../integrations/claude-code/kebab/SKILL.md")
        .expect("read SKILL.md");

    let required = [
        ("ocr_stats.v1", "SKILL.md must mention ocr_stats.v1"),
        ("ocr_failures.v1", "SKILL.md must mention ocr_failures.v1"),
    ];
    for (schema, complaint) in required {
        assert!(skill_md.contains(schema), "{complaint}");
    }
}
|
||||
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
81
crates/kebab-app/tests/open_with_config_nli.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
//! Tests for `App::open_with_config`'s NLI verifier construction path.
|
||||
//!
|
||||
//! Coverage:
|
||||
//! 1. `open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive` —
|
||||
//! when `rag.nli_threshold > 0` and `storage.model_dir` is unwritable,
|
||||
//! `open_with_config` returns `Err` with "OnnxNliVerifier" in the
|
||||
//! error chain.
|
||||
//! 2. `open_with_config_nli_skipped_when_threshold_zero` —
|
||||
//! same bad `model_dir`, but `rag.nli_threshold = 0.0` (gate disabled),
|
||||
//! so `OnnxNliVerifier::new` is never called and `open_with_config`
|
||||
//! succeeds.
|
||||
//!
|
||||
//! `/proc/1/root` is the init process's filesystem root; on Linux it is
|
||||
//! owned by root and not traversable by unprivileged users, making
|
||||
//! `create_dir_all` fail with `EACCES` — a reliable "unwritable path"
|
||||
//! that requires no test setup beyond the path literal.
|
||||
|
||||
use kebab_config::Config;
|
||||
|
||||
/// Return a `Config` whose `data_dir` lives in a fresh `TempDir`
|
||||
/// (so `SqliteStore::open` succeeds) and whose `model_dir` is set to
|
||||
/// `/proc/1/root` (unwritable by non-root processes on Linux).
|
||||
///
|
||||
/// The `TempDir` is returned alongside the config so the caller keeps
|
||||
/// it alive until the test completes — dropping it early would delete
|
||||
/// the data directory before any assertions run.
|
||||
fn config_with_unwritable_model_dir() -> (tempfile::TempDir, Config) {
|
||||
let tmp = tempfile::tempdir().expect("tempdir");
|
||||
let mut cfg = Config::defaults();
|
||||
// Valid data_dir → SqliteStore::open + run_migrations succeed.
|
||||
cfg.storage.data_dir = tmp.path().to_string_lossy().into_owned();
|
||||
// /proc/1/root is only accessible to root; create_dir_all will
|
||||
// return EACCES for any unprivileged user, which is exactly the
|
||||
// failure mode we want to exercise.
|
||||
cfg.storage.model_dir = "/proc/1/root".to_string();
|
||||
(tmp, cfg)
|
||||
}
|
||||
|
||||
// ── 1. Failure path: threshold > 0 + unwritable model_dir ─────────────────
|
||||
|
||||
/// With the NLI gate enabled (`nli_threshold = 0.5`) and an unwritable
/// `model_dir`, `App::open_with_config` must fail and the debug-formatted
/// error must name `OnnxNliVerifier`.
#[test]
fn open_with_config_nli_fails_when_model_dir_unwritable_and_threshold_positive() {
    let (_tmp, mut cfg) = config_with_unwritable_model_dir();
    cfg.rag.nli_threshold = 0.5; // positive threshold switches the NLI gate on

    let err = match kebab_app::App::open_with_config(cfg) {
        Err(failure) => failure,
        Ok(_) => panic!(
            "App::open_with_config must fail when model_dir is unwritable and nli_threshold > 0"
        ),
    };

    // Operators read the debug chain in logs, so that is what gets checked.
    let err_chain = format!("{err:?}");
    let names_verifier = err_chain.contains("OnnxNliVerifier");
    assert!(
        names_verifier,
        "error chain must mention OnnxNliVerifier; full chain: {err_chain}"
    );
}
|
||||
|
||||
// ── 2. Success path: threshold = 0.0 → NLI verifier never constructed ──────
|
||||
|
||||
/// With the default `nli_threshold` of 0.0 the NLI gate is off, so the same
/// unwritable `model_dir` must not prevent `App::open_with_config` from
/// succeeding.
#[test]
fn open_with_config_nli_skipped_when_threshold_zero() {
    let (_tmp, cfg) = config_with_unwritable_model_dir();

    // Guard the premise of the test: the default threshold really is 0.0.
    let distance_from_zero = (cfg.rag.nli_threshold - 0.0).abs();
    assert!(
        distance_from_zero < f32::EPSILON,
        "precondition: default nli_threshold must be 0.0 (gate disabled)"
    );

    // The bad model_dir is irrelevant while the gate is disabled.
    let outcome = kebab_app::App::open_with_config(cfg);
    assert!(
        outcome.is_ok(),
        "App::open_with_config must succeed when nli_threshold = 0.0 \
         (OnnxNliVerifier is never constructed); err: {:?}",
        outcome.err()
    );
}
|
||||
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
358
crates/kebab-app/tests/pdf_ocr_apply.rs
Normal file
@@ -0,0 +1,358 @@
|
||||
//! Integration tests for pdf_ocr_apply helper. spec §5.5 MockOcrEngine pattern.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use common::mock_ocr::MockOcrEngine;
|
||||
use kebab_app::pdf_ocr_apply::{PdfOcrOpts, apply_ocr_to_pdf_pages};
|
||||
use kebab_core::{
|
||||
AssetStorage, Block, CanonicalDocument, Checksum, ExtractConfig, ExtractContext, Extractor,
|
||||
Inline, Lang, MediaType, RawAsset, SourceSpan, SourceUri, WorkspacePath, id_for_asset,
|
||||
};
|
||||
use kebab_parse_pdf::PdfTextExtractor;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── Fixture helpers ───────────────────────────────────────────────────────
|
||||
|
||||
/// Reads the F1 fixture (`scanned_page1.pdf` from the `kebab-parse-pdf`
/// test fixtures) into memory; panics if the file cannot be read.
fn f1_pdf_bytes() -> Vec<u8> {
    let fixture = "../kebab-parse-pdf/tests/fixtures/scanned_page1.pdf";
    std::fs::read(fixture).expect("F1 fixture missing")
}
|
||||
|
||||
fn make_raw_asset(path: &str, media_type: MediaType, byte_len: u64) -> RawAsset {
|
||||
let fake_hash = "0".repeat(64);
|
||||
let asset_id = id_for_asset(&fake_hash);
|
||||
RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(PathBuf::from(path)),
|
||||
workspace_path: WorkspacePath::new(path.to_string()).unwrap(),
|
||||
media_type,
|
||||
byte_len,
|
||||
checksum: Checksum(fake_hash.clone()),
|
||||
discovered_at: OffsetDateTime::UNIX_EPOCH,
|
||||
stored: AssetStorage::Copied {
|
||||
path: PathBuf::from(path),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a CanonicalDocument from raw PDF bytes using PdfTextExtractor.
|
||||
/// F1 (scanned) returns an empty-text Block::Paragraph per page.
|
||||
fn extract_canonical_from_bytes(bytes: &[u8]) -> CanonicalDocument {
|
||||
let asset = make_raw_asset("test.pdf", MediaType::Pdf, bytes.len() as u64);
|
||||
let workspace_root = Path::new("/");
|
||||
let config = ExtractConfig::default();
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root,
|
||||
config: &config,
|
||||
};
|
||||
PdfTextExtractor::new().extract(&ctx, bytes).unwrap()
|
||||
}
|
||||
|
||||
/// F1 bytes → canonical with 1 empty Block::Paragraph for page 1.
|
||||
fn canonical_with_empty_block() -> CanonicalDocument {
|
||||
extract_canonical_from_bytes(&f1_pdf_bytes())
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by `text` (high valid_ratio, chars≥20).
|
||||
fn canonical_with_filled_block(text: &str) -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let char_count = text.chars().count() as u32;
|
||||
tb.text = text.to_string();
|
||||
tb.inlines = vec![Inline::Text {
|
||||
text: text.to_string(),
|
||||
}];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
/// F1-based canonical with block text replaced by PUA codepoints (low valid_ratio).
|
||||
fn canonical_with_mojibake_block() -> CanonicalDocument {
|
||||
let mut canonical = extract_canonical_from_bytes(&f1_pdf_bytes());
|
||||
if let Some(Block::Paragraph(tb)) = canonical.blocks.first_mut() {
|
||||
let pua = "\u{E000}".repeat(25); // 25 PUA codepoints → valid_ratio ≈ 0
|
||||
let char_count = pua.chars().count() as u32;
|
||||
tb.text = pua.clone();
|
||||
tb.inlines = vec![Inline::Text { text: pua }];
|
||||
if let SourceSpan::Page { char_end, .. } = &mut tb.common.source_span {
|
||||
*char_end = Some(char_count);
|
||||
}
|
||||
}
|
||||
canonical
|
||||
}
|
||||
|
||||
fn default_opts(enabled: bool) -> PdfOcrOpts {
|
||||
PdfOcrOpts {
|
||||
enabled,
|
||||
always_on: false,
|
||||
valid_ratio_threshold: 0.5,
|
||||
min_char_count: 20,
|
||||
lang_hint: None,
|
||||
cancel: None,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Test 1: F1 + enabled=true → the empty paragraph is overwritten in place
// with the OCR engine's text.
#[test]
fn f1_input_with_ocr_enabled_replaces_empty_block() {
    let pdf = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("MOCK_OCR_TEXT", false);
    // Baseline thresholds plus a Korean language hint.
    let opts = PdfOcrOpts {
        lang_hint: Some(Lang("kor".into())),
        ..default_opts(true)
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);

    let first_para = canonical.blocks.iter().find_map(|block| {
        if let Block::Paragraph(tb) = block {
            Some(tb)
        } else {
            None
        }
    });
    assert!(first_para.is_some());
    assert_eq!(first_para.unwrap().text, "MOCK_OCR_TEXT");
}
|
||||
|
||||
// Test 2: F3 vector (mock filled canonical) + enabled=true → OCR skip (needs_ocr=false).
//
// The paragraph lookup uses `expect` so that the text-preservation check
// cannot pass vacuously when no paragraph is left in the document.
#[test]
fn f3_input_with_ocr_enabled_keeps_text_detect_blocks() {
    let bytes = f1_pdf_bytes(); // reuse F1 bytes; decision is based on canonical text
    let text = "충분한 한국어 텍스트 컨텐츠입니다. This has more than twenty characters.";
    let mut canonical = canonical_with_filled_block(text);
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "vector PDF 의 OCR 호출 0");

    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("text-detect paragraph must still be present after the OCR pass");
    assert!(first_para.text.starts_with("충분한"), "원본 text 보존");
}
|
||||
|
||||
// Test 3: F1 + enabled=false → the helper is a no-op: no pages OCR'd and
// no time accounted.
#[test]
fn f1_input_with_ocr_disabled_keeps_empty_block() {
    let pdf = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("IGNORED", false);
    let disabled = default_opts(false);

    let outcome = apply_ocr_to_pdf_pages(&mut canonical, &engine, &pdf, &disabled, |_| {});
    let summary = outcome.unwrap();

    assert_eq!(summary.pages_ocrd, 0);
    assert_eq!(summary.ms_total, 0);
}
|
||||
|
||||
// Test 4: mojibake canonical (PUA chars) + enabled=true → in-place mutate.
//
// The paragraph lookup uses `expect` so that the replacement-text check
// cannot pass vacuously when no paragraph is found.
#[test]
fn f4_input_with_ocr_enabled_replaces_mojibake_block() {
    let bytes = f1_pdf_bytes(); // F1 bytes carry DCTDecode image
    let mut canonical = canonical_with_mojibake_block();
    let engine = MockOcrEngine::single("OCR_MOJIBAKE_REPLACEMENT", false);
    // Identical to the shared baseline: enabled, not always-on, 0.5 / 20.
    let opts = default_opts(true);

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &bytes, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1, "mojibake page 의 OCR 호출");

    let first_para = canonical
        .blocks
        .iter()
        .find_map(|b| match b {
            Block::Paragraph(tb) => Some(tb),
            _ => None,
        })
        .expect("paragraph must be present after in-place OCR replacement");
    assert_eq!(first_para.text, "OCR_MOJIBAKE_REPLACEMENT");
}
|
||||
|
||||
// Test 5: filled canonical + always_on=true → dual-block: the original
// text-detect paragraph stays and one OCR paragraph is added.
#[test]
fn f3_input_with_always_on_pushes_dual_blocks() {
    let pdf = f1_pdf_bytes();
    let text = "vector PDF 충분한 텍스트 컨텐츠입니다. This has enough characters for valid ratio.";
    let mut canonical = canonical_with_filled_block(text);
    let blocks_before = canonical.blocks.len();
    let engine = MockOcrEngine::single("OCR_DUAL", false);
    let opts = PdfOcrOpts {
        always_on: true,
        ..default_opts(true)
    };

    let summary = apply_ocr_to_pdf_pages(&mut canonical, &engine, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 1);
    assert_eq!(
        canonical.blocks.len(),
        blocks_before + 1,
        "always_on 시 새 Block::Paragraph push"
    );

    // Collect every paragraph's text so both blocks can be looked up.
    let mut texts: Vec<&str> = Vec::new();
    for block in &canonical.blocks {
        if let Block::Paragraph(tb) = block {
            texts.push(tb.text.as_str());
        }
    }

    let has_ocr_block = texts.iter().any(|t| *t == "OCR_DUAL");
    assert!(has_ocr_block, "OCR block 포함");

    let has_original = texts.iter().any(|t| t.starts_with("vector"));
    assert!(has_original, "원본 text-detect block 보존");
}
|
||||
|
||||
// Test 6: F6 FlateDecode → extract_dctdecode_page_image=None → the page is
// skipped and a Warning provenance event is recorded.
#[test]
fn f6_flatedecode_skipped_with_warning() {
    let flate_pdf = std::fs::read("../kebab-parse-pdf/tests/fixtures/flate_raw.pdf")
        .expect("F6 fixture missing");
    // The canonical still comes from F1, so it has an empty page-1 block.
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary =
        apply_ocr_to_pdf_pages(&mut canonical, &engine, &flate_pdf, &opts, |_| {}).unwrap();

    assert_eq!(
        summary.pages_ocrd, 0,
        "FlateDecode page 는 skip (DCTDecode-only v1 invariant)"
    );

    let warned = canonical
        .provenance
        .events
        .iter()
        .any(|event| event.kind == kebab_core::ProvenanceKind::Warning);
    assert!(warned, "FlateDecode skip 시 Warning event 발행");
}
|
||||
|
||||
// Test 7: F7 CCITTFax → the page is skipped and a Warning provenance event
// is recorded (verifier M-4 split).
#[test]
fn f7_ccittfax_skipped_with_warning() {
    let ccitt_pdf =
        std::fs::read("../kebab-parse-pdf/tests/fixtures/ccitt.pdf").expect("F7 fixture missing");
    // The canonical still comes from F1, so it has an empty page-1 block.
    let mut canonical = canonical_with_empty_block();
    let engine = MockOcrEngine::single("SHOULD_NOT_BE_CALLED", false);
    let opts = default_opts(true);

    let summary =
        apply_ocr_to_pdf_pages(&mut canonical, &engine, &ccitt_pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "CCITTFax page 는 skip");

    let warned = canonical
        .provenance
        .events
        .iter()
        .any(|event| event.kind == kebab_core::ProvenanceKind::Warning);
    assert!(warned, "CCITTFax skip 시 Warning event 발행");
}
|
||||
|
||||
// Test 8: a failing OCR engine does not abort the call; the page is not
// counted and the engine's error text lands in a Warning event's note.
#[test]
fn ocr_engine_failure_surfaces_as_warning() {
    let pdf = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    // Second argument `true` configures the mock to fail.
    let failing_engine = MockOcrEngine::single("", true);
    let opts = default_opts(true);

    let summary =
        apply_ocr_to_pdf_pages(&mut canonical, &failing_engine, &pdf, &opts, |_| {}).unwrap();

    assert_eq!(summary.pages_ocrd, 0, "OCR failure 시 pages_ocrd=0");

    let mut warning_with_failure = false;
    for event in &canonical.provenance.events {
        let is_warning = event.kind == kebab_core::ProvenanceKind::Warning;
        let mentions_failure = event
            .note
            .as_deref()
            .map_or(false, |note| note.contains("mock failure"));
        if is_warning && mentions_failure {
            warning_with_failure = true;
            break;
        }
    }
    assert!(
        warning_with_failure,
        "OCR failure 의 error message 가 warning event 의 note 안"
    );
}
|
||||
|
||||
// Test 9: always_on on a 1-page PDF yields exactly two paragraphs
// (text-detect + OCR), both spanning page 1.
#[test]
fn dual_block_ordinals_are_deterministic_and_unique() {
    let pdf = f1_pdf_bytes(); // 1-page PDF → page_count=1
    let text = "vector 충분한 텍스트. This text has more than twenty characters total.";
    let mut canonical = canonical_with_filled_block(text);
    let engine = MockOcrEngine::single("DUAL", false);
    let opts = PdfOcrOpts {
        always_on: true,
        ..default_opts(true)
    };

    apply_ocr_to_pdf_pages(&mut canonical, &engine, &pdf, &opts, |_| {}).unwrap();

    // page_count=1 → text-detect ordinal=0, ocr ordinal=1 (page_num-1 + page_count = 0+1=1)
    // NOTE(review): the ordinals themselves are not inspected below; only the
    // paragraph count and page numbers are.
    let paragraph_spans: Vec<&SourceSpan> = canonical
        .blocks
        .iter()
        .filter_map(|block| {
            if let Block::Paragraph(tb) = block {
                Some(&tb.common.source_span)
            } else {
                None
            }
        })
        .collect();

    assert_eq!(paragraph_spans.len(), 2, "dual-block: text-detect + OCR");

    let all_page_1 = paragraph_spans
        .iter()
        .copied()
        .all(|span| matches!(span, SourceSpan::Page { page: 1, .. }));
    assert!(all_page_1, "두 block 모두 page=1");
}
|
||||
|
||||
// Test 10: a cancel flag that is already set makes the helper return an
// error whose message contains "cancelled mid-PDF".
#[test]
fn cancel_handle_aborts_mid_pdf() {
    let pdf = f1_pdf_bytes();
    let mut canonical = canonical_with_empty_block();
    let cancel = Arc::new(AtomicBool::new(true)); // pre-cancel
    let engine = MockOcrEngine::single("IGNORED", false);
    let opts = PdfOcrOpts {
        cancel: Some(Arc::clone(&cancel)),
        ..default_opts(true)
    };

    let err = apply_ocr_to_pdf_pages(&mut canonical, &engine, &pdf, &opts, |_| {})
        .expect_err("cancel=true 시 error 반환");

    let rendered = err.to_string();
    assert!(
        rendered.contains("cancelled mid-PDF"),
        "error message 가 'cancelled mid-PDF' 포함: {err}"
    );
}
|
||||
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
139
crates/kebab-app/tests/pdf_ocr_events_insert_smoke.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
//! Integration smoke test: dual-write (ndjson + SQLite) for PDF OCR events.
|
||||
//! AC-3: SQLite row count and doc_id matches ndjson LogEvent::Ocr.
|
||||
//!
|
||||
//! Uses wiremock to stub the Ollama `/api/generate` endpoint so the test
|
||||
//! runs without a live Ollama instance.
|
||||
|
||||
mod common;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_config::LoggingCfg;
|
||||
use serde_json::Value;
|
||||
use tokio::task::spawn_blocking;
|
||||
use wiremock::matchers::{method, path};
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
fn scanned_pdf_src() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.join("kebab-parse-pdf/tests/fixtures/scanned_page1.pdf")
|
||||
}
|
||||
|
||||
/// AC-3: ndjson OCR line count == pdf_ocr_events row count, and doc_id matches.
|
||||
#[tokio::test]
|
||||
async fn ingest_dual_write_doc_id_matches_ndjson() {
|
||||
let src = scanned_pdf_src();
|
||||
if !src.exists() {
|
||||
eprintln!("skipping test: scanned_page1.pdf fixture not found");
|
||||
return;
|
||||
}
|
||||
|
||||
let server = MockServer::start().await;
|
||||
// Stub Ollama /api/generate to return a minimal OCR response.
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/api/generate"))
|
||||
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
|
||||
"model": "qwen2.5vl:3b",
|
||||
"response": "test ocr output",
|
||||
"done": true,
|
||||
"done_reason": "stop"
|
||||
})))
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let mock_url = server.uri();
|
||||
|
||||
let result = spawn_blocking(move || {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
// Enable PDF OCR + set up mock endpoint
|
||||
env.config.ingest.pdf.ocr.enabled = true;
|
||||
env.config.ingest.pdf.ocr.endpoint = Some(mock_url.clone());
|
||||
env.config.ingest.pdf.ocr.model = "qwen2.5vl:3b".to_string();
|
||||
// Enable ingest log
|
||||
let log_dir = env.temp.path().join("logs");
|
||||
std::fs::create_dir_all(&log_dir).unwrap();
|
||||
env.config.logging = LoggingCfg {
|
||||
ingest_log_enabled: true,
|
||||
ingest_log_dir: log_dir.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Copy scanned PDF into workspace
|
||||
let dest = env.workspace_root.join("scanned.pdf");
|
||||
std::fs::copy(scanned_pdf_src(), &dest).expect("copy scanned PDF");
|
||||
|
||||
// Run ingest
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest");
|
||||
|
||||
// Read ndjson log
|
||||
let log_files: Vec<_> = std::fs::read_dir(&log_dir)
|
||||
.unwrap()
|
||||
.filter_map(Result::ok)
|
||||
.filter(|e| {
|
||||
let name = e.file_name().to_string_lossy().to_string();
|
||||
name.starts_with("ingest-") && name.ends_with(".ndjson")
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(log_files.len(), 1, "expected 1 ndjson log file");
|
||||
|
||||
let body = std::fs::read_to_string(log_files[0].path()).unwrap();
|
||||
let ocr_lines: Vec<Value> = body
|
||||
.lines()
|
||||
.filter_map(|l| serde_json::from_str(l).ok())
|
||||
.filter(|v: &Value| v.get("kind").and_then(Value::as_str) == Some("ocr"))
|
||||
.collect();
|
||||
|
||||
// Read pdf_ocr_events from SQLite
|
||||
let db_path = PathBuf::from(&env.config.storage.data_dir).join("kebab.sqlite");
|
||||
let conn = rusqlite::Connection::open(&db_path).expect("open db");
|
||||
let rows: Vec<(Option<String>, String)> = {
|
||||
let mut stmt = conn
|
||||
.prepare("SELECT doc_id, doc_path FROM pdf_ocr_events ORDER BY id")
|
||||
.expect("prepare");
|
||||
stmt.query_map([], |r| Ok((r.get(0)?, r.get(1)?)))
|
||||
.expect("query")
|
||||
.map(|r| r.expect("row"))
|
||||
.collect()
|
||||
};
|
||||
|
||||
(ocr_lines, rows)
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
|
||||
let (ocr_lines, rows) = result;
|
||||
|
||||
// At least one OCR event must be produced
|
||||
assert!(!ocr_lines.is_empty(), "expected ≥1 ndjson ocr line");
|
||||
assert!(!rows.is_empty(), "expected ≥1 pdf_ocr_events row");
|
||||
|
||||
// Row counts must match
|
||||
assert_eq!(
|
||||
ocr_lines.len(),
|
||||
rows.len(),
|
||||
"ndjson ocr lines ({}) must equal pdf_ocr_events rows ({})",
|
||||
ocr_lines.len(),
|
||||
rows.len()
|
||||
);
|
||||
|
||||
// doc_id in both sources must be non-null and consistent
|
||||
for (line, (sql_doc_id, _sql_doc_path)) in ocr_lines.iter().zip(rows.iter()) {
|
||||
let json_doc_id = line.get("doc_id").and_then(Value::as_str);
|
||||
assert!(
|
||||
json_doc_id.is_some(),
|
||||
"ndjson ocr line should have doc_id: {line}"
|
||||
);
|
||||
assert!(
|
||||
sql_doc_id.is_some(),
|
||||
"pdf_ocr_events row should have doc_id"
|
||||
);
|
||||
assert_eq!(
|
||||
json_doc_id,
|
||||
sql_doc_id.as_deref(),
|
||||
"ndjson doc_id must equal SQLite doc_id"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -46,17 +46,13 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
operations: vec![
|
||||
Operation::new("BT", vec![]),
|
||||
Operation::new("Tf", vec!["F1".into(), 24.into()]),
|
||||
Operation::new(
|
||||
"Td",
|
||||
vec![Object::Integer(100), Object::Integer(700)],
|
||||
),
|
||||
Operation::new("Td", vec![Object::Integer(100), Object::Integer(700)]),
|
||||
Operation::new("Tj", vec![Object::string_literal(*text)]),
|
||||
Operation::new("ET", vec![]),
|
||||
],
|
||||
};
|
||||
let stream_data = content.encode().expect("content encode");
|
||||
let content_id =
|
||||
doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
let content_id = doc.add_object(Stream::new(dictionary! {}, stream_data));
|
||||
page_dict.set("Contents", content_id);
|
||||
}
|
||||
let page_id = doc.add_object(page_dict);
|
||||
@@ -76,8 +72,7 @@ fn build_text_pdf(pages: &[Option<&str>]) -> Vec<u8> {
|
||||
Object::Integer(842),
|
||||
],
|
||||
};
|
||||
doc.objects
|
||||
.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
@@ -126,8 +121,8 @@ fn cfg_with_pdf(env: &TestEnv) -> Config {
|
||||
// PDF ingest does not need OCR / caption / LM — leave defaults
|
||||
// (ocr.enabled=false, caption.enabled=false). The image pipeline
|
||||
// construction step skips both adapters.
|
||||
cfg.image.ocr.enabled = false;
|
||||
cfg.image.caption.enabled = false;
|
||||
cfg.ingest.image.ocr.enabled = false;
|
||||
cfg.ingest.image.caption.enabled = false;
|
||||
cfg
|
||||
}
|
||||
|
||||
@@ -146,9 +141,8 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
write_pdf(&env.workspace_root, "three.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false)
|
||||
.expect("PDF ingest must succeed");
|
||||
|
||||
assert_eq!(report.errors, 0);
|
||||
let items = report.items.as_ref().expect("items present");
|
||||
@@ -157,23 +151,30 @@ fn ingest_3_page_pdf_produces_one_doc_and_per_page_chunks() {
|
||||
.find(|i| i.doc_path.0.ends_with("three.pdf"))
|
||||
.expect("PDF item present");
|
||||
assert_eq!(pdf_item.kind, IngestItemKind::New);
|
||||
assert_eq!(pdf_item.block_count, Some(3), "one Block::Paragraph per page");
|
||||
assert_eq!(pdf_item.chunk_count, Some(3), "one chunk per non-empty page");
|
||||
assert_eq!(
|
||||
pdf_item.parser_version.as_ref().map(|p| p.0.as_str()),
|
||||
pdf_item.block_count,
|
||||
Some(3),
|
||||
"one Block::Paragraph per page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(3),
|
||||
"one chunk per non-empty page"
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.parser_version
|
||||
.as_ref()
|
||||
.map(|p| p.0.split('|').next().unwrap()),
|
||||
Some("pdf-text-v1")
|
||||
);
|
||||
assert_eq!(
|
||||
pdf_item.chunker_version.as_ref().map(|c| c.0.as_str()),
|
||||
Some("pdf-page-v1")
|
||||
Some("pdf-page-v1.1")
|
||||
);
|
||||
|
||||
// Inspect the stored doc to confirm SourceSpan::Page round-trip.
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap())
|
||||
.expect("inspect_doc returns the PDF document");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for (i, block) in doc.blocks.iter().enumerate() {
|
||||
let want_page = (i as u32) + 1;
|
||||
@@ -202,8 +203,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
write_pdf(&env.workspace_root, "stable.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item1 = report1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -214,8 +214,7 @@ fn re_ingest_identical_pdf_produces_unchanged_with_same_doc_id() {
|
||||
.unwrap();
|
||||
assert_eq!(item1.kind, IngestItemKind::New);
|
||||
|
||||
let report2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item2 = report2
|
||||
.items
|
||||
.unwrap()
|
||||
@@ -239,8 +238,7 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
std::fs::write(&path, &bytes_v1).unwrap();
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report_v1 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v1 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let id_v1 = report_v1
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -252,12 +250,10 @@ fn re_ingest_edited_pdf_produces_new_doc_id() {
|
||||
.clone()
|
||||
.unwrap();
|
||||
|
||||
let bytes_v2 =
|
||||
build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
let bytes_v2 = build_text_pdf(&[Some("VERSION TWO entirely different body content.")]);
|
||||
std::fs::write(&path, &bytes_v2).unwrap();
|
||||
|
||||
let report_v2 =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report_v2 = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let item_v2 = report_v2
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -282,9 +278,11 @@ fn encrypted_pdf_fails_with_qpdf_hint() {
|
||||
write_pdf(&env.workspace_root, "secret.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "encrypted PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"encrypted PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -310,9 +308,11 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
write_pdf(&env.workspace_root, "corrupt.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 1, "corrupt PDF must increment errors exactly once");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 1,
|
||||
"corrupt PDF must increment errors exactly once"
|
||||
);
|
||||
let items = report.items.as_ref().unwrap();
|
||||
let pdf_item = items
|
||||
.iter()
|
||||
@@ -322,11 +322,8 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
|
||||
// Confirm the doc was NOT stored — list_docs returns nothing for
|
||||
// this path.
|
||||
let summaries = kebab_app::list_docs_with_config(
|
||||
cfg,
|
||||
kebab_core::DocFilter::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(
|
||||
!summaries
|
||||
.iter()
|
||||
@@ -341,14 +338,15 @@ fn corrupt_pdf_fails_without_storing() {
|
||||
#[test]
|
||||
fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
let bytes = build_text_pdf(&[Some("first page"), None, Some("third page")]);
|
||||
write_pdf(&env.workspace_root, "mixed.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0, "scanned candidate is a Warning, not Error");
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(
|
||||
report.errors, 0,
|
||||
"scanned candidate is a Warning, not Error"
|
||||
);
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -365,14 +363,10 @@ fn mixed_page_pdf_stores_asset_with_scanned_candidate_warning() {
|
||||
assert_eq!(
|
||||
pdf_item.chunk_count,
|
||||
Some(2),
|
||||
"pdf-page-v1 emits 0 chunks for the empty page; total = 2"
|
||||
"pdf-page-v1.1 emits 0 chunks for the empty page; total = 2"
|
||||
);
|
||||
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
let warnings: Vec<_> = doc
|
||||
.provenance
|
||||
.events
|
||||
@@ -419,8 +413,7 @@ fn ingest_report_arithmetic_invariant_holds_with_corrupt_pdf() {
|
||||
write_pdf(&env.workspace_root, "broken.pdf", &corrupt_pdf());
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg, env.scope(), false).unwrap();
|
||||
let total = report.new + report.updated + report.skipped + report.errors;
|
||||
assert_eq!(
|
||||
report.scanned, total,
|
||||
@@ -441,14 +434,12 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
let pages: Vec<String> = (1..=50)
|
||||
.map(|i| format!("Page {i} body — lorem ipsum dolor sit amet."))
|
||||
.collect();
|
||||
let page_refs: Vec<Option<&str>> =
|
||||
pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let page_refs: Vec<Option<&str>> = pages.iter().map(|s| Some(s.as_str())).collect();
|
||||
let bytes = build_text_pdf(&page_refs);
|
||||
write_pdf(&env.workspace_root, "long.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
assert_eq!(report.errors, 0);
|
||||
let pdf_item = report
|
||||
.items
|
||||
@@ -466,8 +457,7 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
|
||||
// Round-trip: list_docs sees the long PDF.
|
||||
let summaries =
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default())
|
||||
.unwrap();
|
||||
kebab_app::list_docs_with_config(cfg, kebab_core::DocFilter::default()).unwrap();
|
||||
assert!(summaries.iter().any(|s| s.doc_path.0.ends_with("long.pdf")));
|
||||
}
|
||||
|
||||
@@ -476,13 +466,11 @@ fn long_pdf_round_trips_through_lexical_pipeline() {
|
||||
#[test]
|
||||
fn inspect_doc_surfaces_page_spans() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let bytes =
|
||||
build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
let bytes = build_text_pdf(&[Some("alpha body"), Some("beta body"), Some("gamma body")]);
|
||||
write_pdf(&env.workspace_root, "inspect.pdf", &bytes);
|
||||
let cfg = cfg_with_pdf(&env);
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let report = kebab_app::ingest_with_config(cfg.clone(), env.scope(), false).unwrap();
|
||||
let pdf_item = report
|
||||
.items
|
||||
.as_ref()
|
||||
@@ -490,19 +478,15 @@ fn inspect_doc_surfaces_page_spans() {
|
||||
.iter()
|
||||
.find(|i| i.doc_path.0.ends_with("inspect.pdf"))
|
||||
.unwrap();
|
||||
let doc = kebab_app::inspect_doc_with_config(
|
||||
cfg,
|
||||
pdf_item.doc_id.as_ref().unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.parser_version.0, "pdf-text-v1");
|
||||
let doc = kebab_app::inspect_doc_with_config(cfg, pdf_item.doc_id.as_ref().unwrap()).unwrap();
|
||||
// v0.26.2: stored parser_version is now `pdf-text-v1|<ingest-config-sig>`
|
||||
// (the signature folds chunking / pdf.ocr settings for skip detection).
|
||||
// Assert the base identity by taking the prefix before the first '|'.
|
||||
assert_eq!(doc.parser_version.0.split('|').next().unwrap(), "pdf-text-v1");
|
||||
assert_eq!(doc.blocks.len(), 3);
|
||||
for block in &doc.blocks {
|
||||
match block {
|
||||
Block::Paragraph(p) => assert!(matches!(
|
||||
p.common.source_span,
|
||||
SourceSpan::Page { .. }
|
||||
)),
|
||||
Block::Paragraph(p) => assert!(matches!(p.common.source_span, SourceSpan::Page { .. })),
|
||||
other => panic!("expected Paragraph, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
137
crates/kebab-app/tests/reset_orphans.rs
Normal file
137
crates/kebab-app/tests/reset_orphans.rs
Normal file
@@ -0,0 +1,137 @@
|
||||
//! Integration test for `kebab reset --orphans-only`.
|
||||
//!
|
||||
//! Verifies that stored docs outside the current walker scope are purged
|
||||
//! from the store without removing any files from the filesystem.
|
||||
//!
|
||||
//! Test outline:
|
||||
//! 1. Ingest 3 .rs files (a.rs, b.rs, c.rs) — all New.
|
||||
//! 2. Narrow the config `include` to `["a.rs"]` only; b.rs and c.rs are
|
||||
//! still on disk but outside the walker scope.
|
||||
//! 3. Run `execute(ResetScope::OrphansOnly, &cfg)` — report must show
|
||||
//! `orphans_purged == 2` and `purged_paths` contains b.rs + c.rs.
|
||||
//! 4. `list docs` must show only a.rs.
|
||||
//! 5. b.rs and c.rs must still exist on disk (no filesystem removal).
|
||||
//! 6. Second reset → `orphans_purged == 0` (idempotent).
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::IngestOpts;
|
||||
use kebab_app::reset::{ResetScope, execute};
|
||||
use kebab_core::{DocFilter, DocumentStore, SourceScope};
|
||||
|
||||
/// Open the SqliteStore and list all `workspace_path` values.
|
||||
fn list_doc_paths(env: &TestEnv) -> Vec<String> {
|
||||
use kebab_store_sqlite::SqliteStore;
|
||||
let store = SqliteStore::open(&env.config).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
store
|
||||
.list_documents(&DocFilter::default())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|d| d.doc_path.0)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reset_orphans_only_purges_out_of_scope_docs() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Write three .rs files into the workspace.
|
||||
let a_path = env.workspace_root.join("a.rs");
|
||||
let b_path = env.workspace_root.join("b.rs");
|
||||
let c_path = env.workspace_root.join("c.rs");
|
||||
std::fs::write(&a_path, "// file a\nfn alpha() {}\n").unwrap();
|
||||
std::fs::write(&b_path, "// file b\nfn bravo() {}\n").unwrap();
|
||||
std::fs::write(&c_path, "// file c\nfn charlie() {}\n").unwrap();
|
||||
|
||||
// Ingest all three with a wide scope.
|
||||
let wide_scope = SourceScope {
|
||||
root: env.workspace_root.clone(),
|
||||
include: vec!["**/*.rs".to_string()],
|
||||
exclude: env.config.workspace.exclude.clone(),
|
||||
};
|
||||
let first = kebab_app::ingest_with_config_opts(
|
||||
env.config.clone(),
|
||||
wide_scope,
|
||||
false,
|
||||
IngestOpts::default(),
|
||||
)
|
||||
.expect("first ingest must succeed");
|
||||
// The fixture workspace may contain other .rs files — just assert we
|
||||
// got at least 3 new docs (our a.rs, b.rs, c.rs).
|
||||
assert!(first.new >= 3, "expected at least 3 new docs: {first:?}");
|
||||
assert_eq!(first.errors, 0, "no errors on first ingest");
|
||||
|
||||
// Narrow config to include only a.rs; b.rs + c.rs are still on disk.
|
||||
let mut narrow_cfg = env.config.clone();
|
||||
narrow_cfg.workspace.exclude.clear();
|
||||
// Re-point workspace root (already correct) and restrict include via
|
||||
// the SourceScope in the connector. The config's `workspace.root` is
|
||||
// used by `enumerate_orphans` to build its scope — we keep that
|
||||
// pointing at the workspace root. We simulate narrowing by setting a
|
||||
// glob that only matches a.rs.
|
||||
//
|
||||
// NOTE: `kebab_config::WorkspaceCfg` does not have an `include` field
|
||||
// (it was removed in p9-fb-25). We narrow the scope via the walker
|
||||
// exclude list: exclude b.rs and c.rs explicitly.
|
||||
narrow_cfg.workspace.exclude = vec!["b.rs".to_string(), "c.rs".to_string()];
|
||||
|
||||
// Run orphans-only reset.
|
||||
let report =
|
||||
execute(ResetScope::OrphansOnly, &narrow_cfg).expect("orphans-only reset must succeed");
|
||||
|
||||
assert_eq!(
|
||||
report.orphans_purged, 2,
|
||||
"expected 2 orphans purged (b.rs + c.rs): {report:?}"
|
||||
);
|
||||
|
||||
let mut purged: Vec<String> = report.purged_paths.iter().map(|p| p.0.clone()).collect();
|
||||
purged.sort();
|
||||
assert_eq!(
|
||||
purged,
|
||||
vec!["b.rs".to_string(), "c.rs".to_string()],
|
||||
"purged_paths must list b.rs and c.rs in sorted order: {purged:?}"
|
||||
);
|
||||
|
||||
// list docs must show only a.rs (and any pre-existing fixture files
|
||||
// that are not excluded by the narrow config).
|
||||
let doc_paths = list_doc_paths(&env);
|
||||
// The narrow_cfg excludes b.rs + c.rs — they must no longer be in store.
|
||||
assert!(
|
||||
!doc_paths.iter().any(|p| p == "b.rs"),
|
||||
"b.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
|
||||
);
|
||||
assert!(
|
||||
!doc_paths.iter().any(|p| p == "c.rs"),
|
||||
"c.rs must be gone from store after orphans-only reset; got: {doc_paths:?}"
|
||||
);
|
||||
assert!(
|
||||
doc_paths.iter().any(|p| p == "a.rs"),
|
||||
"a.rs must still be in store; got: {doc_paths:?}"
|
||||
);
|
||||
|
||||
// Both b.rs and c.rs must still exist on the filesystem — no file
|
||||
// removal is performed by orphans-only.
|
||||
assert!(
|
||||
b_path.exists(),
|
||||
"b.rs must still be on disk after orphans-only reset"
|
||||
);
|
||||
assert!(
|
||||
c_path.exists(),
|
||||
"c.rs must still be on disk after orphans-only reset"
|
||||
);
|
||||
|
||||
// Second reset must be idempotent: nothing left to purge.
|
||||
let second = execute(ResetScope::OrphansOnly, &narrow_cfg)
|
||||
.expect("second orphans-only reset must succeed");
|
||||
assert_eq!(
|
||||
second.orphans_purged, 0,
|
||||
"second reset must be idempotent (orphans_purged == 0): {second:?}"
|
||||
);
|
||||
assert!(
|
||||
second.purged_paths.is_empty(),
|
||||
"second reset purged_paths must be empty: {:?}",
|
||||
second.purged_paths
|
||||
);
|
||||
}
|
||||
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
79
crates/kebab-app/tests/schema_active_versions.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
//! Integration tests for Bug #13: schema.v1.models.active_parsers + active_chunkers.
|
||||
|
||||
use kebab_app::schema_with_config;
|
||||
use kebab_config::Config;
|
||||
use kebab_core::SourceScope;
|
||||
|
||||
fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path) -> Config {
|
||||
let mut cfg = Config::defaults();
|
||||
cfg.workspace.root = workspace_root.to_string_lossy().into_owned();
|
||||
cfg.workspace.exclude.clear();
|
||||
cfg.storage.data_dir = data_dir.to_string_lossy().into_owned();
|
||||
cfg.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
cfg.models.embedding.provider = "none".to_string();
|
||||
cfg.models.embedding.dimensions = 0;
|
||||
cfg.ingest.chunking.target_tokens = 80;
|
||||
cfg.ingest.chunking.overlap_tokens = 20;
|
||||
cfg
|
||||
}
|
||||
|
||||
fn minimal_scope(workspace_root: &std::path::Path) -> SourceScope {
|
||||
SourceScope {
|
||||
root: workspace_root.to_path_buf(),
|
||||
include: vec![],
|
||||
exclude: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_models_active_arrays_empty_on_empty_corpus() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
|
||||
let store = kebab_store_sqlite::SqliteStore::open(&cfg).unwrap();
|
||||
store.run_migrations().unwrap();
|
||||
drop(store);
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
s.models.active_parsers.is_empty(),
|
||||
"empty corpus → no parsers"
|
||||
);
|
||||
assert!(
|
||||
s.models.active_chunkers.is_empty(),
|
||||
"empty corpus → no chunkers"
|
||||
);
|
||||
// backward compat: 기존 단일 field 는 markdown default 보존.
|
||||
assert_eq!(s.models.parser_version, kebab_parse_md::PARSER_VERSION);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn schema_emits_active_parsers_and_chunkers_array_after_ingest() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let workspace = dir.path().join("kb");
|
||||
std::fs::create_dir_all(&workspace).unwrap();
|
||||
std::fs::write(workspace.join("a.md"), "# A\nhello world\n").unwrap();
|
||||
let cfg = minimal_config(dir.path(), &workspace);
|
||||
let scope = minimal_scope(&workspace);
|
||||
|
||||
kebab_app::ingest_with_config(cfg.clone(), scope, false).unwrap();
|
||||
|
||||
let s = schema_with_config(&cfg).unwrap();
|
||||
assert!(
|
||||
!s.models.active_parsers.is_empty(),
|
||||
"active_parsers populated after ingest"
|
||||
);
|
||||
assert!(
|
||||
!s.models.active_chunkers.is_empty(),
|
||||
"active_chunkers populated after ingest"
|
||||
);
|
||||
// active arrays must be sorted (ORDER BY in SQL).
|
||||
let mut sorted = s.models.active_parsers.clone();
|
||||
sorted.sort();
|
||||
assert_eq!(
|
||||
s.models.active_parsers, sorted,
|
||||
"active_parsers must be sorted"
|
||||
);
|
||||
}
|
||||
@@ -14,8 +14,8 @@ fn minimal_config(data_dir: &std::path::Path, workspace_root: &std::path::Path)
|
||||
config.storage.model_dir = data_dir.join("models").to_string_lossy().into_owned();
|
||||
config.models.embedding.provider = "none".to_string();
|
||||
config.models.embedding.dimensions = 0;
|
||||
config.chunking.target_tokens = 80;
|
||||
config.chunking.overlap_tokens = 20;
|
||||
config.ingest.chunking.target_tokens = 80;
|
||||
config.ingest.chunking.overlap_tokens = 20;
|
||||
config
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ fn schema_report_reflects_freshly_ingested_kb() {
|
||||
schema.wire.schemas
|
||||
);
|
||||
assert!(schema.capabilities.json_mode);
|
||||
assert!(!schema.capabilities.streaming_ask);
|
||||
assert!(schema.capabilities.streaming_ask); // Bug #9: streaming_ask is now true
|
||||
assert!(
|
||||
schema.capabilities.mcp_server,
|
||||
"mcp_server should be true after fb-30",
|
||||
|
||||
175
crates/kebab-app/tests/search_budget_integration.rs
Normal file
175
crates/kebab-app/tests/search_budget_integration.rs
Normal file
@@ -0,0 +1,175 @@
|
||||
//! p9-fb-34: App::search_with_opts integration tests.
|
||||
|
||||
mod common;
|
||||
|
||||
use kebab_app::SearchResponse;
|
||||
use kebab_core::{SearchFilters, SearchMode, SearchOpts, SearchQuery};
|
||||
|
||||
fn lex(text: &str, k: usize) -> SearchQuery {
|
||||
SearchQuery {
|
||||
text: text.to_string(),
|
||||
mode: SearchMode::Lexical,
|
||||
k,
|
||||
filters: SearchFilters::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_with_opts_no_budget_matches_search() {
|
||||
let env = common::TestEnv::new();
|
||||
common::ingest_md(&env, "a.md", "# T\n\napples are red\n");
|
||||
let app = env.app();
|
||||
|
||||
let baseline = app.search(lex("apples", 5)).unwrap();
|
||||
let resp: SearchResponse = app
|
||||
.search_with_opts(lex("apples", 5), SearchOpts::default())
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(resp.hits.len(), baseline.len());
|
||||
assert!(!resp.truncated);
|
||||
assert!(
|
||||
resp.next_cursor.is_none(),
|
||||
"k=5 against 1 doc → no next page"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn budget_truncates_snippets_when_below_threshold() {
|
||||
let env = common::TestEnv::new();
|
||||
let body: String = "rust ownership is a memory model. ".repeat(10);
|
||||
common::ingest_md(&env, "a.md", &format!("# T\n\n{body}\n"));
|
||||
let app = env.app();
|
||||
|
||||
let unrestricted = app.search(lex("rust", 5)).unwrap();
|
||||
let unrestricted_chars: usize = unrestricted.iter().map(|h| h.snippet.chars().count()).sum();
|
||||
|
||||
let resp = app
|
||||
.search_with_opts(
|
||||
lex("rust", 5),
|
||||
SearchOpts {
|
||||
max_tokens: Some(50),
|
||||
snippet_chars: None,
|
||||
cursor: None,
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let limited_chars: usize = resp.hits.iter().map(|h| h.snippet.chars().count()).sum();
|
||||
|
||||
assert!(resp.truncated, "small budget must trip truncation");
|
||||
assert!(limited_chars < unrestricted_chars, "snippet should shrink");
|
||||
assert!(!resp.hits.is_empty(), "always retain ≥1 hit");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cursor_paginates_to_next_page() {
|
||||
let env = common::TestEnv::new();
|
||||
for i in 0..6 {
|
||||
common::ingest_md(
|
||||
&env,
|
||||
&format!("d{i}.md"),
|
||||
&format!("# T{i}\n\nrust topic {i}\n"),
|
||||
);
|
||||
}
|
||||
let app = env.app();
|
||||
|
||||
let page1 = app
|
||||
.search_with_opts(lex("rust", 2), SearchOpts::default())
|
||||
.unwrap();
|
||||
assert_eq!(page1.hits.len(), 2);
|
||||
let cursor = page1.next_cursor.expect("more hits available");
|
||||
|
||||
let page2 = app
|
||||
.search_with_opts(
|
||||
lex("rust", 2),
|
||||
SearchOpts {
|
||||
max_tokens: None,
|
||||
snippet_chars: None,
|
||||
cursor: Some(cursor),
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(page2.hits.len(), 2);
|
||||
let p1_ids: std::collections::HashSet<_> =
|
||||
page1.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
let p2_ids: std::collections::HashSet<_> =
|
||||
page2.hits.iter().map(|h| h.chunk_id.0.clone()).collect();
|
||||
assert!(
|
||||
p1_ids.is_disjoint(&p2_ids),
|
||||
"page 2 must not repeat page 1 hits"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cursor_rejected_after_corpus_revision_bump() {
|
||||
let env = common::TestEnv::new();
|
||||
common::ingest_md(&env, "a.md", "# T\n\napples\n");
|
||||
let app = env.app();
|
||||
|
||||
let page1 = app
|
||||
.search_with_opts(lex("apples", 1), SearchOpts::default())
|
||||
.unwrap();
|
||||
// p9-fb-34 round-1 review: replaced silent `if let Some(c) = ...`
|
||||
// with `.expect(...)` so a fixture regression that breaks the
|
||||
// cursor-emission contract fails loudly instead of passing vacuously.
|
||||
let c = page1
|
||||
.next_cursor
|
||||
.expect("k=1 page must emit next_cursor — fixture too small if this fails");
|
||||
|
||||
common::ingest_md(&env, "b.md", "# B\n\nbananas\n");
|
||||
let app2 = env.app();
|
||||
|
||||
let result = app2.search_with_opts(
|
||||
lex("apples", 1),
|
||||
SearchOpts {
|
||||
max_tokens: None,
|
||||
snippet_chars: None,
|
||||
cursor: Some(c),
|
||||
trace: false,
|
||||
},
|
||||
);
|
||||
let err = result.unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("stale_cursor"),
|
||||
"must surface stale_cursor: {err}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn max_tokens_zero_returns_one_hit_truncated() {
|
||||
// p9-fb-34 round-1 review: pin the documented "≥1 hit floor"
|
||||
// contract — even with `max_tokens=0` (an absurdly tight budget)
|
||||
// the budget loop must keep one hit and flip `truncated: true`.
|
||||
// Fixture intentionally seeds multiple matches so step 2 of the
|
||||
// budget loop (pop hits to 1) actually fires.
|
||||
let env = common::TestEnv::new();
|
||||
for i in 0..3 {
|
||||
common::ingest_md(
|
||||
&env,
|
||||
&format!("d{i}.md"),
|
||||
&format!("# T{i}\n\napples are red {i}\n"),
|
||||
);
|
||||
}
|
||||
let app = env.app();
|
||||
|
||||
let resp = app
|
||||
.search_with_opts(
|
||||
lex("apples", 5),
|
||||
SearchOpts {
|
||||
max_tokens: Some(0),
|
||||
snippet_chars: None,
|
||||
cursor: None,
|
||||
trace: false,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(resp.hits.len(), 1, "max_tokens=0 collapses to 1-hit floor");
|
||||
assert!(resp.truncated);
|
||||
// p9-fb-34 R2: cursor IS emitted on k-pop case so the popped
|
||||
// hits remain reachable.
|
||||
assert!(
|
||||
resp.next_cursor.is_some(),
|
||||
"k-pop truncation must still emit next_cursor; popped hits at offset+returned"
|
||||
);
|
||||
}
|
||||
@@ -46,3 +46,152 @@ fn korean_lexical_query_returns_korean_document() {
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — multi-token Korean query (`해시 충돌`) must hit when
|
||||
/// the lexical builder routes it through a whole-phrase MATCH candidate.
|
||||
///
|
||||
/// Expected: FAIL until A5 (`build_match_string` redesign) lands — the
|
||||
/// current builder emits `"해시" "충돌"` AND, but FTS5 trigram tokenizer
|
||||
/// has no 2-char terms so each side is 0-hit. A5 introduces a whole-
|
||||
/// phrase candidate (`"해시 충돌"`) OR'd with the token AND, restoring
|
||||
/// hits for the dominant Korean usage pattern.
|
||||
#[test]
|
||||
fn lexical_multi_token_korean_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
|
||||
// Copy the synthetic Korean fixture (introduced in A4 Step 0) into
|
||||
// the test workspace. The fixture contains the exact phrase
|
||||
// "해시 충돌" multiple times.
|
||||
let dest = env.workspace_root.join("hash-table.md");
|
||||
let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("search")
|
||||
.join("korean")
|
||||
.join("hash-table.md");
|
||||
std::fs::copy(&src, &dest).expect("copy korean fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("해시 충돌"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"multi-token Korean query '해시 충돌' must hit the hash-table fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_hash_table = hits.iter().any(|h| h.doc_path.0.contains("hash-table"));
|
||||
assert!(
|
||||
any_hash_table,
|
||||
"expected at least one hit on the hash-table fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// A4 Step 1c — mixed Korean+English multi-token query (`Rust 충돌은`).
|
||||
/// Both tokens are ≥3 chars, so the redesigned builder (A5) emits
|
||||
/// `("Rust 충돌은") OR ("Rust" AND "충돌은")`. With trigram tokenizer
|
||||
/// each side has substring coverage in the document, so the AND branch
|
||||
/// alone is enough. Expected: FAIL pre-A5, PASS post-A5.
|
||||
#[test]
|
||||
fn lexical_mixed_korean_english_multi_token_query_hits() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-hash.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 해시 테이블\n\nRust 의 std::collections::HashMap 에서 \
|
||||
해시 충돌은 SipHash 로 완화한다.\n",
|
||||
)
|
||||
.expect("write rust-hash fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust 충돌은"))
|
||||
.expect("search must succeed");
|
||||
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"mixed Korean+English multi-token query 'Rust 충돌은' must hit the rust-hash fixture; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
let any_rust_hash = hits.iter().any(|h| h.doc_path.0.contains("rust-hash"));
|
||||
assert!(
|
||||
any_rust_hash,
|
||||
"expected at least one hit on the rust-hash fixture, got: {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// ── S7 V009 morphological tokenizer end-to-end tests ─────────────────
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한국어 2자 query 가 end-to-end
|
||||
/// lexical 경로에서 hit. lindera ko-dic 이 '한국어를' → '한국어' 형태소로
|
||||
/// 분해, '서울은' → '서울' 로 분해하여 tokenized_korean_text column 에
|
||||
/// 기록 → FTS5 매칭.
|
||||
#[test]
|
||||
fn korean_morphological_2char_query_lexical_mode() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("korean-wiki.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# 한국어 위키\n\n한국어를 공부합니다.\n서울은 한국의 수도입니다.\n",
|
||||
)
|
||||
.expect("write korean-wiki fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("한국"))
|
||||
.expect("search 한국");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'한국' 2-char Korean query must return at least one hit (V009 morphological); got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("서울"))
|
||||
.expect("search 서울");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'서울' 2-char Korean query must return at least one hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// S7 — V009 morphological tokenizer: 한-영 혼합 query lexical hit.
|
||||
/// 'Rust' (English whole-token) + '최적화' (Korean morpheme) 각각 hit.
|
||||
#[test]
|
||||
fn korean_morphological_mixed_english_korean_query() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let doc_path = env.workspace_root.join("rust-optimization.md");
|
||||
std::fs::write(
|
||||
&doc_path,
|
||||
"# Rust 최적화 노트\n\nRust 최적화는 zero-cost abstraction 을 강조한다.\n",
|
||||
)
|
||||
.expect("write rust-optimization fixture");
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true)
|
||||
.expect("ingest must succeed");
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("Rust"))
|
||||
.expect("search Rust");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'Rust' English whole-token must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query("최적화"))
|
||||
.expect("search 최적화");
|
||||
assert!(
|
||||
!hits.is_empty(),
|
||||
"'최적화' Korean morpheme must hit; got {:?}",
|
||||
hits.iter().map(|h| &h.doc_path.0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -35,8 +35,8 @@ fn lexical_search_returns_hits_after_ingest() {
|
||||
fn lexical_search_empty_query_returns_empty() {
|
||||
let env = TestEnv::lexical_only();
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let hits = kebab_app::search_with_config(env.config.clone(), common::lexical_query(" "))
|
||||
.unwrap();
|
||||
let hits =
|
||||
kebab_app::search_with_config(env.config.clone(), common::lexical_query(" ")).unwrap();
|
||||
assert!(hits.is_empty(), "blank query must short-circuit empty");
|
||||
}
|
||||
|
||||
@@ -107,20 +107,25 @@ fn search_uncached_returns_same_hits_as_cached() {
|
||||
#[test]
|
||||
fn first_ingest_bumps_corpus_revision() {
|
||||
let env = TestEnv::lexical_only();
|
||||
let store_before =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let store_before = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
store_before.run_migrations().unwrap();
|
||||
assert_eq!(store_before.corpus_revision(), 0, "fresh store seeds 0");
|
||||
// V004 seeds 0; V009 + V010 + V011 migrations each bump by 1 to
|
||||
// invalidate stale LRU caches (spec §5.2). Baseline before ingest = 3.
|
||||
// (V012 derivation_cache + V013 drop-chunk-aliases are structural/additive
|
||||
// — neither bumps corpus_revision.)
|
||||
let baseline = store_before.corpus_revision();
|
||||
assert_eq!(baseline, 3, "fresh store post-V011 baseline = 3");
|
||||
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(report.new + report.updated > 0, "first ingest must commit ≥1 doc");
|
||||
|
||||
let store_after =
|
||||
kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() >= 1,
|
||||
"ingest commit must bump corpus_revision (got {})",
|
||||
report.new + report.updated > 0,
|
||||
"first ingest must commit ≥1 doc"
|
||||
);
|
||||
|
||||
let store_after = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
|
||||
assert!(
|
||||
store_after.corpus_revision() > baseline,
|
||||
"ingest commit must bump corpus_revision past baseline {baseline} (got {})",
|
||||
store_after.corpus_revision(),
|
||||
);
|
||||
}
|
||||
|
||||
91
crates/kebab-app/tests/search_stale_integration.rs
Normal file
91
crates/kebab-app/tests/search_stale_integration.rs
Normal file
@@ -0,0 +1,91 @@
|
||||
//! p9-fb-32: `App::search` end-to-end staleness wiring.
|
||||
//!
|
||||
//! `compute_stale` itself is unit-tested in `kebab_app::staleness`; this
|
||||
//! file proves the post-process actually fires through the full
|
||||
//! retriever stack and that the cache-hit re-stamp respects the
|
||||
//! configured threshold.
|
||||
//!
|
||||
//! All three tests run lexical-only (no AVX, no fastembed download).
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
|
||||
fn lexical_query_owner() -> kebab_core::SearchQuery {
|
||||
common::lexical_query("ownership")
|
||||
}
|
||||
|
||||
/// Fresh ingest at default 30-day threshold → no hit can be stale.
|
||||
/// `documents.updated_at` is stamped at ingest time (now), so the
|
||||
/// distance to `now_utc()` is sub-second.
|
||||
#[test]
|
||||
fn fresh_doc_is_not_stale_with_default_threshold() {
|
||||
let env = TestEnv::lexical_only();
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
|
||||
let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
|
||||
let hits = app.search(lexical_query_owner()).unwrap();
|
||||
assert!(!hits.is_empty(), "expected ≥1 hit for 'ownership'");
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"freshly-ingested doc must not be stale at default 30d threshold: {:?}",
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// `stale_threshold_days = 0` disables the feature even for very old
|
||||
/// `documents.updated_at`. Backdate the row to a year ago, expect
|
||||
/// `stale: false` on every hit.
|
||||
#[test]
|
||||
fn threshold_zero_disables_staleness() {
|
||||
let mut env = TestEnv::lexical_only();
|
||||
env.config.search.stale_threshold_days = 0;
|
||||
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
common::backdate_document_updated_at(&env, "intro.md", 365);
|
||||
|
||||
let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
|
||||
let hits = app.search(lexical_query_owner()).unwrap();
|
||||
assert!(!hits.is_empty(), "expected ≥1 hit");
|
||||
assert!(
|
||||
hits.iter().all(|h| !h.stale),
|
||||
"threshold=0 disables staleness even for year-old docs: {:?}",
|
||||
hits.iter()
|
||||
.map(|h| (h.doc_path.0.clone(), h.stale))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
/// With a 30-day threshold, a document whose `documents.updated_at` is 60
/// days old must be flagged stale. Only `intro.md` is backdated, so the
/// assertion is restricted to hits on that file; hits on the other
/// (un-backdated) fixtures are not inspected.
#[test]
fn old_doc_marked_stale() {
    let mut env = TestEnv::lexical_only();
    env.config.search.stale_threshold_days = 30;

    kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
    common::backdate_document_updated_at(&env, "intro.md", 60);

    let app = kebab_app::App::open_with_config(env.config.clone()).unwrap();
    let hits = app.search(lexical_query_owner()).unwrap();
    assert!(!hits.is_empty(), "expected ≥1 hit");

    // Keep only the hits that belong to the backdated document.
    let mut intro_hits: Vec<&kebab_core::SearchHit> = Vec::new();
    for hit in hits.iter() {
        if hit.doc_path.0.ends_with("intro.md") {
            intro_hits.push(hit);
        }
    }
    assert!(
        !intro_hits.is_empty(),
        "expected ≥1 hit on intro.md (the backdated doc)"
    );

    let staleness: Vec<_> = intro_hits
        .iter()
        .map(|hit| (hit.doc_path.0.clone(), hit.stale))
        .collect();
    assert!(
        !intro_hits.iter().any(|hit| !hit.stale),
        "60-day-old intro.md must be stale at 30d threshold: {:?}",
        staleness
    );
}
|
||||
@@ -14,12 +14,11 @@ use common::TestEnv;
|
||||
fn require_avx_or_panic() {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if !std::is_x86_feature_detected!("avx") {
|
||||
panic!(
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
assert!(
|
||||
std::is_x86_feature_detected!("avx"),
|
||||
"kb-app vector integration test requires AVX-capable hardware; \
|
||||
host CPU lacks AVX. Run on an AVX-capable machine."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,8 +29,7 @@ fn ingest_then_hybrid_search_returns_hits() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
@@ -57,8 +55,7 @@ fn ingest_then_vector_search_carries_embedding_model() {
|
||||
require_avx_or_panic();
|
||||
|
||||
let env = TestEnv::with_embeddings();
|
||||
let report =
|
||||
kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), true).unwrap();
|
||||
assert_eq!(report.errors, 0, "no per-file errors: {report:?}");
|
||||
assert_eq!(report.new, 3);
|
||||
|
||||
|
||||
@@ -13,11 +13,7 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
std::fs::write(workspace_root.join("legacy.docx"), b"unsupported").unwrap();
|
||||
std::fs::write(workspace_root.join("Makefile"), b"unsupported").unwrap();
|
||||
|
||||
let report = kebab_app::ingest_with_config(
|
||||
env.config.clone(),
|
||||
env.scope(),
|
||||
false,
|
||||
).unwrap();
|
||||
let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false).unwrap();
|
||||
|
||||
let items = report.items.as_ref().expect("items array populated");
|
||||
let docx_item = items
|
||||
@@ -39,5 +35,8 @@ fn unsupported_extension_skip_carries_warning_and_is_aggregated() {
|
||||
vec!["unsupported media type: <no-ext>".to_string()],
|
||||
);
|
||||
assert_eq!(report.skipped_by_extension.get("docx").copied(), Some(1));
|
||||
assert_eq!(report.skipped_by_extension.get("<no-ext>").copied(), Some(1));
|
||||
assert_eq!(
|
||||
report.skipped_by_extension.get("<no-ext>").copied(),
|
||||
Some(1)
|
||||
);
|
||||
}
|
||||
|
||||
178
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
178
crates/kebab-app/tests/twin_files_fetch_span.rs
Normal file
@@ -0,0 +1,178 @@
|
||||
//! Regression test for the twin-file fetch_span media-type lookup bug.
|
||||
//!
|
||||
//! Twin files (identical content at different workspace paths) share one
|
||||
//! `assets` row whose PRIMARY KEY is the blake3 content hash. The old
|
||||
//! `fetch_span` implementation called
|
||||
//! `get_asset_by_workspace_path(&doc.workspace_path)` to check whether the
|
||||
//! media type was PDF/audio (and therefore reject span fetch). For a twin
|
||||
//! file that lookup could silently return the *other* twin's asset row if
|
||||
//! `assets.workspace_path` had been overwritten on the most recent ingest of
|
||||
//! the sibling — making the media-type branch decision incorrect.
|
||||
//!
|
||||
//! Fix: `fetch_span` now uses the 2-step lookup
|
||||
//! `get_document_by_workspace_path` → `doc.source_asset_id` → `get_asset`
|
||||
//! so the result is always anchored to the requesting document, not
|
||||
//! whichever twin last updated `assets.workspace_path`.
|
||||
//!
|
||||
//! This test builds a twin-file scenario (two .md files at different paths
|
||||
//! with identical content), ingests both, then calls `fetch_span` on each
|
||||
//! twin's `doc_id` and asserts it succeeds. Before the fix, if the asset
|
||||
//! row's workspace_path happened to point at the wrong twin the span could
|
||||
//! return an incorrect `span_not_supported` for a non-PDF/audio file, or
|
||||
//! conversely allow span on a PDF twin by accident. After the fix, the
|
||||
//! lookup is always doc-specific.
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::{DocumentStore, FetchKind, FetchOpts, FetchQuery, IngestItemKind};
|
||||
|
||||
/// Ingests two byte-identical markdown files at different workspace paths,
/// checks that they share a single asset id, and verifies that a span fetch
/// succeeds for each twin's `doc_id` — both right after the first ingest and
/// again after a second ingest of the same workspace.
#[test]
fn twin_files_fetch_span_uses_correct_asset() {
    let env = TestEnv::lexical_only();

    // Several lines of content so that a span fetch has something to return.
    let content = "# Twin\n\nLine one.\n\nLine two.\n\nLine three.\n";

    // Same content, two different directories.
    let dir_a = env.workspace_root.join("src_a");
    let dir_b = env.workspace_root.join("src_b");
    std::fs::create_dir_all(&dir_a).unwrap();
    std::fs::create_dir_all(&dir_b).unwrap();
    std::fs::write(dir_a.join("note.md"), content).unwrap();
    std::fs::write(dir_b.join("note.md"), content).unwrap();

    // First ingest: fixture workspace plus the two twins.
    let report =
        ingest_with_config(env.config.clone(), env.scope(), false).expect("ingest must succeed");
    assert_eq!(report.errors, 0, "no ingest errors; report={report:?}");

    // Both twins must be reported, and both as New.
    let items = report.items.as_ref().expect("items must be present");
    let mut twin_items = Vec::new();
    for item in items.iter() {
        let path = &item.doc_path.0;
        if path.ends_with("src_a/note.md") || path.ends_with("src_b/note.md") {
            twin_items.push(item);
        }
    }
    assert_eq!(
        twin_items.len(),
        2,
        "exactly 2 twin items expected; items={items:?}"
    );
    for item in &twin_items {
        assert_eq!(
            item.kind,
            IngestItemKind::New,
            "each twin must be New; item={item:?}"
        );
    }

    // Open the store directly so the document rows can be looked up by path.
    let store = kebab_store_sqlite::SqliteStore::open(&env.config).unwrap();
    store.run_migrations().unwrap();

    // Take the workspace paths exactly as the ingest report spells them
    // (matched by suffix), so the lookup does not depend on how the
    // workspace root is represented.
    let reported_path = |suffix: &str| {
        items
            .iter()
            .map(|item| &item.doc_path.0)
            .find(|path| path.ends_with(suffix))
            .cloned()
    };
    let path_a_str =
        reported_path("src_a/note.md").expect("src_a/note.md must appear in ingest report");
    let path_b_str =
        reported_path("src_b/note.md").expect("src_b/note.md must appear in ingest report");

    let path_a = kebab_core::WorkspacePath(path_a_str);
    let path_b = kebab_core::WorkspacePath(path_b_str);

    let doc_a = store
        .get_document_by_workspace_path(&path_a)
        .expect("get_document_by_workspace_path path_a")
        .expect("doc_a must exist after ingest");
    let doc_b = store
        .get_document_by_workspace_path(&path_b)
        .expect("get_document_by_workspace_path path_b")
        .expect("doc_b must exist after ingest");

    // Identical content means one shared asset row.
    assert_eq!(
        doc_a.source_asset_id, doc_b.source_asset_id,
        "twin files must share one asset_id"
    );

    // Span fetch against each twin's own doc_id.
    let app = env.app();

    let query_a = FetchQuery::Span {
        doc_id: doc_a.doc_id.clone(),
        line_start: 1,
        line_end: 2,
    };
    let result_a = app
        .fetch(query_a, FetchOpts::default())
        .expect("fetch_span on twin A must succeed for a markdown file");
    assert_eq!(result_a.kind, FetchKind::Span);
    assert!(
        result_a.text.as_deref().is_some_and(|t| !t.is_empty()),
        "span text for twin A must not be empty"
    );

    let query_b = FetchQuery::Span {
        doc_id: doc_b.doc_id.clone(),
        line_start: 1,
        line_end: 2,
    };
    let result_b = app
        .fetch(query_b, FetchOpts::default())
        .expect("fetch_span on twin B must succeed for a markdown file");
    assert_eq!(result_b.kind, FetchKind::Span);
    assert!(
        result_b.text.as_deref().is_some_and(|t| !t.is_empty()),
        "span text for twin B must not be empty"
    );

    // A second ingest is the scenario that used to trigger the bug: it can
    // flip the asset row's workspace_path to either twin.
    let report2 = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("second ingest must succeed");
    assert_eq!(
        report2.errors, 0,
        "no ingest errors on second run; report={report2:?}"
    );

    // Fresh App handle after the second ingest; both fetches must still work.
    let app2 = env.app();

    let requery_a = FetchQuery::Span {
        doc_id: doc_a.doc_id.clone(),
        line_start: 1,
        line_end: 3,
    };
    app2.fetch(requery_a, FetchOpts::default())
        .expect("fetch_span on twin A after flip-flop must still succeed");

    let requery_b = FetchQuery::Span {
        doc_id: doc_b.doc_id.clone(),
        line_start: 1,
        line_end: 3,
    };
    app2.fetch(requery_b, FetchOpts::default())
        .expect("fetch_span on twin B after flip-flop must still succeed");
}
|
||||
94
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
94
crates/kebab-app/tests/twin_files_idempotent.rs
Normal file
@@ -0,0 +1,94 @@
|
||||
//! Regression test for the twin-file idempotency bug.
|
||||
//!
|
||||
//! Identical-content files at different workspace paths share one
|
||||
//! `assets` row (`asset_id` = blake3 content hash, PRIMARY KEY). The
|
||||
//! old UPSERT `ON CONFLICT(asset_id) DO UPDATE SET workspace_path =
|
||||
//! excluded.workspace_path` made each twin overwrite the other's path
|
||||
//! on every ingest, so `get_asset_by_workspace_path(path1)` returned
|
||||
//! None (or the wrong twin) → re-process every time.
|
||||
//!
|
||||
//! Fix: `try_skip_unchanged` now uses `get_document_by_workspace_path`
|
||||
//! instead. `documents.workspace_path` is UNIQUE (V001) so each twin
|
||||
//! has its own stable document row.
|
||||
//!
|
||||
//! Assertion contract:
|
||||
//! 1st ingest → 2 New (one per twin)
|
||||
//! 2nd ingest → 0 New, 0 Updated, 2 Unchanged
|
||||
|
||||
mod common;
|
||||
|
||||
use common::TestEnv;
|
||||
use kebab_app::ingest_with_config;
|
||||
use kebab_core::IngestItemKind;
|
||||
|
||||
/// Ingests two byte-identical files at different paths twice. The first run
/// must report both as `New`; the second run must report no new and no
/// updated documents, with both twins listed as `Unchanged`.
#[test]
fn twin_files_second_ingest_is_unchanged() {
    let env = TestEnv::lexical_only();

    // Same bytes written under two sibling directories.
    let content = b"# shared\nThis content is identical in both files.\n";
    let pkg_a = env.workspace_root.join("pkg_a");
    let pkg_b = env.workspace_root.join("pkg_b");
    std::fs::create_dir_all(&pkg_a).unwrap();
    std::fs::create_dir_all(&pkg_b).unwrap();
    std::fs::write(pkg_a.join("__init__.py"), content).unwrap();
    std::fs::write(pkg_b.join("__init__.py"), content).unwrap();

    // --- First ingest: each twin is New. ---
    let first = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("first ingest must succeed");
    assert_eq!(first.errors, 0, "first ingest: no errors; report={first:?}");

    let items = first.items.as_ref().expect("items must be present");
    let mut twin_items = Vec::new();
    for item in items.iter() {
        if item.doc_path.0.ends_with("__init__.py") {
            twin_items.push(item);
        }
    }
    assert_eq!(
        twin_items.len(),
        2,
        "first ingest: expected exactly 2 __init__.py items; items={items:?}"
    );
    for item in &twin_items {
        assert_eq!(
            item.kind,
            IngestItemKind::New,
            "first ingest: each twin must be New; item={item:?}"
        );
    }

    // --- Second ingest: nothing changed on disk, so nothing is re-processed. ---
    let second = ingest_with_config(env.config.clone(), env.scope(), false)
        .expect("second ingest must succeed");
    assert_eq!(
        second.errors, 0,
        "second ingest: no errors; report={second:?}"
    );
    assert_eq!(
        second.new, 0,
        "second ingest: no new docs; report={second:?}"
    );
    assert_eq!(
        second.updated, 0,
        "second ingest: no updated docs (twin-file bug would set this to 2); report={second:?}"
    );

    let second_items = second.items.as_ref().expect("items must be present");
    let mut twin_items2 = Vec::new();
    for item in second_items.iter() {
        if item.doc_path.0.ends_with("__init__.py") {
            twin_items2.push(item);
        }
    }
    assert_eq!(
        twin_items2.len(),
        2,
        "second ingest: expected exactly 2 __init__.py items; items={second_items:?}"
    );
    for item in &twin_items2 {
        assert_eq!(
            item.kind,
            IngestItemKind::Unchanged,
            "second ingest: each twin must be Unchanged; item={item:?}"
        );
    }
}
|
||||
@@ -13,14 +13,21 @@ serde_json_canonicalizer = "0.3"
|
||||
blake3 = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
serde_yaml = { workspace = true }
|
||||
lindera = { workspace = true, features = ["embed-ko-dic"] }
|
||||
lindera-ko-dic = { workspace = true, features = ["embed-ko-dic"] }
|
||||
|
||||
[dev-dependencies]
|
||||
# kb-parse-md / kb-normalize are dev-only — used by the snapshot integration
|
||||
# test to build a CanonicalDocument from a fixture Markdown file. Forbidden as
|
||||
# regular deps per design §8 (chunker consumes CanonicalDocument from kb-core
|
||||
# only); `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# kb-parse-md / kb-parse-code are dev-only — used by the snapshot integration
|
||||
# tests to build a CanonicalDocument from fixture files. kb-parse-md absorbed
|
||||
# kb-normalize in v0.19.0 (HOTFIXES.md 2026-05-26). Forbidden as regular deps
|
||||
# per design §8 (chunker consumes CanonicalDocument from kb-core only);
|
||||
# `cargo tree -p kb-chunk --depth 1` (default scope, excludes dev-deps)
|
||||
# confirms this.
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-normalize = { path = "../kebab-normalize" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
kebab-parse-md = { path = "../kebab-parse-md" }
|
||||
kebab-parse-code = { path = "../kebab-parse-code" }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
375
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
375
crates/kebab-chunk/src/code_c_ast_v1.rs
Normal file
@@ -0,0 +1,375 @@
|
||||
//! `code-c-ast-v1` — maps a tree-sitter-derived C AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-c-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeCAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!("CodeCAstV1Chunker only handles code docs (got non-Code block)"),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-c-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a C `CanonicalDocument` with one `Block::Code` per entry of
    /// `units`, each given as `(symbol, line_start, line_end, code)`.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.c".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-c-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, (sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: *ls,
                line_end: *le,
                symbol: Some((*sym).to_string()),
                lang: Some("c".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("c".into()),
                code: (*code).to_string(),
            }));
        }

        // One fixed timestamp serves as both created_at and updated_at.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("c".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// The chunk policy shared by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    /// Collects the chunk ids produced for `doc` under the shared policy.
    fn chunk_ids(doc: &CanonicalDocument) -> Vec<String> {
        CodeCAstV1Chunker
            .chunk(doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect()
    }

    #[test]
    fn chunker_version_is_code_c_ast_v1() {
        let expected = ChunkerVersion("code-c-ast-v1".into());
        assert_eq!(CodeCAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units = [
            ("parse", 1, 3, "int parse() {\n\t// x\n}"),
            ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"),
        ];
        let doc = code_doc(&units);
        let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in chunks.iter() {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-c-ast-v1");
        }

        // The first chunk must carry the first unit's symbol and line range.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // 500 statement lines wrapped in a function: well over the limit.
        let mut body = String::new();
        for i in 0..500 {
            body.push_str(&format!("\tx{i} = {i};\n"));
        }
        let code = format!("int big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);

        let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in chunks.iter() {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        // Swap the code block for a paragraph; the chunker must refuse it.
        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        doc.blocks = vec![paragraph];

        let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
        let base = chunk_ids(&doc);
        for _ in 0..1000 {
            assert_eq!(chunk_ids(&doc), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeCAstV1Chunker.policy_hash(&p);
        let theirs = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, theirs);
    }
}
|
||||
377
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
377
crates/kebab-chunk/src/code_cpp_ast_v1.rs
Normal file
@@ -0,0 +1,377 @@
|
||||
//! `code-cpp-ast-v1` — maps a tree-sitter-derived C++ AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "code-cpp-ast-v1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeCppAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeCppAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeCppAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeCppAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-cpp-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a C++ code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.cpp".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-cpp-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);
        let blocks = units
            .iter()
            .enumerate()
            .map(|(i, (sym, ls, le, code))| {
                let span = SourceSpan::Code {
                    line_start: *ls,
                    line_end: *le,
                    symbol: Some((*sym).to_string()),
                    lang: Some("cpp".into()),
                };
                let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
                Block::Code(CodeBlock {
                    common: CommonBlock {
                        block_id: bid,
                        heading_path: vec![],
                        source_span: span,
                    },
                    lang: Some("cpp".into()),
                    code: (*code).to_string(),
                })
            })
            .collect();
        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("cpp".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by all tests in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_cpp_ast_v1() {
        assert_eq!(
            CodeCppAstV1Chunker.chunker_version(),
            ChunkerVersion("code-cpp-ast-v1".into())
        );
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "int parse() {\n\t// x\n}"),
            ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"),
        ]);
        let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);
        for c in &chunks {
            assert_eq!(c.source_spans.len(), 1);
            assert!(matches!(c.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(c.heading_path, Vec::<String>::new());
            assert_eq!(c.chunker_version.0, "code-cpp-ast-v1");
        }
        match &chunks[0].source_spans[0] {
            SourceSpan::Code {
                symbol,
                line_start,
                line_end,
                ..
            } => {
                assert_eq!(symbol.as_deref(), Some("parse"));
                assert_eq!((*line_start, *line_end), (1, 3));
            }
            _ => unreachable!(),
        }
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // Join the body lines with "\n" (no per-line trailing newline) so the
        // fixture has exactly 502 lines, matching the declared span 1..=502.
        let body = (0..500)
            .map(|i| format!("\tx{i} = {i};"))
            .collect::<Vec<_>>()
            .join("\n");
        let code = format!("int big() {{\n{body}\n}}");
        assert_eq!(code.split('\n').count(), 502, "fixture matches declared span");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );
        for c in &chunks {
            match &c.source_spans[0] {
                SourceSpan::Code { symbol, .. } => {
                    assert!(
                        symbol.as_deref().unwrap().starts_with("big [part "),
                        "part-numbered symbol, got {symbol:?}"
                    );
                }
                _ => unreachable!(),
            }
        }
        // The parts must start at the unit's first line and end at its last.
        match (
            &chunks.first().unwrap().source_spans[0],
            &chunks.last().unwrap().source_spans[0],
        ) {
            (
                SourceSpan::Code { line_start, .. },
                SourceSpan::Code { line_end, .. },
            ) => {
                assert_eq!((*line_start, *line_end), (1, 502));
            }
            _ => unreachable!(),
        }
        let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        let n = ids.len();
        ids.sort_unstable();
        ids.dedup();
        assert_eq!(ids.len(), n, "chunk_ids unique across split parts");
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;
        let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]);
        doc.blocks = vec![Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        })];
        let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeCppAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]);
        let base: Vec<String> = CodeCppAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect();
        for _ in 0..1000 {
            let again: Vec<String> = CodeCppAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        assert_eq!(
            CodeCppAstV1Chunker.policy_hash(&p),
            crate::MdHeadingV1Chunker.policy_hash(&p)
        );
    }
}
|
||||
383
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_go_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-go-ast-v1` — maps a tree-sitter-derived Go AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label reported by `chunker_version()` and used in the debug log line.
const VERSION_LABEL: &str = "code-go-ast-v1";
|
||||
/// Divisor for the byte-length based `token_estimate` (bytes / 3, rounded up).
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// A unit whose source span covers more lines than this is split into parts.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Stateless chunker for Go code documents: its `Chunker` impl emits one
/// chunk per `Block::Code` and splits units longer than `AST_CHUNK_MAX_LINES`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeGoAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeGoAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeGoAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeGoAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-go-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a Go code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.go".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-go-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::new();
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("go".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("go".into()),
                code: code.to_string(),
            }));
        }

        // Both timestamps use the same fixed instant.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: stamp,
            updated_at: stamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("go".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by all tests in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_go_ast_v1() {
        let expected = ChunkerVersion("code-go-ast-v1".into());
        assert_eq!(CodeGoAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units = [
            ("parse", 1, 3, "func parse() {\n\t// x\n}"),
            (
                "Foo.double",
                5,
                7,
                "func double() int {\n\t//\n\treturn 0\n}",
            ),
        ];
        let doc = code_doc(&units);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-go-ast-v1");
        }

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let statements: Vec<String> = (0..500).map(|i| format!("\tx{i} := {i}")).collect();
        let body = statements.join("\n");
        let code = format!("func big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        // A set drops duplicates, so equal sizes mean every id is distinct.
        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        let mut doc = code_doc(&[("parse", 1, 1, "func parse() {}")]);
        doc.blocks = vec![paragraph];

        let err = CodeGoAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeGoAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "func parse() {}\n")]);
        let chunk_ids = || -> Vec<String> {
            CodeGoAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            assert_eq!(chunk_ids(), base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeGoAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
378
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
378
crates/kebab-chunk/src/code_java_ast_v1.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! `code-java-ast-v1` — maps a tree-sitter-derived Java AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label reported by `chunker_version()` and used in the debug log line.
const VERSION_LABEL: &str = "code-java-ast-v1";
|
||||
/// Divisor for the byte-length based `token_estimate` (bytes / 3, rounded up).
const BYTES_PER_TOKEN: usize = 3;
|
||||
/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
/// A unit whose source span covers more lines than this is split into parts.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Stateless chunker for Java code documents: its `Chunker` impl emits one
/// chunk per `Block::Code` and splits units longer than `AST_CHUNK_MAX_LINES`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct CodeJavaAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeJavaAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeJavaAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-java-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/x/src/Main.java".into());
|
||||
let aid = AssetId("a".repeat(64));
|
||||
let pv = ParserVersion("code-java-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
let blocks = units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("java".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: (*code).to_string(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "a".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion(VERSION_LABEL.into()),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn chunker_version_is_code_java_ast_v1() {
|
||||
assert_eq!(
|
||||
CodeJavaAstV1Chunker.chunker_version(),
|
||||
ChunkerVersion("code-java-ast-v1".into())
|
||||
);
|
||||
}
|
||||
|
||||
/// Two small units yield two chunks, each carrying exactly one
/// `SourceSpan::Code`; the first chunk keeps its symbol and line range.
#[test]
fn one_chunk_per_unit_preserves_code_span() {
    let units: [(&str, u32, u32, &str); 2] = [
        ("parse", 1, 3, "void parse() {\n\t// x\n}"),
        ("Foo.double", 5, 7, "int double() {\n\t//\n\treturn 0;\n}"),
    ];
    let doc = code_doc(&units);
    let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
    assert_eq!(chunks.len(), 2);

    chunks.iter().for_each(|chunk| {
        assert_eq!(chunk.source_spans.len(), 1);
        assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
        assert_eq!(chunk.heading_path, Vec::<String>::new());
        assert_eq!(chunk.chunker_version.0, "code-java-ast-v1");
    });

    let SourceSpan::Code {
        symbol,
        line_start,
        line_end,
        ..
    } = &chunks[0].source_spans[0]
    else {
        unreachable!()
    };
    assert_eq!(symbol.as_deref(), Some("parse"));
    assert_eq!((*line_start, *line_end), (1, 3));
}
|
||||
|
||||
/// A 502-line unit must be split into several parts whose symbols carry a
/// `[part i/N]` suffix and whose chunk ids are pairwise distinct.
#[test]
fn oversize_unit_splits_into_parts_with_unique_ids() {
    // Header line, 500 body lines, closing brace.
    let mut code = String::from("void big() {\n");
    for i in 0..500 {
        code.push_str(&format!("\tint x{i} = {i};\n"));
    }
    code.push('}');

    let doc = code_doc(&[("big", 1, 502, code.as_str())]);
    let chunks = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap();
    assert!(
        chunks.len() >= 2,
        "oversize unit must split, got {}",
        chunks.len()
    );

    for chunk in &chunks {
        let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
            unreachable!()
        };
        assert!(
            symbol.as_deref().unwrap().starts_with("big [part "),
            "part-numbered symbol, got {symbol:?}"
        );
    }

    let distinct: std::collections::HashSet<&str> =
        chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
    assert_eq!(
        distinct.len(),
        chunks.len(),
        "chunk_ids unique across split parts"
    );
}
|
||||
|
||||
/// A document whose only block is a paragraph must be rejected with an
/// error that names the chunker.
#[test]
fn non_code_doc_errors() {
    use kebab_core::TextBlock;

    let paragraph = Block::Paragraph(TextBlock {
        common: CommonBlock {
            block_id: kebab_core::BlockId("b".into()),
            heading_path: vec![],
            source_span: SourceSpan::Line { start: 1, end: 1 },
        },
        text: "x".into(),
        inlines: vec![],
    });
    // Start from a valid code doc and swap its blocks for the paragraph.
    let doc = CanonicalDocument {
        blocks: vec![paragraph],
        ..code_doc(&[("parse", 1, 1, "void parse() {}")])
    };

    let err = CodeJavaAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
    assert!(err.to_string().contains("CodeJavaAstV1Chunker"));
}
|
||||
|
||||
/// Chunking the same document 1000 more times must reproduce the same
/// chunk ids as the first run.
#[test]
fn deterministic_chunk_ids_1000() {
    let doc = code_doc(&[("parse", 1, 2, "void parse() {}\n")]);
    let chunk_ids = || -> Vec<String> {
        CodeJavaAstV1Chunker
            .chunk(&doc, &policy())
            .unwrap()
            .into_iter()
            .map(|c| c.chunk_id.0)
            .collect()
    };

    let base = chunk_ids();
    for _ in 0..1000 {
        assert_eq!(chunk_ids(), base);
    }
}
|
||||
|
||||
/// For the same policy, this chunker's policy hash must equal the one
/// produced by `MdHeadingV1Chunker`.
#[test]
fn policy_hash_matches_md_heading_v1() {
    let p = policy();
    let ours = CodeJavaAstV1Chunker.policy_hash(&p);
    let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
    assert_eq!(ours, reference);
}
|
||||
}
|
||||
383
crates/kebab-chunk/src/code_js_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_js_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-js-ast-v1` — maps a tree-sitter-derived JavaScript AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label reported by `chunker_version()`.
const VERSION_LABEL: &str = "code-js-ast-v1";

/// Line count above which a unit is split into parts.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Divisor applied to a chunk's byte length to obtain `token_estimate`.
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters kept from the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Field-less chunker for JavaScript code documents.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodeJsAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeJsAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeJsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeJsAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-js-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a JavaScript code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.js".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-js-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);

        let mut blocks = Vec::with_capacity(units.len());
        for (idx, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("javascript".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], idx as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("javascript".into()),
                code: code.to_string(),
            }));
        }

        let timestamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: timestamp,
            updated_at: timestamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("javascript".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by the tests, pinned to `VERSION_LABEL`.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.to_string());
        ChunkPolicy {
            chunker_version,
            respect_markdown_headings: false,
            overlap_tokens: 80,
            target_tokens: 500,
        }
    }

    /// The chunker must report the exact `code-js-ast-v1` label.
    #[test]
    fn chunker_version_is_code_js_ast_v1() {
        let reported = CodeJsAstV1Chunker.chunker_version();
        let expected = ChunkerVersion("code-js-ast-v1".into());
        assert_eq!(reported, expected);
    }

    /// Two small units yield two chunks, each with exactly one
    /// `SourceSpan::Code`; the first keeps its symbol and line range.
    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units: [(&str, u32, u32, &str); 2] = [
            ("parse", 1, 3, "function parse() {\n // x\n}"),
            (
                "Foo.double",
                5,
                7,
                "function double() {\n //\n return 0;\n}",
            ),
        ];
        let doc = code_doc(&units);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        chunks.iter().for_each(|chunk| {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-js-ast-v1");
        });

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    /// A 502-line unit must be split into several parts whose symbols carry
    /// a `[part i/N]` suffix and whose chunk ids are pairwise distinct.
    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // Header line, 500 body lines, closing brace.
        let mut code = String::from("function big() {\n");
        for i in 0..500 {
            code.push_str(&format!(" const x{i} = {i};\n"));
        }
        code.push('}');

        let doc = code_doc(&[("big", 1, 502, code.as_str())]);
        let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    /// A document whose only block is a paragraph must be rejected with an
    /// error that names the chunker.
    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        // Start from a valid code doc and swap its blocks for the paragraph.
        let doc = CanonicalDocument {
            blocks: vec![paragraph],
            ..code_doc(&[("parse", 1, 1, "function parse() {}")])
        };

        let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeJsAstV1Chunker"));
    }

    /// Chunking the same document 1000 more times must reproduce the same
    /// chunk ids as the first run.
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]);
        let chunk_ids = || -> Vec<String> {
            CodeJsAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            assert_eq!(chunk_ids(), base);
        }
    }

    /// For the same policy, this chunker's policy hash must equal the one
    /// produced by `MdHeadingV1Chunker`.
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeJsAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
383
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_kotlin_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-kotlin-ast-v1` — maps a tree-sitter-derived Kotlin AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label reported by `chunker_version()`.
const VERSION_LABEL: &str = "code-kotlin-ast-v1";

/// Line count above which a unit is split into parts.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Divisor applied to a chunk's byte length to obtain `token_estimate`.
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters kept from the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Field-less chunker for Kotlin code documents.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodeKotlinAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeKotlinAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeKotlinAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-kotlin-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Builds a Kotlin code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/Main.kt".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-kotlin-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);

        let mut blocks = Vec::with_capacity(units.len());
        for (idx, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("kotlin".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], idx as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("kotlin".into()),
                code: code.to_string(),
            }));
        }

        let timestamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        let metadata = Metadata {
            aliases: vec![],
            tags: vec![],
            created_at: timestamp,
            updated_at: timestamp,
            source_type: SourceType::Note,
            trust_level: TrustLevel::Primary,
            user_id_alias: None,
            user: Default::default(),
            repo: Some("kebab".into()),
            git_branch: Some("main".into()),
            git_commit: Some("0".repeat(40)),
            code_lang: Some("kotlin".into()),
        };

        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata,
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy shared by the tests, pinned to `VERSION_LABEL`.
    fn policy() -> ChunkPolicy {
        let chunker_version = ChunkerVersion(VERSION_LABEL.to_string());
        ChunkPolicy {
            chunker_version,
            respect_markdown_headings: false,
            overlap_tokens: 80,
            target_tokens: 500,
        }
    }

    /// The chunker must report the exact `code-kotlin-ast-v1` label.
    #[test]
    fn chunker_version_is_code_kotlin_ast_v1() {
        let reported = CodeKotlinAstV1Chunker.chunker_version();
        let expected = ChunkerVersion("code-kotlin-ast-v1".into());
        assert_eq!(reported, expected);
    }

    /// Two small units yield two chunks, each with exactly one
    /// `SourceSpan::Code`; the first keeps its symbol and line range.
    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units: [(&str, u32, u32, &str); 2] = [
            ("parse", 1, 3, "fun parse() {\n\t// x\n}"),
            (
                "Foo.double",
                5,
                7,
                "fun double(): Int {\n\t//\n\treturn 0\n}",
            ),
        ];
        let doc = code_doc(&units);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        chunks.iter().for_each(|chunk| {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-kotlin-ast-v1");
        });

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    /// A 502-line unit must be split into several parts whose symbols carry
    /// a `[part i/N]` suffix and whose chunk ids are pairwise distinct.
    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        // Header line, 500 body lines, closing brace.
        let mut code = String::from("fun big() {\n");
        for i in 0..500 {
            code.push_str(&format!("\tval x{i} = {i}\n"));
        }
        code.push('}');

        let doc = code_doc(&[("big", 1, 502, code.as_str())]);
        let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    /// A document whose only block is a paragraph must be rejected with an
    /// error that names the chunker.
    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let paragraph = Block::Paragraph(TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        });
        // Start from a valid code doc and swap its blocks for the paragraph.
        let doc = CanonicalDocument {
            blocks: vec![paragraph],
            ..code_doc(&[("parse", 1, 1, "fun parse() {}")])
        };

        let err = CodeKotlinAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeKotlinAstV1Chunker"));
    }

    /// Chunking the same document 1000 more times must reproduce the same
    /// chunk ids as the first run.
    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fun parse() {}\n")]);
        let chunk_ids = || -> Vec<String> {
            CodeKotlinAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            assert_eq!(chunk_ids(), base);
        }
    }

    /// For the same policy, this chunker's policy hash must equal the one
    /// produced by `MdHeadingV1Chunker`.
    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeKotlinAstV1Chunker.policy_hash(&p);
        let reference = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, reference);
    }
}
|
||||
378
crates/kebab-chunk/src/code_python_ast_v1.rs
Normal file
378
crates/kebab-chunk/src/code_python_ast_v1.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! `code-python-ast-v1` — maps a tree-sitter-derived Python AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Label reported by `chunker_version()`.
const VERSION_LABEL: &str = "code-python-ast-v1";

/// Line count above which a unit is split into parts.
const AST_CHUNK_MAX_LINES: u32 = 200;

/// Divisor applied to a chunk's byte length to obtain `token_estimate`.
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters kept from the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Field-less chunker for Python code documents.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodePythonAstV1Chunker;
|
||||
|
||||
impl Chunker for CodePythonAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodePythonAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodePythonAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-python-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a Python code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.py".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-python-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("python".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("python".into()),
                code: code.to_string(),
            }));
        }

        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: stamp,
                updated_at: stamp,
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("python".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Policy fixture shared by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_python_ast_v1() {
        let expected = ChunkerVersion("code-python-ast-v1".into());
        assert_eq!(CodePythonAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "def parse():\n pass\n # x"),
            ("Foo.double", 5, 7, "def double():\n #\n pass"),
        ]);
        let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-python-ast-v1");
        }

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let statements: Vec<String> = (0..500).map(|i| format!(" x{i} = {i}")).collect();
        let body = statements.join("\n");
        let code = format!("def big():\n{body}\n");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        // Collapsing ids into a set must not lose any entry.
        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "def parse(): pass")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodePythonAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]);
        let ids_of_run = || -> Vec<String> {
            CodePythonAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = ids_of_run();
        for _ in 0..1000 {
            let again = ids_of_run();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodePythonAstV1Chunker.policy_hash(&p);
        let theirs = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, theirs);
    }
}
|
||||
378
crates/kebab-chunk/src/code_rust_ast_v1.rs
Normal file
378
crates/kebab-chunk/src/code_rust_ast_v1.rs
Normal file
@@ -0,0 +1,378 @@
|
||||
//! `code-rust-ast-v1` — maps a tree-sitter-derived Rust AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label reported by `chunker_version` and used in the debug log.
const VERSION_LABEL: &str = "code-rust-ast-v1";

/// Divisor for the token estimate: byte length / this value, rounded up.
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Units whose line span exceeds this are split into numbered parts.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Stateless chunker for Rust code documents; all behavior lives in its
/// `Chunker` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodeRustAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeRustAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => anyhow::bail!(
|
||||
"CodeRustAstV1Chunker only handles code docs (got non-Code block)"
|
||||
),
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeRustAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-rust-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a Rust code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` tuple.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let wp = WorkspacePath("crates/x/src/a.rs".into());
        let aid = AssetId("a".repeat(64));
        let pv = ParserVersion("code-rust-v1".into());
        let doc_id = id_for_doc(&wp, &aid, &pv);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, ls, le, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start: ls,
                line_end: le,
                symbol: Some(sym.to_string()),
                lang: Some("rust".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("rust".into()),
                code: code.to_string(),
            }));
        }

        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        CanonicalDocument {
            doc_id,
            source_asset_id: aid,
            workspace_path: wp,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: stamp,
                updated_at: stamp,
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("rust".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version: pv,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Policy fixture shared by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_rust_ast_v1() {
        let expected = ChunkerVersion("code-rust-ast-v1".into());
        assert_eq!(CodeRustAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let doc = code_doc(&[
            ("parse", 1, 3, "pub fn parse() {}\n// x\n}"),
            ("Foo::double", 5, 7, "fn double() {}\n//\n}"),
        ]);
        let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert_eq!(chunk.heading_path, Vec::<String>::new());
            assert_eq!(chunk.chunker_version.0, "code-rust-ast-v1");
        }

        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let statements: Vec<String> = (0..500).map(|i| format!(" let x{i} = {i};")).collect();
        let body = statements.join("\n");
        let code = format!("pub fn big() {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);
        let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        // Collapsing ids into a set must not lose any entry.
        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "fn parse(){}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let err = CodeRustAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeRustAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "fn parse(){}\n}")]);
        let ids_of_run = || -> Vec<String> {
            CodeRustAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = ids_of_run();
        for _ in 0..1000 {
            let again = ids_of_run();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeRustAstV1Chunker.policy_hash(&p);
        let theirs = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, theirs);
    }
}
|
||||
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
170
crates/kebab-chunk/src/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,170 @@
|
||||
//! p10-3: Tier 3 paragraph + line-window fallback chunker.
|
||||
//!
|
||||
//! Splits code/text files on blank-line paragraph boundaries. Paragraphs
|
||||
//! with more than 80 lines are further split into 80-line windows with a
|
||||
//! 20-line overlap (stride 60) — the same oversize pattern used by Tier 1/2
|
||||
//! chunkers but without AST structure, hence no symbol.
|
||||
//!
|
||||
//! Per spec §9.3: all emitted chunks carry `symbol: None`.
|
||||
|
||||
use crate::tier2_shared::{build_chunk_no_symbol, policy_hash};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
/// Version label reported by `chunker_version` and passed to the chunk builder.
pub const VERSION_LABEL: &str = "code-text-paragraph-v1";

/// Maximum number of lines in one chunk; longer paragraphs are windowed.
const FALLBACK_LINES_PER_CHUNK: usize = 80;

/// Number of lines shared by two consecutive windows of an oversize
/// paragraph. The window stride is therefore
/// `FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP` = 60 lines.
const FALLBACK_LINES_OVERLAP: usize = 20;
|
||||
|
||||
/// Stateless paragraph / line-window chunker; all behavior lives in its
/// `Chunker` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodeTextParagraphV1Chunker;
|
||||
|
||||
impl Chunker for CodeTextParagraphV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full source text.
|
||||
let (text, lang_str) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let mut chunks = Vec::new();
|
||||
for para in split_paragraphs(text) {
|
||||
push_paragraph(&mut chunks, doc, policy, ¶, lang_str)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"code-text-paragraph-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
/// One blank-line-delimited run of consecutive non-blank source lines.
struct Paragraph {
    /// The paragraph's lines joined with `\n`, without a trailing newline.
    text: String,
    /// Line number (1-based) of the paragraph's first line in the source text.
    line_start: u32,
    /// Line number (1-based) of the paragraph's last line in the source text.
    line_end: u32,
}
|
||||
|
||||
/// Split `text` into `Paragraph`s separated by blank (all-whitespace) lines.
|
||||
///
|
||||
/// Blank lines are treated as boundaries and are NOT included in any
|
||||
/// paragraph's line range. Paragraphs that would consist entirely of blank
|
||||
/// lines are skipped.
|
||||
fn split_paragraphs(text: &str) -> Vec<Paragraph> {
|
||||
let mut paragraphs = Vec::new();
|
||||
let mut current: Vec<&str> = Vec::new();
|
||||
let mut current_start: Option<u32> = None;
|
||||
|
||||
for (idx, line) in text.lines().enumerate() {
|
||||
let line_no = (idx + 1) as u32;
|
||||
let is_blank = line.trim().is_empty();
|
||||
if is_blank {
|
||||
if let Some(start) = current_start.take() {
|
||||
let end = start + current.len() as u32 - 1;
|
||||
paragraphs.push(Paragraph {
|
||||
text: current.join("\n"),
|
||||
line_start: start,
|
||||
line_end: end,
|
||||
});
|
||||
current.clear();
|
||||
}
|
||||
} else {
|
||||
if current_start.is_none() {
|
||||
current_start = Some(line_no);
|
||||
}
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
// Flush any trailing paragraph not terminated by a blank line.
|
||||
if let Some(start) = current_start {
|
||||
let end = start + current.len() as u32 - 1;
|
||||
paragraphs.push(Paragraph {
|
||||
text: current.join("\n"),
|
||||
line_start: start,
|
||||
line_end: end,
|
||||
});
|
||||
}
|
||||
paragraphs
|
||||
}
|
||||
|
||||
/// Emit one or more chunks for a single paragraph.
|
||||
///
|
||||
/// Paragraphs with ≤ `FALLBACK_LINES_PER_CHUNK` lines become a single chunk.
|
||||
/// Larger paragraphs are split into overlapping windows of
|
||||
/// `FALLBACK_LINES_PER_CHUNK` lines with stride `FALLBACK_LINES_PER_CHUNK -
|
||||
/// FALLBACK_LINES_OVERLAP`. The last window may be shorter. Window starts
|
||||
/// are passed as `split_key` so `id_for_chunk` can produce distinct ids
|
||||
/// across windows.
|
||||
fn push_paragraph(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
para: &Paragraph,
|
||||
lang: &str,
|
||||
) -> Result<()> {
|
||||
let n_lines = (para.line_end - para.line_start + 1) as usize;
|
||||
|
||||
if n_lines <= FALLBACK_LINES_PER_CHUNK {
|
||||
// Use line_start as split_key so each paragraph gets a distinct
|
||||
// chunk_id even when block_ids is empty (no symbol, no AST structure).
|
||||
// Without this, all short paragraphs from the same doc share the same
|
||||
// base_policy_hash and therefore the same id_for_chunk result.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
¶.text,
|
||||
para.line_start,
|
||||
para.line_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(para.line_start),
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Oversize: line-window split with overlap.
|
||||
let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP;
|
||||
let lines: Vec<&str> = para.text.lines().collect();
|
||||
let mut i = 0usize;
|
||||
loop {
|
||||
let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len());
|
||||
let window_text = lines[i..end].join("\n");
|
||||
let window_start = para.line_start + i as u32;
|
||||
let window_end = para.line_start + (end as u32) - 1;
|
||||
// Use window_start as split_key so chunk_ids are unique across windows.
|
||||
out.push(build_chunk_no_symbol(
|
||||
doc,
|
||||
policy,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
Some(window_start),
|
||||
));
|
||||
if end == lines.len() {
|
||||
break;
|
||||
}
|
||||
i += stride;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
383
crates/kebab-chunk/src/code_ts_ast_v1.rs
Normal file
383
crates/kebab-chunk/src/code_ts_ast_v1.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
//! `code-ts-ast-v1` — maps a tree-sitter-derived TypeScript AST
|
||||
//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with
|
||||
//! `SourceSpan::Code`) to chunks 1:1. A unit longer than
|
||||
//! `AST_CHUNK_MAX_LINES` is split into `<symbol> [part i/N]` sub-chunks
|
||||
//! at blank-line paragraph boundaries (design §9.1 oversize fallback).
|
||||
//!
|
||||
//! tree-sitter is intentionally NOT a dependency here: AST work is
|
||||
//! parser-side (`kebab-parse-code`, design §6.3). This chunker only
|
||||
//! consumes the `CanonicalDocument`.
|
||||
//!
|
||||
//! `AST_CHUNK_MAX_LINES` is a constant matching
|
||||
//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium
|
||||
//! config threading needs a chunker registry (P+); same deviation
|
||||
//! pattern as `pdf-page-v1`'s pinned `chunker_version`
|
||||
//! (`tasks/HOTFIXES.md`).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label reported by `chunker_version` and used in the debug log.
const VERSION_LABEL: &str = "code-ts-ast-v1";

/// Divisor for the token estimate: byte length / this value, rounded up.
const BYTES_PER_TOKEN: usize = 3;

/// Number of leading hex characters of the BLAKE3 digest kept as the policy hash.
const POLICY_HASH_HEX_LEN: usize = 16;

/// Units whose line span exceeds this are split into numbered parts.
const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
|
||||
/// Stateless chunker for TypeScript code documents; all behavior lives in
/// its `Chunker` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct CodeTsAstV1Chunker;
|
||||
|
||||
impl Chunker for CodeTsAstV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
for b in &doc.blocks {
|
||||
let c = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => {
|
||||
anyhow::bail!("CodeTsAstV1Chunker only handles code docs (got non-Code block)")
|
||||
}
|
||||
};
|
||||
if !matches!(c.common.source_span, SourceSpan::Code { .. }) {
|
||||
anyhow::bail!(
|
||||
"CodeTsAstV1Chunker only handles code docs (got non-Code source_span)"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
|
||||
for b in &doc.blocks {
|
||||
let cb = match b {
|
||||
Block::Code(c) => c,
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let (ls, le, symbol, lang) = match &cb.common.source_span {
|
||||
SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
} => (*line_start, *line_end, symbol.clone(), lang.clone()),
|
||||
_ => unreachable!("validated above"),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![cb.common.block_id.clone()];
|
||||
let span_lines = le.saturating_sub(ls) + 1;
|
||||
|
||||
if span_lines <= AST_CHUNK_MAX_LINES {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: ls,
|
||||
line_end: le,
|
||||
symbol: symbol.clone(),
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
None,
|
||||
span,
|
||||
cb.code.clone(),
|
||||
));
|
||||
} else {
|
||||
let parts = split_oversize(&cb.code);
|
||||
let n = parts.len();
|
||||
for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() {
|
||||
let part_ls = ls + off_start;
|
||||
let part_le = ls + off_end;
|
||||
let part_sym = symbol.as_ref().map(|s| format!("{s} [part {}/{n}]", i + 1));
|
||||
let span = SourceSpan::Code {
|
||||
line_start: part_ls,
|
||||
line_end: part_le,
|
||||
symbol: part_sym,
|
||||
lang: lang.clone(),
|
||||
};
|
||||
out.push(make_chunk(
|
||||
doc,
|
||||
&chunker_version,
|
||||
&block_ids,
|
||||
&base_policy_hash,
|
||||
Some(part_ls),
|
||||
span,
|
||||
text,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = out.len(),
|
||||
"code-ts-ast-v1 chunked",
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn make_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
base_policy_hash: &str,
|
||||
split_key: Option<u32>,
|
||||
span: SourceSpan,
|
||||
text: String,
|
||||
) -> Chunk {
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash);
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids: block_ids.to_vec(),
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Split an oversize unit at blank-line paragraph boundaries, greedily
|
||||
/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate.
|
||||
/// Returns `(line_offset_start, line_offset_end, text)` where offsets are
|
||||
/// 0-based within the unit (caller adds the unit's absolute `line_start`).
|
||||
fn split_oversize(code: &str) -> Vec<(u32, u32, String)> {
|
||||
let lines: Vec<&str> = code.split('\n').collect();
|
||||
let total = lines.len() as u32;
|
||||
let mut out: Vec<(u32, u32, String)> = Vec::new();
|
||||
let mut start: u32 = 0;
|
||||
while start < total {
|
||||
let mut end = (start + AST_CHUNK_MAX_LINES).min(total);
|
||||
let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5);
|
||||
if end < total {
|
||||
if let Some(b) = (floor.min(end)..end)
|
||||
.rev()
|
||||
.find(|&i| lines[i as usize].trim().is_empty())
|
||||
{
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
let text = lines[start as usize..end as usize].join("\n");
|
||||
out.push((start, end.saturating_sub(1), text));
|
||||
start = end;
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push((0, total.saturating_sub(1), code.to_string()));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use kebab_core::{
        AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
        CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
        WorkspacePath, id_for_block, id_for_doc,
    };
    use time::OffsetDateTime;

    /// Build a TypeScript code document with one `Block::Code` per
    /// `(symbol, line_start, line_end, code)` unit.
    fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument {
        let workspace_path = WorkspacePath("crates/x/src/a.ts".into());
        let asset_id = AssetId("a".repeat(64));
        let parser_version = ParserVersion("code-ts-v1".into());
        let doc_id = id_for_doc(&workspace_path, &asset_id, &parser_version);

        let mut blocks = Vec::with_capacity(units.len());
        for (ordinal, &(sym, line_start, line_end, code)) in units.iter().enumerate() {
            let span = SourceSpan::Code {
                line_start,
                line_end,
                symbol: Some(sym.to_string()),
                lang: Some("typescript".into()),
            };
            let block_id = id_for_block(&doc_id, "code", &[], ordinal as u32, &span);
            blocks.push(Block::Code(CodeBlock {
                common: CommonBlock {
                    block_id,
                    heading_path: vec![],
                    source_span: span,
                },
                lang: Some("typescript".into()),
                code: code.to_string(),
            }));
        }

        // Fixed timestamp shared by `created_at` and `updated_at`.
        let stamp = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
        CanonicalDocument {
            doc_id,
            source_asset_id: asset_id,
            workspace_path,
            title: "a".into(),
            lang: Lang("und".into()),
            blocks,
            metadata: Metadata {
                aliases: vec![],
                tags: vec![],
                created_at: stamp,
                updated_at: stamp,
                source_type: SourceType::Note,
                trust_level: TrustLevel::Primary,
                user_id_alias: None,
                user: Default::default(),
                repo: Some("kebab".into()),
                git_branch: Some("main".into()),
                git_commit: Some("0".repeat(40)),
                code_lang: Some("typescript".into()),
            },
            provenance: Provenance { events: vec![] },
            parser_version,
            schema_version: 1,
            doc_version: 1,
            last_chunker_version: None,
            last_embedding_version: None,
        }
    }

    /// Chunk policy used by every test in this module.
    fn policy() -> ChunkPolicy {
        ChunkPolicy {
            target_tokens: 500,
            overlap_tokens: 80,
            respect_markdown_headings: false,
            chunker_version: ChunkerVersion(VERSION_LABEL.into()),
        }
    }

    #[test]
    fn chunker_version_is_code_ts_ast_v1() {
        let expected = ChunkerVersion("code-ts-ast-v1".into());
        assert_eq!(CodeTsAstV1Chunker.chunker_version(), expected);
    }

    #[test]
    fn one_chunk_per_unit_preserves_code_span() {
        let units = [
            ("parse", 1, 3, "function parse(): void {\n  // x\n}"),
            (
                "Foo.double",
                5,
                7,
                "function double(): number {\n  //\n  return 0;\n}",
            ),
        ];
        let doc = code_doc(&units);
        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert_eq!(chunks.len(), 2);

        for chunk in &chunks {
            assert_eq!(chunk.source_spans.len(), 1);
            assert!(matches!(chunk.source_spans[0], SourceSpan::Code { .. }));
            assert!(chunk.heading_path.is_empty());
            assert_eq!(chunk.chunker_version.0, "code-ts-ast-v1");
        }

        // The first unit keeps its symbol and line range untouched.
        let SourceSpan::Code {
            symbol,
            line_start,
            line_end,
            ..
        } = &chunks[0].source_spans[0]
        else {
            unreachable!()
        };
        assert_eq!(symbol.as_deref(), Some("parse"));
        assert_eq!((*line_start, *line_end), (1, 3));
    }

    #[test]
    fn oversize_unit_splits_into_parts_with_unique_ids() {
        let statements: Vec<String> = (0..500).map(|i| format!("  const x{i} = {i};")).collect();
        let body = statements.join("\n");
        let code = format!("function big(): void {{\n{body}\n}}");
        let doc = code_doc(&[("big", 1, 502, &code)]);

        let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap();
        assert!(
            chunks.len() >= 2,
            "oversize unit must split, got {}",
            chunks.len()
        );

        for chunk in &chunks {
            let SourceSpan::Code { symbol, .. } = &chunk.source_spans[0] else {
                unreachable!()
            };
            assert!(
                symbol.as_deref().unwrap().starts_with("big [part "),
                "part-numbered symbol, got {symbol:?}"
            );
        }

        let distinct: std::collections::HashSet<&str> =
            chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
        assert_eq!(
            distinct.len(),
            chunks.len(),
            "chunk_ids unique across split parts"
        );
    }

    #[test]
    fn non_code_doc_errors() {
        use kebab_core::TextBlock;

        let mut doc = code_doc(&[("parse", 1, 1, "function parse(): void {}")]);
        let paragraph = TextBlock {
            common: CommonBlock {
                block_id: kebab_core::BlockId("b".into()),
                heading_path: vec![],
                source_span: SourceSpan::Line { start: 1, end: 1 },
            },
            text: "x".into(),
            inlines: vec![],
        };
        doc.blocks = vec![Block::Paragraph(paragraph)];

        let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err();
        assert!(err.to_string().contains("CodeTsAstV1Chunker"));
    }

    #[test]
    fn deterministic_chunk_ids_1000() {
        let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]);
        let chunk_ids = || -> Vec<String> {
            CodeTsAstV1Chunker
                .chunk(&doc, &policy())
                .unwrap()
                .into_iter()
                .map(|c| c.chunk_id.0)
                .collect()
        };

        let base = chunk_ids();
        for _ in 0..1000 {
            let again = chunk_ids();
            assert_eq!(again, base);
        }
    }

    #[test]
    fn policy_hash_matches_md_heading_v1() {
        let p = policy();
        let ours = CodeTsAstV1Chunker.policy_hash(&p);
        let markdown = crate::MdHeadingV1Chunker.policy_hash(&p);
        assert_eq!(ours, markdown);
    }
}
|
||||
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
58
crates/kebab-chunk/src/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
//! p10-2: dockerfile whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire Dockerfile content and emits a single Chunk with symbol
|
||||
//! "<dockerfile>", code_lang "dockerfile", line range 1..EOF.
|
||||
//! Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
/// Version label reported by `DockerfileFileV1Chunker::chunker_version` and
/// passed to `push_chunks_with_oversize` for every chunk this chunker emits.
pub const VERSION_LABEL: &str = "dockerfile-file-v1";
|
||||
|
||||
/// Stateless whole-file chunker for Dockerfiles: chunks the text of the
/// document's first `Block::Code` under the symbol `"<dockerfile>"`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct DockerfileFileV1Chunker;
|
||||
|
||||
impl Chunker for DockerfileFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full Dockerfile text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<dockerfile>",
|
||||
"dockerfile",
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"dockerfile-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
162
crates/kebab-chunk/src/k8s_manifest_resource_v1.rs
Normal file
@@ -0,0 +1,162 @@
|
||||
//! p10-2: k8s manifest resource-aware chunker.
|
||||
//!
|
||||
//! Splits a multi-document YAML file on `---` separator lines (a bare `---`,
//! or `---` followed by a space or tab and trailing content), recognises
|
||||
//! documents that have both `apiVersion` and `kind` string fields as k8s
|
||||
//! resources, and emits one `Chunk` per resource (with oversize >200-line
|
||||
//! fallback). Non-k8s documents are skipped; invalid YAML yields 0 chunks
|
||||
//! for the entire file.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
/// Version label reported by `K8sManifestResourceV1Chunker::chunker_version`
/// and passed to `push_chunks_with_oversize` for every chunk this chunker emits.
pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1";
|
||||
|
||||
/// Stateless chunker that chunks each YAML document carrying non-empty
/// string `apiVersion` and `kind` fields as one Kubernetes resource.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct K8sManifestResourceV1Chunker;
|
||||
|
||||
impl Chunker for K8sManifestResourceV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full YAML text.
|
||||
let text = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => cb.code.as_str(),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let slices = split_yaml_documents(text);
|
||||
let mut chunks: Vec<Chunk> = Vec::new();
|
||||
|
||||
for slice in slices {
|
||||
// Invalid YAML in any document → return 0 chunks for the file.
|
||||
let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) {
|
||||
Ok(v) => v,
|
||||
Err(_) => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let Some(mapping) = value.as_mapping() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let api = mapping
|
||||
.get("apiVersion")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or("");
|
||||
|
||||
// Skip non-k8s documents.
|
||||
if api.is_empty() || kind.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = mapping.get("metadata").and_then(|v| v.as_mapping());
|
||||
let name = metadata
|
||||
.and_then(|m| m.get("name"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("<unnamed>");
|
||||
let namespace = metadata
|
||||
.and_then(|m| m.get("namespace"))
|
||||
.and_then(|v| v.as_str());
|
||||
|
||||
let symbol = match namespace {
|
||||
Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"),
|
||||
_ => format!("{kind}/{name}"),
|
||||
};
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
slice.text,
|
||||
slice.line_start,
|
||||
slice.line_end,
|
||||
&symbol,
|
||||
"yaml",
|
||||
VERSION_LABEL,
|
||||
Some(slice.line_start),
|
||||
)?;
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"k8s-manifest-resource-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
|
||||
/// One YAML document cut out of a multi-document file.
struct YamlSlice<'a> {
    /// Raw text of the document, borrowed from the input; separator lines
    /// are not included.
    text: &'a str,
    /// 1-based line number of the document's first line.
    line_start: u32,
    /// 1-based line number of the document's last line (inclusive).
    line_end: u32,
}

/// Split raw YAML text into per-document slices on `---` separator lines.
///
/// A separator is a line that, after trimming trailing whitespace, is
/// exactly `---` or starts with `---` followed by a space or tab. Slices
/// that contain only whitespace are dropped. Line numbers are 1-indexed.
///
/// Byte offsets come from a table built in a single pass over `text`, so
/// the cost is linear in the input size regardless of how many documents
/// the file contains.
fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> {
    let line_starts = line_start_offsets(text);
    // A line index past the last line maps to the end of the text.
    let offset_of_line =
        |line_idx: usize| line_starts.get(line_idx).copied().unwrap_or(text.len());

    // 0-based indices of the separator lines, followed by a sentinel equal
    // to the line count so the last document is always terminated.
    let mut line_count = 0usize;
    let mut boundaries: Vec<usize> = Vec::new();
    for (idx, line) in text.lines().enumerate() {
        line_count = idx + 1;
        if is_document_separator(line) {
            boundaries.push(idx);
        }
    }
    boundaries.push(line_count);

    let mut slices: Vec<YamlSlice<'_>> = Vec::new();
    let mut doc_start = 0usize; // 0-based index of the current document's first line
    for boundary in boundaries {
        if boundary > doc_start {
            let slice_text = &text[offset_of_line(doc_start)..offset_of_line(boundary)];
            if !slice_text.trim().is_empty() {
                slices.push(YamlSlice {
                    text: slice_text,
                    line_start: (doc_start + 1) as u32,
                    // The separator's 0-based index equals the 1-based
                    // number of the line just before it.
                    line_end: boundary as u32,
                });
            }
        }
        doc_start = boundary + 1;
    }

    slices
}

/// Whether `line` is a YAML document separator: `---` alone, or `---`
/// followed by a space or tab.
fn is_document_separator(line: &str) -> bool {
    let trimmed = line.trim_end();
    trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t")
}

/// Byte offset at which each line of `text` starts: offset 0, then the
/// byte following every `'\n'`.
fn line_start_offsets(text: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(text.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
        .collect()
}
|
||||
@@ -15,8 +15,107 @@
|
||||
//! embedder, the retriever, the LLM, the RAG layer, or the UI layers.
|
||||
//! It consumes `CanonicalDocument` purely through `kb-core` types.
|
||||
|
||||
mod code_c_ast_v1;
|
||||
mod code_cpp_ast_v1;
|
||||
mod code_go_ast_v1;
|
||||
mod code_java_ast_v1;
|
||||
mod code_js_ast_v1;
|
||||
mod code_kotlin_ast_v1;
|
||||
mod code_python_ast_v1;
|
||||
mod code_rust_ast_v1;
|
||||
pub mod code_text_paragraph_v1;
|
||||
mod code_ts_ast_v1;
|
||||
pub mod dockerfile_file_v1;
|
||||
pub mod k8s_manifest_resource_v1;
|
||||
pub mod manifest_file_v1;
|
||||
mod md_heading_v1;
|
||||
mod pdf_page_v1;
|
||||
mod tier2_shared;
|
||||
|
||||
pub use code_c_ast_v1::CodeCAstV1Chunker;
|
||||
pub use code_cpp_ast_v1::CodeCppAstV1Chunker;
|
||||
pub use code_go_ast_v1::CodeGoAstV1Chunker;
|
||||
pub use code_java_ast_v1::CodeJavaAstV1Chunker;
|
||||
pub use code_js_ast_v1::CodeJsAstV1Chunker;
|
||||
pub use code_kotlin_ast_v1::CodeKotlinAstV1Chunker;
|
||||
pub use code_python_ast_v1::CodePythonAstV1Chunker;
|
||||
pub use code_rust_ast_v1::CodeRustAstV1Chunker;
|
||||
pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker;
|
||||
pub use code_ts_ast_v1::CodeTsAstV1Chunker;
|
||||
pub use dockerfile_file_v1::DockerfileFileV1Chunker;
|
||||
pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker;
|
||||
pub use manifest_file_v1::ManifestFileV1Chunker;
|
||||
pub use md_heading_v1::MdHeadingV1Chunker;
|
||||
pub use pdf_page_v1::PdfPageV1Chunker;
|
||||
|
||||
// ── Korean morphological tokenizer ───────────────────────────────────────────
|
||||
|
||||
use lindera::dictionary::{DictionaryKind, load_embedded_dictionary};
|
||||
use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
use lindera::tokenizer::Tokenizer;
|
||||
|
||||
/// Process-wide Korean tokenizer, built on first use by
/// `tokenize_korean_morphological`. Holds `None` if the embedded ko-dic
/// dictionary failed to load.
static KOREAN_TOKENIZER: std::sync::OnceLock<Option<Tokenizer>> = std::sync::OnceLock::new();
|
||||
|
||||
/// Returns `true` when `c` is a Hangul syllable or jamo. Used to select the
/// tokens that receive the n-gram supplement.
fn is_hangul(c: char) -> bool {
    let code = u32::from(c);
    (0xAC00..=0xD7A3).contains(&code) // Hangul syllables (precomposed)
        || (0x1100..=0x11FF).contains(&code) // Hangul jamo
        || (0x3130..=0x318F).contains(&code) // Hangul compatibility jamo
}
|
||||
|
||||
/// 한국어 chunk text 를 lindera ko-dic 으로 형태소 분해해 공백 join 한 결과를 반환.
|
||||
/// chunker 들이 `Chunk.tokenized_korean_text` pre-fill 에 사용.
|
||||
/// 분석 실패 시 None — 호출자는 NULL fallback 처리.
|
||||
/// Tokenizer 는 OnceLock 으로 1회 초기화; dict load 실패 시 영구 None.
|
||||
///
|
||||
/// v0.21.0 — N-gram supplement (Option β, post-v0.20.1 enhancement).
|
||||
/// ko-dic 가 compound noun (`한국정부`, `서울특별시` 등) 을 단일 token 으로
|
||||
/// 저장하는 정책 의 한계 해소 — morpheme 길이 ≥ 3 인 한글 token 에 대해
|
||||
/// 2-char sliding window n-gram 도 추가 emit. `'한국정부'` morpheme →
|
||||
/// `[한국정부, 한국, 국정, 정부]` 의 4 token 으로 expand. 사용자 의 2-char
|
||||
/// query (`'한국'`) 가 compound chunk 에서도 hit. 영어/숫자 token 은 영향
|
||||
/// 없음 (is_hangul filter). DB size + ingest latency 의 trade-off 는
|
||||
/// HOTFIXES 2026-05-28 의 "N-gram supplement (Option β)" 보강 entry.
|
||||
pub fn tokenize_korean_morphological(text: &str) -> Option<String> {
|
||||
if text.trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
let tokenizer = KOREAN_TOKENIZER.get_or_init(|| {
|
||||
let dict = match load_embedded_dictionary(DictionaryKind::KoDic) {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
tracing::warn!(target: "kebab-chunk", "tokenize_korean_morphological: dict load failed: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let segmenter = Segmenter::new(Mode::Normal, dict, None);
|
||||
Some(Tokenizer::new(segmenter))
|
||||
});
|
||||
let tokenizer = tokenizer.as_ref()?;
|
||||
let tokens = tokenizer.tokenize(text).ok()?;
|
||||
|
||||
let mut out_tokens: Vec<String> = Vec::with_capacity(tokens.len() * 2);
|
||||
for tok in tokens.iter() {
|
||||
let surface = tok.surface.as_ref();
|
||||
out_tokens.push(surface.to_string());
|
||||
|
||||
// N-gram supplement: 한글 morpheme 의 2-char sliding window.
|
||||
let chars: Vec<char> = surface.chars().collect();
|
||||
if chars.len() >= 3 && chars.iter().all(|c| is_hangul(*c)) {
|
||||
for window in chars.windows(2) {
|
||||
out_tokens.push(window.iter().collect());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let joined = out_tokens.join(" ");
|
||||
if joined.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(joined)
|
||||
}
|
||||
}
|
||||
|
||||
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
59
crates/kebab-chunk/src/manifest_file_v1.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
//! p10-2: manifest whole-file chunker (Tier 2).
|
||||
//!
|
||||
//! Reads entire manifest file (Cargo.toml / package.json / pom.xml / go.mod /
|
||||
//! build.gradle / pyproject.toml / tsconfig.json) and emits a single Chunk
|
||||
//! with symbol "<manifest>", code_lang read from Block::Code.lang, line range
|
||||
//! 1..EOF. Oversize >200 lines splits into line-windows sharing the symbol via
|
||||
//! tier2_shared::push_chunks_with_oversize.
|
||||
|
||||
use crate::tier2_shared::{policy_hash, push_chunks_with_oversize};
|
||||
use anyhow::Result;
|
||||
use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion};
|
||||
|
||||
/// Version label reported by `ManifestFileV1Chunker::chunker_version` and
/// passed to `push_chunks_with_oversize` for every chunk this chunker emits.
pub const VERSION_LABEL: &str = "manifest-file-v1";
|
||||
|
||||
/// Stateless whole-file chunker for manifest files: chunks the text of the
/// document's first `Block::Code` under the symbol `"<manifest>"`.
#[derive(Clone, Copy, Debug, Default)]
|
||||
pub struct ManifestFileV1Chunker;
|
||||
|
||||
impl Chunker for ManifestFileV1Chunker {
|
||||
fn chunker_version(&self) -> ChunkerVersion {
|
||||
ChunkerVersion(VERSION_LABEL.to_string())
|
||||
}
|
||||
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String {
|
||||
policy_hash(policy)
|
||||
}
|
||||
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> {
|
||||
// Expect a single Block::Code carrying the full manifest text.
|
||||
let (text, lang) = match doc.blocks.first() {
|
||||
Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")),
|
||||
_ => return Ok(vec![]),
|
||||
};
|
||||
|
||||
let total_lines = text.lines().count().max(1) as u32;
|
||||
let mut chunks = Vec::new();
|
||||
|
||||
push_chunks_with_oversize(
|
||||
&mut chunks,
|
||||
doc,
|
||||
policy,
|
||||
text,
|
||||
1,
|
||||
total_lines,
|
||||
"<manifest>",
|
||||
lang,
|
||||
VERSION_LABEL,
|
||||
None,
|
||||
)?;
|
||||
|
||||
tracing::debug!(
|
||||
target: "kebab-chunk",
|
||||
doc_id = %doc.doc_id,
|
||||
chunks = chunks.len(),
|
||||
"manifest-file-v1 chunked",
|
||||
);
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
//! `md-heading-v1` — heading-aware Markdown chunker.
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker,
|
||||
ChunkerVersion, DocumentId, SourceSpan, id_for_chunk,
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
/// Version label emitted by [`MdHeadingV1Chunker`]. Bumping this label
|
||||
@@ -99,11 +99,7 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
let policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let mut out: Vec<Chunk> = Vec::new();
|
||||
@@ -152,22 +148,12 @@ impl Chunker for MdHeadingV1Chunker {
|
||||
// `collect_overlap_seed` keeps seed ≤ target/2, so
|
||||
// a flush here never produces a chunk smaller than
|
||||
// the seed budget.
|
||||
let would_exceed = acc.text_tokens + next_tokens
|
||||
> policy.target_tokens
|
||||
let would_exceed = acc.text_tokens + next_tokens > policy.target_tokens
|
||||
&& acc.has_non_heading_content();
|
||||
if would_exceed {
|
||||
let overlap_seed = collect_overlap_seed(
|
||||
&acc,
|
||||
policy.overlap_tokens,
|
||||
policy.target_tokens,
|
||||
);
|
||||
flush(
|
||||
&mut acc,
|
||||
doc,
|
||||
&chunker_version,
|
||||
&policy_hash,
|
||||
&mut out,
|
||||
);
|
||||
let overlap_seed =
|
||||
collect_overlap_seed(&acc, policy.overlap_tokens, policy.target_tokens);
|
||||
flush(&mut acc, doc, &chunker_version, &policy_hash, &mut out);
|
||||
// Seed next accumulator with the prior chunk's
|
||||
// tail blocks (paragraph-level overlap). The
|
||||
// heading is *not* re-included here — it lives
|
||||
@@ -292,10 +278,11 @@ fn build_chunk(
|
||||
) -> Chunk {
|
||||
debug_assert!(!blocks.is_empty(), "build_chunk requires ≥1 block");
|
||||
|
||||
let block_ids: Vec<BlockId> =
|
||||
blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> =
|
||||
blocks.iter().map(|b| common(b).source_span.clone()).collect();
|
||||
let block_ids: Vec<BlockId> = blocks.iter().map(|b| common(b).block_id.clone()).collect();
|
||||
let source_spans: Vec<SourceSpan> = blocks
|
||||
.iter()
|
||||
.map(|b| common(b).source_span.clone())
|
||||
.collect();
|
||||
|
||||
// heading_path: pick the first non-Heading block's heading_path
|
||||
// (which already includes every parent heading per kb-normalize).
|
||||
@@ -339,17 +326,13 @@ fn build_chunk(
|
||||
text.len().div_ceil(BYTES_PER_TOKEN)
|
||||
};
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&doc.doc_id,
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
policy_hash,
|
||||
);
|
||||
let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, &block_ids, policy_hash);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&text),
|
||||
text,
|
||||
heading_path,
|
||||
source_spans,
|
||||
@@ -387,9 +370,7 @@ fn render_block_text(b: &Block) -> String {
|
||||
// alt keeps lexical search hits on filenames working even when
|
||||
// P6-1's filename auto-fill is bypassed.
|
||||
Block::ImageRef(i) => {
|
||||
let alt = if !i.alt.is_empty() {
|
||||
i.alt.clone()
|
||||
} else {
|
||||
let alt = if i.alt.is_empty() {
|
||||
// P6-1 falls back to filename so this branch is
|
||||
// defensive — keep it lest a future test fixture or
|
||||
// synthetic block path skip the auto-fill.
|
||||
@@ -399,17 +380,11 @@ fn render_block_text(b: &Block) -> String {
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or("[image]")
|
||||
.to_string()
|
||||
} else {
|
||||
i.alt.clone()
|
||||
};
|
||||
let ocr = i
|
||||
.ocr
|
||||
.as_ref()
|
||||
.map(|o| o.joined.as_str())
|
||||
.unwrap_or("");
|
||||
let cap = i
|
||||
.caption
|
||||
.as_ref()
|
||||
.map(|c| c.text.as_str())
|
||||
.unwrap_or("");
|
||||
let ocr = i.ocr.as_ref().map_or("", |o| o.joined.as_str());
|
||||
let cap = i.caption.as_ref().map_or("", |c| c.text.as_str());
|
||||
[alt.as_str(), ocr, cap]
|
||||
.iter()
|
||||
.filter(|s| !s.is_empty())
|
||||
@@ -449,9 +424,8 @@ fn common(b: &Block) -> &kebab_core::CommonBlock {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kebab_core::{
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang,
|
||||
Metadata, Provenance, SourceType, TableBlock, TextBlock, TrustLevel,
|
||||
WorkspacePath, id_for_block,
|
||||
AssetId, CodeBlock, CommonBlock, HeadingBlock, ImageRefBlock, Lang, Metadata, Provenance,
|
||||
SourceType, TableBlock, TextBlock, TrustLevel, WorkspacePath, id_for_block,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -472,6 +446,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: kebab_core::ParserVersion("test-parser-0".into()),
|
||||
@@ -490,12 +468,7 @@ mod tests {
|
||||
SourceSpan::Line { start, end }
|
||||
}
|
||||
|
||||
fn common_for(
|
||||
kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> CommonBlock {
|
||||
fn common_for(kind: &str, heading_path: &[String], ordinal: u32, s: SourceSpan) -> CommonBlock {
|
||||
CommonBlock {
|
||||
block_id: id_for_block(&doc_id(), kind, heading_path, ordinal, &s),
|
||||
heading_path: heading_path.to_vec(),
|
||||
@@ -530,12 +503,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn paragraph(
|
||||
text: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn paragraph(text: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Paragraph(TextBlock {
|
||||
common: common_for("paragraph", &hp, ordinal, span(line, line)),
|
||||
@@ -544,12 +512,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn code_block(
|
||||
code: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
s: SourceSpan,
|
||||
) -> Block {
|
||||
fn code_block(code: &str, heading_path: &[&str], ordinal: u32, s: SourceSpan) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::Code(CodeBlock {
|
||||
common: common_for("code", &hp, ordinal, s),
|
||||
@@ -576,12 +539,7 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn image_ref(
|
||||
alt: &str,
|
||||
heading_path: &[&str],
|
||||
ordinal: u32,
|
||||
line: u32,
|
||||
) -> Block {
|
||||
fn image_ref(alt: &str, heading_path: &[&str], ordinal: u32, line: u32) -> Block {
|
||||
let hp: Vec<String> = heading_path.iter().map(|s| (*s).into()).collect();
|
||||
Block::ImageRef(ImageRefBlock {
|
||||
common: common_for("imageref", &hp, ordinal, span(line, line)),
|
||||
|
||||
@@ -53,18 +53,21 @@
|
||||
//! one chunk per atomic block. PdfPageV1 cannot.
|
||||
//!
|
||||
//! Workaround that doesn't change the §4.2 recipe: feed a per-chunk
|
||||
//! variant `format!("{base_policy_hash}#c{char_start}")` into the
|
||||
//! recipe's `policy_hash` slot (so distinct chunks distinguish via
|
||||
//! different policy_hash inputs), while storing the unmodified
|
||||
//! `base_policy_hash` in `Chunk.policy_hash` so the field still answers
|
||||
//! "what policy was active". Logged in `tasks/HOTFIXES.md`.
|
||||
//! variant `format!("{base_policy_hash}#c{segment_start}")` into the
|
||||
//! recipe's `policy_hash` slot. `segment_start` is the pre-overlap
|
||||
//! segment boundary, strictly increasing across the returned chunks
|
||||
//! even when the overlap walk collapses `actual_start` to a previous
|
||||
//! chunk's `prev_min`. Unmodified `base_policy_hash` is stored in
|
||||
//! `Chunk.policy_hash` so the field still answers "what policy was
|
||||
//! active". v1.1 second-iteration patch — logged in
|
||||
//! `tasks/HOTFIXES.md` (2026-05-27).
|
||||
|
||||
use kebab_core::{
|
||||
Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId,
|
||||
SourceSpan, id_for_chunk,
|
||||
};
|
||||
|
||||
const VERSION_LABEL: &str = "pdf-page-v1";
|
||||
const VERSION_LABEL: &str = "pdf-page-v1.1";
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
@@ -89,11 +92,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>> {
|
||||
fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> anyhow::Result<Vec<Chunk>> {
|
||||
// Validate up front — every block must be a Paragraph carrying
|
||||
// SourceSpan::Page. A mixed document signals a routing bug in
|
||||
// the caller (e.g. running this chunker on Markdown) and is
|
||||
@@ -106,18 +105,13 @@ impl Chunker for PdfPageV1Chunker {
|
||||
),
|
||||
};
|
||||
if !matches!(common.source_span, SourceSpan::Page { .. }) {
|
||||
anyhow::bail!(
|
||||
"PdfPageV1Chunker only handles PDF docs (got non-Page source_span)"
|
||||
);
|
||||
anyhow::bail!("PdfPageV1Chunker only handles PDF docs (got non-Page source_span)");
|
||||
}
|
||||
}
|
||||
|
||||
let base_policy_hash = self.policy_hash(policy);
|
||||
let chunker_version = self.chunker_version();
|
||||
let target_bytes = policy
|
||||
.target_tokens
|
||||
.saturating_mul(BYTES_PER_TOKEN)
|
||||
.max(1);
|
||||
let target_bytes = policy.target_tokens.saturating_mul(BYTES_PER_TOKEN).max(1);
|
||||
// Clamp the overlap to half the target. Without this, a policy
|
||||
// with `overlap_tokens >= target_tokens` would make every chunk
|
||||
// fully re-emit the previous chunk's text — mirrors
|
||||
@@ -146,7 +140,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (char_start, char_end, slice) in
|
||||
for (segment_start, char_start, char_end, slice) in
|
||||
chunk_page(&p.text, target_bytes, overlap_bytes)
|
||||
{
|
||||
// PDF chars-per-page comfortably fits in u32 (a single
|
||||
@@ -154,20 +148,20 @@ impl Chunker for PdfPageV1Chunker {
|
||||
// typography); silent `as u32` truncation would only
|
||||
// surface on corrupted input, where an explicit panic
|
||||
// is preferable to an off-by-2^32 span.
|
||||
let char_start_u32 = u32::try_from(char_start)
|
||||
.expect("page chars fit in u32");
|
||||
let char_end_u32 =
|
||||
u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let char_start_u32 = u32::try_from(char_start).expect("page chars fit in u32");
|
||||
let char_end_u32 = u32::try_from(char_end).expect("page chars fit in u32");
|
||||
let span = SourceSpan::Page {
|
||||
page: page_num,
|
||||
char_start: Some(char_start_u32),
|
||||
char_end: Some(char_end_u32),
|
||||
};
|
||||
let block_ids: Vec<BlockId> = vec![p.common.block_id.clone()];
|
||||
// Per-chunk policy_hash variant prevents chunk_id
|
||||
// collision when a page produces multiple chunks. See
|
||||
// module docs for rationale.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{char_start}");
|
||||
// v0.20.0 sub-item 1 bugfix (#3): per-chunk policy_hash
|
||||
// variant uses `segment_start` (pre-overlap boundary,
|
||||
// strictly increasing) instead of `char_start` (post-
|
||||
// overlap, may collapse to prev_min). See module docs +
|
||||
// spec §4.1 root cause + HOTFIXES.md 2026-05-27.
|
||||
let per_chunk_hash = format!("{base_policy_hash}#c{segment_start}");
|
||||
let chunk_id =
|
||||
id_for_chunk(&doc.doc_id, &chunker_version, &block_ids, &per_chunk_hash);
|
||||
let token_estimate = slice.len().div_ceil(BYTES_PER_TOKEN);
|
||||
@@ -176,6 +170,7 @@ impl Chunker for PdfPageV1Chunker {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(&slice),
|
||||
text: slice,
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
@@ -198,18 +193,28 @@ impl Chunker for PdfPageV1Chunker {
|
||||
}
|
||||
|
||||
/// Split a single page's text into ordered chunks, each represented as
|
||||
/// `(char_start, char_end, text_slice)`. Char positions are within the
|
||||
/// page text, suitable for `SourceSpan::Page::char_start` / `char_end`.
|
||||
/// `(segment_start, actual_start, chunk_end, text_slice)`.
|
||||
///
|
||||
/// - `segment_start` = pre-overlap segment boundary. Strictly increasing
|
||||
/// across the returned vec. Use this for chunk_id uniqueness suffixes.
|
||||
/// - `actual_start` = post-overlap start char index. May collapse to a
|
||||
/// previous chunk's `actual_start` under aggressive overlap policy.
|
||||
/// Use this for `SourceSpan::Page::char_start`.
|
||||
/// - `chunk_end` = chunk's end char index (exclusive).
|
||||
///
|
||||
/// Returns an empty vector when `text` is empty or whitespace-only.
|
||||
fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usize, usize, String)> {
|
||||
fn chunk_page(
|
||||
text: &str,
|
||||
target_bytes: usize,
|
||||
overlap_bytes: usize,
|
||||
) -> Vec<(usize, usize, usize, String)> {
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
let n = chars.len();
|
||||
if n == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
if text.len() <= target_bytes {
|
||||
return vec![(0, n, text.to_string())];
|
||||
return vec![(0, 0, n, text.to_string())];
|
||||
}
|
||||
|
||||
// Build candidate boundary positions (char indices where a chunk
|
||||
@@ -222,8 +227,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
let c = chars[k];
|
||||
let nx = chars[k + 1];
|
||||
let is_paragraph_break = c == '\n' && nx == '\n';
|
||||
let is_sentence_end =
|
||||
matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
let is_sentence_end = matches!(c, '.' | '?' | '!') && nx.is_whitespace();
|
||||
if (is_paragraph_break || is_sentence_end) && k + 2 <= n {
|
||||
bounds.push(k + 2);
|
||||
}
|
||||
@@ -235,11 +239,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
bounds.dedup();
|
||||
|
||||
// UTF-8 byte length of the slice between two char indices.
|
||||
let byte_len = |a: usize, b: usize| -> usize {
|
||||
chars[a..b].iter().map(|c| c.len_utf8()).sum()
|
||||
};
|
||||
let byte_len = |a: usize, b: usize| -> usize { chars[a..b].iter().map(|c| c.len_utf8()).sum() };
|
||||
|
||||
let mut chunks: Vec<(usize, usize, String)> = Vec::new();
|
||||
let mut chunks: Vec<(usize, usize, usize, String)> = Vec::new();
|
||||
let mut seg_idx: usize = 0;
|
||||
while seg_idx + 1 < bounds.len() {
|
||||
let start = bounds[seg_idx];
|
||||
@@ -264,7 +266,9 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
// have absorbed up to `overlap_bytes` of bytes, but never past
|
||||
// the previous chunk's start (no full re-emission).
|
||||
let actual_start = if let Some(prev) = chunks.last() {
|
||||
let prev_min = prev.0;
|
||||
// prev tuple shape = (segment_start, actual_start, chunk_end, slice).
|
||||
// overlap walk floor = previous chunk's actual_start (prev.1).
|
||||
let prev_min = prev.1;
|
||||
let mut a = start;
|
||||
let mut acc_o: usize = 0;
|
||||
while a > prev_min {
|
||||
@@ -281,7 +285,7 @@ fn chunk_page(text: &str, target_bytes: usize, overlap_bytes: usize) -> Vec<(usi
|
||||
};
|
||||
|
||||
let slice: String = chars[actual_start..chunk_end].iter().collect();
|
||||
chunks.push((actual_start, chunk_end, slice));
|
||||
chunks.push((start, actual_start, chunk_end, slice));
|
||||
seg_idx = end_idx;
|
||||
}
|
||||
|
||||
@@ -347,6 +351,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
@@ -386,7 +394,11 @@ mod tests {
|
||||
assert_eq!(c.heading_path, Vec::<String>::new());
|
||||
assert_eq!(c.source_spans.len(), 1);
|
||||
match c.source_spans[0] {
|
||||
SourceSpan::Page { page, char_start, char_end } => {
|
||||
SourceSpan::Page {
|
||||
page,
|
||||
char_start,
|
||||
char_end,
|
||||
} => {
|
||||
assert_eq!(page, (i as u32) + 1);
|
||||
assert_eq!(char_start, Some(0));
|
||||
assert!(char_end.unwrap() > 0);
|
||||
@@ -431,11 +443,16 @@ mod tests {
|
||||
// N-1's char_end).
|
||||
for w in chunks.windows(2) {
|
||||
let prev_end = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_end: Some(e), .. } => e,
|
||||
SourceSpan::Page {
|
||||
char_end: Some(e), ..
|
||||
} => e,
|
||||
_ => panic!("missing char_end"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -446,7 +463,7 @@ mod tests {
|
||||
// chunk_ids stay distinct despite identical block_ids — the
|
||||
// per-chunk policy_hash variant is doing its job.
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "all chunk_ids must be unique");
|
||||
@@ -512,6 +529,10 @@ mod tests {
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: None,
|
||||
git_branch: None,
|
||||
git_commit: None,
|
||||
code_lang: None,
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version,
|
||||
@@ -645,11 +666,17 @@ mod tests {
|
||||
// overlap) is the failure mode.
|
||||
for w in chunks.windows(2) {
|
||||
let prev_start = match w[0].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
let next_start = match w[1].source_spans[0] {
|
||||
SourceSpan::Page { char_start: Some(s), .. } => s,
|
||||
SourceSpan::Page {
|
||||
char_start: Some(s),
|
||||
..
|
||||
} => s,
|
||||
_ => panic!("missing char_start"),
|
||||
};
|
||||
assert!(
|
||||
@@ -660,12 +687,49 @@ mod tests {
|
||||
// chunk_ids stay distinct (the per-chunk hash variant keys off
|
||||
// `segment_start`, which is strictly increasing; `char_start` may collapse).
|
||||
let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
|
||||
ids.sort();
|
||||
ids.sort_unstable();
|
||||
let total = ids.len();
|
||||
ids.dedup();
|
||||
assert_eq!(ids.len(), total, "chunk_ids must remain unique");
|
||||
}
|
||||
|
||||
#[test]
fn multi_chunk_page_with_aggressive_overlap_produces_unique_chunk_ids() {
    // Trigger shape modelled on Korean OCR text: 10 x "가", then ". ",
    // then 500 x "나".
    //   -> first segment [0, 12), second segment [12, n).
    // Page byte length = 10*3 + 2 + 500*3 = 1532 > target_bytes = 1500,
    // so the page is split into multiple chunks.
    //
    // default_policy(500, 80): target_tokens = 500 -> target_bytes = 1500
    // (3 bytes per Korean char), overlap_tokens = 80 ->
    // overlap_bytes = min(240, 750) = 240 (about 80 Korean chars).
    // The overlap walk therefore pulls the second chunk's actual_start
    // back to prev_min = 0, which used to yield the same `#c0` suffix for
    // both chunks. (Added after verifier round 1, item L-3.)
    let page_text = format!("{}. {}", "가".repeat(10), "나".repeat(500));

    let doc = make_pdf_doc(&[&page_text]);
    // target = 1500 bytes, overlap = 240 bytes
    let policy = default_policy(500, 80);
    let chunks = PdfPageV1Chunker.chunk(&doc, &policy).unwrap();

    assert!(
        chunks.len() >= 2,
        "expected ≥2 chunks for {} byte page; got {}",
        page_text.len(),
        chunks.len()
    );

    // Every chunk_id must be distinct: the set of ids has to be as large
    // as the chunk list itself.
    let distinct: std::collections::HashSet<&str> =
        chunks.iter().map(|c| c.chunk_id.0.as_str()).collect();
    assert_eq!(
        distinct.len(),
        chunks.len(),
        "all chunk_ids must be unique even when overlap walks actual_start back to prev_min"
    );
}
|
||||
|
||||
#[test]
|
||||
fn policy_hash_matches_md_heading_v1_for_identical_policy() {
|
||||
// Cross-chunker policy fingerprint identity — important so a
|
||||
|
||||
200
crates/kebab-chunk/src/tier2_shared.rs
Normal file
200
crates/kebab-chunk/src/tier2_shared.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build).
|
||||
//!
|
||||
//! Mirrors `code_rust_ast_v1`'s Chunk-construction pattern exactly so that
|
||||
//! id / hashes / token-count / ChunkPolicy semantics stay identical across
|
||||
//! Tier 1 (AST) and Tier 2 (resource-aware) chunkers.
|
||||
|
||||
use anyhow::Result;
|
||||
use kebab_core::{
|
||||
BlockId, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, DocumentId, SourceSpan,
|
||||
id_for_chunk,
|
||||
};
|
||||
|
||||
pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200;
|
||||
const BYTES_PER_TOKEN: usize = 3;
|
||||
const POLICY_HASH_HEX_LEN: usize = 16;
|
||||
|
||||
/// Compute the policy hash the same way `code_rust_ast_v1` does.
|
||||
pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(policy)
|
||||
.expect("canonical JSON serialization of ChunkPolicy must not fail");
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..POLICY_HASH_HEX_LEN].to_string()
|
||||
}
|
||||
|
||||
/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting
|
||||
/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize.
|
||||
/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl.
|
||||
///
|
||||
/// `base_split_key` is used as the `split_key` for the non-oversize single-chunk
|
||||
/// case. Callers that emit multiple chunks from the same document (e.g.
|
||||
/// `K8sManifestResourceV1Chunker` — one call per k8s resource) MUST pass
|
||||
/// `Some(line_start)` so that each call produces a distinct `chunk_id`.
|
||||
/// Single-chunk callers (dockerfile-file-v1, manifest-file-v1) pass `None` to
|
||||
/// keep chunk_ids stable (no sibling can collide when there's only one chunk).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn push_chunks_with_oversize(
|
||||
out: &mut Vec<Chunk>,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
base_split_key: Option<u32>,
|
||||
) -> Result<()> {
|
||||
let n_lines = (line_end - line_start + 1).max(1);
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
|
||||
if n_lines <= AST_CHUNK_MAX_LINES {
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
text,
|
||||
line_start,
|
||||
line_end,
|
||||
symbol,
|
||||
lang,
|
||||
base_split_key,
|
||||
));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
let total = lines.len();
|
||||
let mut window_start = line_start;
|
||||
let mut i = 0usize;
|
||||
while i < total {
|
||||
let take = (AST_CHUNK_MAX_LINES as usize).min(total - i);
|
||||
let window_text = lines[i..i + take].join("\n");
|
||||
let window_end = window_start + take as u32 - 1;
|
||||
out.push(build_chunk(
|
||||
doc,
|
||||
&cv,
|
||||
&base_policy_hash,
|
||||
&window_text,
|
||||
window_start,
|
||||
window_end,
|
||||
symbol,
|
||||
lang,
|
||||
Some(window_start),
|
||||
));
|
||||
i += take;
|
||||
window_start = window_end + 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build a single `Chunk`, mirroring `make_chunk` in `code_rust_ast_v1.rs`
|
||||
/// exactly (same id recipe, same token estimate, same field set).
|
||||
///
|
||||
/// `split_key` is `Some(line_start_of_window)` for oversize splits, `None`
|
||||
/// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None`
|
||||
/// split_key pattern in 1A-2.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
symbol: &str,
|
||||
lang: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: Some(symbol.to_string()),
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(
|
||||
doc,
|
||||
chunker_version,
|
||||
base_policy_hash,
|
||||
text,
|
||||
span,
|
||||
split_key,
|
||||
)
|
||||
}
|
||||
|
||||
/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3).
|
||||
///
|
||||
/// Accepts `policy: &ChunkPolicy` and `chunker_version: &str` (string slice)
|
||||
/// so callers don't need to pre-compute the hash and version wrapper.
|
||||
/// `split_key` is `Some(window_start)` for oversize line-window splits.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn build_chunk_no_symbol(
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
text: &str,
|
||||
line_start: u32,
|
||||
line_end: u32,
|
||||
lang: &str,
|
||||
chunker_version: &str,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
let cv = ChunkerVersion(chunker_version.to_string());
|
||||
let base_policy_hash = policy_hash(policy);
|
||||
let span = SourceSpan::Code {
|
||||
line_start,
|
||||
line_end,
|
||||
symbol: None,
|
||||
lang: Some(lang.to_string()),
|
||||
};
|
||||
build_chunk_from_span(doc, &cv, &base_policy_hash, text, span, split_key)
|
||||
}
|
||||
|
||||
/// Core chunk-building logic shared by `build_chunk` and `build_chunk_no_symbol`.
|
||||
///
|
||||
/// Takes a pre-built `SourceSpan` so the only difference between the two
|
||||
/// public helpers is whether `symbol` is `Some` or `None`. All id/hash/
|
||||
/// token mechanics are identical.
|
||||
fn build_chunk_from_span(
|
||||
doc: &CanonicalDocument,
|
||||
chunker_version: &ChunkerVersion,
|
||||
base_policy_hash: &str,
|
||||
text: &str,
|
||||
span: SourceSpan,
|
||||
split_key: Option<u32>,
|
||||
) -> Chunk {
|
||||
// id_hash mirrors code_rust_ast_v1's make_chunk logic:
|
||||
// split_key Some(k) => "{base_policy_hash}#L{k}"
|
||||
// split_key None => base_policy_hash
|
||||
let id_hash = match split_key {
|
||||
Some(k) => format!("{base_policy_hash}#L{k}"),
|
||||
None => base_policy_hash.to_string(),
|
||||
};
|
||||
|
||||
// block_ids: Tier 2/3 chunkers have no per-block structure (the whole file
|
||||
// is one Block::Code), so we pass an empty slice — same as using the doc-
|
||||
// level slice without explicit block granularity.
|
||||
let block_ids: Vec<BlockId> = vec![];
|
||||
|
||||
let chunk_id = id_for_chunk(
|
||||
&DocumentId(doc.doc_id.0.clone()),
|
||||
chunker_version,
|
||||
&block_ids,
|
||||
&id_hash,
|
||||
);
|
||||
|
||||
let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN);
|
||||
|
||||
Chunk {
|
||||
chunk_id,
|
||||
doc_id: DocumentId(doc.doc_id.0.clone()),
|
||||
block_ids,
|
||||
tokenized_korean_text: crate::tokenize_korean_morphological(text),
|
||||
text: text.to_string(),
|
||||
heading_path: Vec::new(),
|
||||
source_spans: vec![span],
|
||||
token_estimate,
|
||||
chunker_version: chunker_version.clone(),
|
||||
policy_hash: base_policy_hash.to_string(),
|
||||
}
|
||||
}
|
||||
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
196
crates/kebab-chunk/tests/code_c_ast_snapshot.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_go_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.c".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-c-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units:
|
||||
// 0. imports + defines (lines 1–4, ≤200)
|
||||
// 1. status_t enum typedef (lines 6–9, ≤200)
|
||||
// 2. record_t struct typedef (lines 11–16, ≤200)
|
||||
// 3. static counter decl glue (line 18, ≤200)
|
||||
// 4. parse_record fn (lines 20–23, ≤200)
|
||||
// 5. print_record fn (lines 25–27, ≤200)
|
||||
// 6. main fn (lines 29–33, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
18,
|
||||
"#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;".to_string(),
|
||||
),
|
||||
(
|
||||
"parse_record",
|
||||
20,
|
||||
23,
|
||||
"int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"print_record",
|
||||
25,
|
||||
27,
|
||||
"void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
29,
|
||||
33,
|
||||
"int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("c".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("c".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.c".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("c".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-c-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compare the chunker's JSON output for `fixed_doc()` against the stored
/// baseline. With `UPDATE_SNAPSHOTS` set, a missing or drifted baseline is
/// (re)written instead of failing.
#[test]
fn code_c_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();
    let chunks = CodeCAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.c.chunks.snapshot.json");

    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Writes the current output as the new baseline (pretty JSON + newline).
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists.
            std::fs::create_dir_all(&dir).unwrap();
            write_baseline();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if update_requested() {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "code-c-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: chunking a freshly built `fixed_doc()` five more
/// times must reproduce the first run's chunk_ids exactly, in order.
#[test]
fn code_c_ast_chunks_are_deterministic() {
    let policy = fixed_policy();
    // Runs the full pipeline once and returns the chunk_ids in output order.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeCAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap();
        chunks.into_iter().map(|c| c.chunk_id.0).collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        assert_eq!(chunk_ids(), baseline);
    }
}
|
||||
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
354
crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs
Normal file
@@ -0,0 +1,354 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative C++ code `CanonicalDocument`.
|
||||
//!
|
||||
//! Two complementary tests:
|
||||
//! 1. `code_cpp_ast_chunks_snapshot` — hand-built `fixed_doc()` validates the
|
||||
//! chunker's 1:1 mapping (design §6.3 / §8 boundary: no parse-code dep needed).
|
||||
//! 2. `code_cpp_ast_extractor_snapshot` — invokes `CppAstExtractor` against the
|
||||
//! real `tests/fixtures/sample.cpp` fixture, validating the extractor → chunker
|
||||
//! end-to-end pipeline. `kebab-parse-code` is a dev-dep (same pattern as
|
||||
//! `kebab-parse-md` in Markdown snapshot tests).
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeCppAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use kebab_parse_code::CppAstExtractor;
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("projects/record.cpp".into());
|
||||
let aid = AssetId("c".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-cpp-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Representative units (C++ specific):
|
||||
// 0. includes + namespace opening (lines 1–4, ≤200)
|
||||
// 1. class definition (lines 6–20, ≤200)
|
||||
// 2. template function (lines 22–25, ≤200)
|
||||
// 3. namespace closing + free fn (lines 27–29, ≤200)
|
||||
// 4. main fn (lines 31–34, ≤200)
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"<top-level>",
|
||||
1,
|
||||
4,
|
||||
"#include <string>\n#include <vector>\n\nnamespace kebab {".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::chunk::MdHeadingV1Chunker",
|
||||
6,
|
||||
20,
|
||||
"class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::identity",
|
||||
22,
|
||||
25,
|
||||
"template <typename T>\nT identity(T value) {\n return value;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"kebab::global_helper",
|
||||
27,
|
||||
29,
|
||||
"void global_helper() {\n // free function in kebab namespace\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"main",
|
||||
31,
|
||||
34,
|
||||
"int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(),
|
||||
),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("cpp".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("cpp".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "record.cpp".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("cpp".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-cpp-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper: run the real CppAstExtractor against tests/fixtures/sample.cpp
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn extract_cpp_fixture() -> CanonicalDocument {
|
||||
use kebab_core::{
|
||||
AssetId, AssetStorage, Checksum, ExtractConfig, ExtractContext, Extractor, RawAsset,
|
||||
SourceUri, WorkspacePath,
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
|
||||
let bytes = std::fs::read(fixtures_dir().join("sample.cpp")).expect("read sample.cpp fixture");
|
||||
let src = String::from_utf8(bytes).expect("fixture is valid UTF-8");
|
||||
let wp = WorkspacePath("tests/fixtures/sample.cpp".to_string());
|
||||
let asset = RawAsset {
|
||||
asset_id: AssetId("e".repeat(64)),
|
||||
source_uri: SourceUri::File(PathBuf::from("tests/fixtures/sample.cpp")),
|
||||
workspace_path: wp,
|
||||
media_type: kebab_core::MediaType::Code("cpp".to_string()),
|
||||
byte_len: src.len() as u64,
|
||||
checksum: Checksum("f".repeat(64)),
|
||||
discovered_at: time::OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("tests/fixtures/sample.cpp"),
|
||||
sha: Checksum("f".repeat(64)),
|
||||
},
|
||||
};
|
||||
let cfg = ExtractConfig::default();
|
||||
let root = PathBuf::from("/tmp");
|
||||
let ctx = ExtractContext {
|
||||
asset: &asset,
|
||||
workspace_root: &root,
|
||||
config: &cfg,
|
||||
};
|
||||
CppAstExtractor::new()
|
||||
.extract(&ctx, src.as_bytes())
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 1 (hand-built): chunker-only 1:1 mapping validation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Compares the serialized chunker output for `fixed_doc()` against the
/// JSON baseline `code-sample.cpp.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When `UPDATE_SNAPSHOTS` is set (to any value), an unreadable or drifted
/// baseline is rewritten and the test returns early; otherwise either
/// condition panics.
#[test]
fn code_cpp_ast_chunks_snapshot() {
    let document = fixed_doc();
    let chunk_policy = fixed_policy();

    let chunks = CodeCppAstV1Chunker
        .chunk(&document, &chunk_policy)
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Both are evaluated lazily, only on the paths that need them.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists, then write.
            std::fs::create_dir_all(&dir).unwrap();
            let pretty = render_actual();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    let rebake = update_requested();
    let pretty = render_actual();
    if rebake {
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    panic!(
        "code-cpp-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Chunks a freshly built `fixed_doc()` six times in total and asserts that
/// the sequence of chunk ids is identical on every run.
#[test]
fn code_cpp_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run: build the document, chunk it, keep only the ids.
    let chunk_ids = || -> Vec<String> {
        CodeCppAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test 2 (real extractor): end-to-end extractor → chunker pipeline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Runs the real `CppAstExtractor` on `sample.cpp` and checks that every
/// expected symbol appears among the extracted `Block::Code` units.
///
/// Symbols the fixture is expected to yield:
/// - class `kebab::chunk::MdHeadingV1Chunker` and its methods
///   (ctor, dtor, `chunk_doc`, `operator()`)
/// - template function `kebab::chunk::identity`
/// - free function `kebab::global_helper`
/// - global `main`
#[test]
fn code_cpp_ast_extractor_snapshot() {
    let doc = extract_cpp_fixture();

    // Gather the symbol of every code block that carries a code source span.
    let mut block_syms: Vec<Option<String>> = Vec::new();
    for block in doc.blocks.iter() {
        if let Block::Code(c) = block {
            if let SourceSpan::Code { symbol, .. } = &c.common.source_span {
                block_syms.push(symbol.clone());
            }
        }
    }

    // (qualified symbol, label used in the failure message), checked in order.
    let required: [(&str, &str); 8] = [
        // Namespace-qualified class and its methods.
        ("kebab::chunk::MdHeadingV1Chunker", "class"),
        ("kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker", "ctor"),
        ("kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker", "dtor"),
        ("kebab::chunk::MdHeadingV1Chunker::chunk_doc", "chunk_doc"),
        ("kebab::chunk::MdHeadingV1Chunker::operator()", "operator()"),
        // Template function (inside kebab::chunk namespace in the fixture).
        ("kebab::chunk::identity", "identity template fn"),
        // Free function in outer namespace.
        ("kebab::global_helper", "global_helper"),
        // Global main.
        ("main", "main"),
    ];

    for &(symbol, label) in &required {
        assert!(
            block_syms.iter().any(|s| s.as_deref() == Some(symbol)),
            "{label} unit missing: {block_syms:?}"
        );
    }
}
|
||||
|
||||
/// Extracts the C++ fixture twice and asserts that both the extracted
/// blocks and the chunk ids produced from them are identical.
#[test]
fn code_cpp_ast_extractor_chunks_deterministic() {
    let doc1 = extract_cpp_fixture();
    let doc2 = extract_cpp_fixture();
    assert_eq!(
        doc1.blocks, doc2.blocks,
        "extractor output non-deterministic"
    );

    let policy = fixed_policy();
    let chunks1 = CodeCppAstV1Chunker.chunk(&doc1, &policy).unwrap();
    let chunks2 = CodeCppAstV1Chunker.chunk(&doc2, &policy).unwrap();

    // Only the ids are compared for the chunker stage.
    let ids1: Vec<_> = chunks1.into_iter().map(|c| c.chunk_id.0).collect();
    let ids2: Vec<_> = chunks2.into_iter().map(|c| c.chunk_id.0).collect();
    assert_eq!(ids1, ids2, "chunker output non-deterministic");
}
|
||||
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_go_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Go code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeGoAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("kebab_eval/metrics.go".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-go-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "func BigCompute(data []int) int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!("\tv{i} := 0\n\tif {i} < len(data) {{\n\t\tv{i} = data[{i}]\n\t}}\n"))
|
||||
.collect();
|
||||
let footer = "\treturn len(data)\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `ComputeMRR` (lines 7–12, ≤200)
|
||||
// 2. struct `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. struct `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `Run` (lines 32–38, ≤200)
|
||||
// 5. method `Report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"ComputeMRR",
|
||||
7,
|
||||
12,
|
||||
"func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Run",
|
||||
32,
|
||||
38,
|
||||
"func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.Report",
|
||||
40,
|
||||
46,
|
||||
"func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("go".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("go".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "metrics.go".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("go".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-go-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the serialized chunker output for `fixed_doc()` against the
/// JSON baseline `code-sample.go.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When `UPDATE_SNAPSHOTS` is set (to any value), an unreadable or drifted
/// baseline is rewritten and the test returns early; otherwise either
/// condition panics.
#[test]
fn code_go_ast_chunks_snapshot() {
    let document = fixed_doc();
    let chunk_policy = fixed_policy();

    let chunks = CodeGoAstV1Chunker
        .chunk(&document, &chunk_policy)
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Both are evaluated lazily, only on the paths that need them.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.go.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists, then write.
            std::fs::create_dir_all(&dir).unwrap();
            let pretty = render_actual();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    let rebake = update_requested();
    let pretty = render_actual();
    if rebake {
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    panic!(
        "code-go-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Chunks a freshly built `fixed_doc()` six times in total and asserts that
/// the sequence of chunk ids is identical on every run.
#[test]
fn code_go_ast_chunks_are_deterministic() {
    /// One full pipeline run reduced to its chunk ids.
    fn run_once(policy: &ChunkPolicy) -> Vec<String> {
        let chunks = CodeGoAstV1Chunker.chunk(&fixed_doc(), policy).unwrap();
        let mut ids = Vec::new();
        for chunk in chunks {
            ids.push(chunk.chunk_id.0);
        }
        ids
    }

    let policy = fixed_policy();
    let baseline = run_once(&policy);
    for _ in 0..5 {
        let again = run_once(&policy);
        assert_eq!(again, baseline);
    }
}
|
||||
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_java_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Java code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJavaAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/java/com/example/Metrics.java".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-java-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line method body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "public class BigCompute {\n public int compute(int[] data) {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" int v{i} = {i} < data.length ? data[{i}] : 0;\n"))
|
||||
.collect();
|
||||
let footer = " return data.length;\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free method `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import java.util.List;\nimport java.util.Map;\nimport java.util.ArrayList;\nimport java.util.HashMap;\nimport java.util.stream.Collectors;".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"public static double computeMRR(List<Double> scores) {\n if (scores.isEmpty()) {\n return 0.0;\n }\n return 1.0 / scores.size();\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"public class MetricsCollector {\n private List<Double> scores;\n private List<String> labels;\n private Map<String, Integer> counts;\n private Map<String, Double> totals;\n private List<String> tags;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"public class BaseEvaluator {\n private String name;\n\n public BaseEvaluator(String name) {\n this.name = name;\n }\n\n public void evaluate(List<String> data) throws Exception {\n String joined = String.join(\",\", data);\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"public void run(List<Double> inputs) {\n for (Double inp : inputs) {\n scores.add(\n inp\n );\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"public Map<String, Object> report() {\n Map<String, Object> result = new HashMap<>();\n result.put(\"mean\", 0.0);\n result.put(\"count\", scores.size());\n result.put(\"tags\", tags);\n return result;\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("java".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("java".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.java".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("java".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-java-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the serialized chunker output for `fixed_doc()` against the
/// JSON baseline `code-sample.java.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When `UPDATE_SNAPSHOTS` is set (to any value), an unreadable or drifted
/// baseline is rewritten and the test returns early; otherwise either
/// condition panics.
#[test]
fn code_java_ast_chunks_snapshot() {
    let document = fixed_doc();
    let chunk_policy = fixed_policy();

    let chunks = CodeJavaAstV1Chunker
        .chunk(&document, &chunk_policy)
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Both are evaluated lazily, only on the paths that need them.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.java.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists, then write.
            std::fs::create_dir_all(&dir).unwrap();
            let pretty = render_actual();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    let rebake = update_requested();
    let pretty = render_actual();
    if rebake {
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    panic!(
        "code-java-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Chunks a freshly built `fixed_doc()` six times in total and asserts that
/// the sequence of chunk ids is identical on every run.
#[test]
fn code_java_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run: build the document, chunk it, keep only the ids.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeJavaAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap();
        chunks.into_iter().map(|chunk| chunk.chunk_id.0).collect()
    };

    let baseline = chunk_ids();
    (0..5).for_each(|_| {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    });
}
|
||||
221
crates/kebab-chunk/tests/code_js_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_js_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative JavaScript code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeJsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/bar.js".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-js-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "function bigTransform(items) {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" const v{i} = items[{i}] !== undefined ? items[{i}] : null;\n"))
|
||||
.collect();
|
||||
let footer = " return items;\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. require/import block (lines 1–5, ≤200)
|
||||
// 1. free fn `add` (lines 7–12, ≤200)
|
||||
// 2. class `EventBus` (lines 14–20, ≤200)
|
||||
// 3. class `BaseHandler` (lines 22–30, ≤200)
|
||||
// 4. method `EventBus.emit` (lines 32–38, ≤200)
|
||||
// 5. method `EventBus.on` (lines 40–46, ≤200)
|
||||
// 6. bigTransform (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"requires",
|
||||
1,
|
||||
5,
|
||||
"const fs = require('fs');\nconst path = require('path');\nconst { EventEmitter } = require('events');\nconst assert = require('assert');\nconst crypto = require('crypto');".to_string(),
|
||||
),
|
||||
(
|
||||
"add",
|
||||
7,
|
||||
12,
|
||||
"export function add(a, b) {\n if (typeof a !== 'number') throw new TypeError('a');\n if (typeof b !== 'number') throw new TypeError('b');\n const result = a + b;\n assert(isFinite(result));\n return result;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus",
|
||||
14,
|
||||
20,
|
||||
"class EventBus {\n constructor() {\n this._handlers = new Map();\n this._history = [];\n this._maxHistory = 100;\n this._seq = 0;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseHandler",
|
||||
22,
|
||||
30,
|
||||
"class BaseHandler {\n handle(event) {\n throw new Error('not implemented');\n }\n batchHandle(events) {\n const results = [];\n for (const ev of events) {\n results.push(this.handle(ev));\n }\n return results;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus.emit",
|
||||
32,
|
||||
38,
|
||||
"class EventBus {\n emit(name, payload) {\n const handlers = this._handlers.get(name) ?? [];\n for (const h of handlers) {\n h(payload);\n }\n return this;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"EventBus.on",
|
||||
40,
|
||||
46,
|
||||
"class EventBus {\n on(name, handler) {\n if (!this._handlers.has(name)) {\n this._handlers.set(name, []);\n }\n this._handlers.get(name).push(handler);\n return this;\n }\n}".to_string(),
|
||||
),
|
||||
("bigTransform", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("javascript".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("javascript".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "bar.js".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("javascript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-js-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the serialized chunker output for `fixed_doc()` against the
/// JSON baseline `code-sample.js.chunks.snapshot.json` in the fixtures
/// directory.
///
/// When `UPDATE_SNAPSHOTS` is set (to any value), an unreadable or drifted
/// baseline is rewritten and the test returns early; otherwise either
/// condition panics.
#[test]
fn code_js_ast_chunks_snapshot() {
    let document = fixed_doc();
    let chunk_policy = fixed_policy();

    let chunks = CodeJsAstV1Chunker
        .chunk(&document, &chunk_policy)
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    // Both are evaluated lazily, only on the paths that need them.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    let render_actual = || serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.js.chunks.snapshot.json");

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if !update_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists, then write.
            std::fs::create_dir_all(&dir).unwrap();
            let pretty = render_actual();
            std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    let rebake = update_requested();
    let pretty = render_actual();
    if rebake {
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    panic!(
        "code-js-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Chunks a freshly built `fixed_doc()` six times in total and asserts that
/// the sequence of chunk ids is identical on every run.
#[test]
fn code_js_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run: build the document, chunk it, keep only the ids.
    let chunk_ids = || -> Vec<String> {
        CodeJsAstV1Chunker
            .chunk(&fixed_doc(), &policy)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.chunk_id.0)
            .collect()
    };

    let baseline = chunk_ids();
    let mut remaining = 5;
    while remaining > 0 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
        remaining -= 1;
    }
}
|
||||
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_kotlin_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Kotlin code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeKotlinAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/main/kotlin/com/example/Metrics.kt".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-kotlin-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "class BigCompute {\n fun compute(data: IntArray): Int {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" val v{i} = if ({i} < data.size) data[{i}] else 0\n"))
|
||||
.collect();
|
||||
let footer = " return data.size\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. top-level fn `computeMRR` (lines 7–12, ≤200)
|
||||
// 2. data class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `MetricsCollector.run` (lines 32–38, ≤200)
|
||||
// 5. method `MetricsCollector.report` (lines 40–46, ≤200)
|
||||
// 6. BigCompute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import kotlin.collections.List\nimport kotlin.collections.Map\nimport kotlin.collections.MutableList\nimport kotlin.collections.MutableMap\nimport kotlin.collections.mutableListOf".to_string(),
|
||||
),
|
||||
(
|
||||
"computeMRR",
|
||||
7,
|
||||
12,
|
||||
"fun computeMRR(scores: List<Double>): Double {\n if (scores.isEmpty()) {\n return 0.0\n }\n return 1.0 / scores.size\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"data class MetricsCollector(\n val scores: MutableList<Double> = mutableListOf(),\n val labels: MutableList<String> = mutableListOf(),\n val counts: MutableMap<String, Int> = mutableMapOf(),\n val totals: MutableMap<String, Double> = mutableMapOf(),\n val tags: MutableList<String> = mutableListOf(),\n)".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"open class BaseEvaluator(val name: String) {\n\n fun evaluate(data: List<String>) {\n val joined = data.joinToString(\",\")\n println(joined)\n }\n\n open fun describe(): String = name\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"fun MetricsCollector.run(inputs: List<Double>) {\n for (inp in inputs) {\n scores.add(\n inp\n )\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"fun MetricsCollector.report(): Map<String, Any> {\n return mapOf(\n \"mean\" to 0.0,\n \"count\" to scores.size,\n \"tags\" to tags,\n )\n}".to_string(),
|
||||
),
|
||||
("BigCompute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("kotlin".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("kotlin".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Metrics.kt".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("kotlin".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-kotlin-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON serialization of the chunks produced for `fixed_doc()`
/// with the baseline `code-sample.kt.chunks.snapshot.json` in `fixtures_dir()`.
///
/// When `UPDATE_SNAPSHOTS` is set (any value) the baseline is created or
/// rewritten instead of failing, including when the existing file cannot be
/// read or does not parse as JSON. Otherwise a missing, unreadable, malformed
/// or drifted baseline panics with a message naming which case occurred.
#[test]
fn code_kotlin_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodeKotlinAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    // Rendered once; used both for (re)writing the baseline and for the drift report.
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.kt.chunks.snapshot.json");
    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();

    // Writes the current output as the baseline and logs what happened.
    let rebake = |verb: &str| {
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("{verb} baseline {}", baseline_path.display());
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(_) if update_requested => {
            rebake("created");
            return;
        }
        // Only a genuinely absent file is reported as "missing".
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        ),
        Err(e) => panic!(
            "cannot read baseline {}; run with UPDATE_SNAPSHOTS=1 to re-create: {e}",
            baseline_path.display()
        ),
    };

    let expected: Value = match serde_json::from_str(&baseline_text) {
        Ok(value) => value,
        // A malformed baseline must not block an explicit re-bake.
        Err(_) if update_requested => {
            rebake("updated");
            return;
        }
        Err(e) => panic!(
            "baseline {} does not parse as json; run with UPDATE_SNAPSHOTS=1 to re-bake: {e}",
            baseline_path.display()
        ),
    };

    if actual == expected {
        return;
    }
    if update_requested {
        rebake("updated");
        return;
    }
    panic!(
        "code-kotlin-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism guard: chunking the same fixed document under the same policy
/// must produce an identical sequence of `chunk_id` strings on every run.
#[test]
fn code_kotlin_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run, reduced to the ordered list of chunk ids.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeKotlinAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap();
        let mut ids = Vec::new();
        for chunk in chunks {
            ids.push(chunk.chunk_id.0);
        }
        ids
    };

    let baseline = chunk_ids();
    let mut remaining = 5;
    while remaining > 0 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
        remaining -= 1;
    }
}
|
||||
221
crates/kebab-chunk/tests/code_python_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_python_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Python code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodePythonAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("kebab_eval/metrics.py".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-python-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "def big_compute(data):\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" v{i} = data[{i}] if {i} < len(data) else 0\n"))
|
||||
.collect();
|
||||
let footer = " return sum(data)";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `compute_mrr` (lines 7–12, ≤200)
|
||||
// 2. class `MetricsCollector` (lines 14–20, ≤200)
|
||||
// 3. class `BaseEvaluator` (lines 22–30, ≤200)
|
||||
// 4. method `run` (lines 32–38, ≤200)
|
||||
// 5. method `report` (lines 40–46, ≤200)
|
||||
// 6. big_compute (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import os\nimport sys\nfrom typing import List\nfrom pathlib import Path\nfrom collections import defaultdict".to_string(),
|
||||
),
|
||||
(
|
||||
"compute_mrr",
|
||||
7,
|
||||
12,
|
||||
"def compute_mrr(scores):\n if not scores:\n return 0.0\n return sum(\n 1.0 / r for r in scores\n ) / len(scores)".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector",
|
||||
14,
|
||||
20,
|
||||
"class MetricsCollector:\n def __init__(self):\n self.scores = []\n self.labels = []\n self.counts = defaultdict(int)\n self.totals = defaultdict(float)\n self.tags = []".to_string(),
|
||||
),
|
||||
(
|
||||
"BaseEvaluator",
|
||||
22,
|
||||
30,
|
||||
"class BaseEvaluator:\n def evaluate(self, data):\n raise NotImplementedError\n def batch_evaluate(self, items):\n results = []\n for item in items:\n results.append(self.evaluate(item))\n return results\n def name(self):\n return type(self).__name__".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.run",
|
||||
32,
|
||||
38,
|
||||
"class MetricsCollector:\n def run(self, inputs):\n for inp in inputs:\n score = self._score(inp)\n self.scores.append(\n score\n )".to_string(),
|
||||
),
|
||||
(
|
||||
"MetricsCollector.report",
|
||||
40,
|
||||
46,
|
||||
"class MetricsCollector:\n def report(self):\n return {\n 'mean': sum(self.scores) / max(len(self.scores), 1),\n 'count': len(self.scores),\n 'tags': self.tags,\n }".to_string(),
|
||||
),
|
||||
("big_compute", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("python".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("python".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "metrics.py".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("python".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-python-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON serialization of the chunks produced for `fixed_doc()`
/// with the baseline `code-sample.py.chunks.snapshot.json` in `fixtures_dir()`.
///
/// When `UPDATE_SNAPSHOTS` is set (any value) the baseline is created or
/// rewritten instead of failing, including when the existing file cannot be
/// read or does not parse as JSON. Otherwise a missing, unreadable, malformed
/// or drifted baseline panics with a message naming which case occurred.
#[test]
fn code_python_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    // Rendered once; used both for (re)writing the baseline and for the drift report.
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.py.chunks.snapshot.json");
    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();

    // Writes the current output as the baseline and logs what happened.
    let rebake = |verb: &str| {
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("{verb} baseline {}", baseline_path.display());
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(_) if update_requested => {
            rebake("created");
            return;
        }
        // Only a genuinely absent file is reported as "missing".
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        ),
        Err(e) => panic!(
            "cannot read baseline {}; run with UPDATE_SNAPSHOTS=1 to re-create: {e}",
            baseline_path.display()
        ),
    };

    let expected: Value = match serde_json::from_str(&baseline_text) {
        Ok(value) => value,
        // A malformed baseline must not block an explicit re-bake.
        Err(_) if update_requested => {
            rebake("updated");
            return;
        }
        Err(e) => panic!(
            "baseline {} does not parse as json; run with UPDATE_SNAPSHOTS=1 to re-bake: {e}",
            baseline_path.display()
        ),
    };

    if actual == expected {
        return;
    }
    if update_requested {
        rebake("updated");
        return;
    }
    panic!(
        "code-python-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism guard: chunking the same fixed document under the same policy
/// must produce an identical sequence of `chunk_id` strings on every run.
#[test]
fn code_python_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run, reduced to the ordered list of chunk ids.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodePythonAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap();
        let mut ids = Vec::new();
        for chunk in chunks {
            ids.push(chunk.chunk_id.0);
        }
        ids
    };

    let baseline = chunk_ids();
    let mut remaining = 5;
    while remaining > 0 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
        remaining -= 1;
    }
}
|
||||
221
crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_rust_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative Rust code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeRustAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("crates/kebab-chunk/src/code_rust_ast_v1.rs".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-rust-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line function body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "pub fn big_fn(input: &[u8]) -> Vec<u8> {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" let v{i} = input.get({i} as usize).copied().unwrap_or(0);\n"))
|
||||
.collect();
|
||||
let footer = " vec![0u8]\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. top-level use+const block (lines 1–5, ≤200)
|
||||
// 1. free fn `parse` (lines 7–12, ≤200)
|
||||
// 2. struct `Foo` (lines 14–20, ≤200)
|
||||
// 3. trait `Frobable` (lines 22–30, ≤200)
|
||||
// 4. impl Foo::double (lines 32–38, ≤200)
|
||||
// 5. impl Foo::triple (lines 40–46, ≤200)
|
||||
// 6. big_fn (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"use+const",
|
||||
1,
|
||||
5,
|
||||
"use std::collections::HashMap;\nuse std::fmt;\n\nconst MAX: usize = 1024;\nconst MIN: usize = 0;".to_string(),
|
||||
),
|
||||
(
|
||||
"parse",
|
||||
7,
|
||||
12,
|
||||
"pub fn parse(input: &str) -> Option<u32> {\n input\n .trim()\n .parse()\n .ok()\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo",
|
||||
14,
|
||||
20,
|
||||
"pub struct Foo {\n pub name: String,\n pub value: u32,\n pub tags: Vec<String>,\n pub meta: Option<String>,\n pub count: usize,\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Frobable",
|
||||
22,
|
||||
30,
|
||||
"pub trait Frobable {\n fn frob(&self) -> String;\n fn frob_twice(&self) -> String {\n let a = self.frob();\n let b = self.frob();\n format!(\"{a}{b}\")\n }\n fn name(&self) -> &str;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo::double",
|
||||
32,
|
||||
38,
|
||||
"impl Foo {\n pub fn double(&self) -> u32 {\n self.value\n .checked_mul(2)\n .unwrap_or(u32::MAX)\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo::triple",
|
||||
40,
|
||||
46,
|
||||
"impl Foo {\n pub fn triple(&self) -> u32 {\n self.value\n .checked_mul(3)\n .unwrap_or(u32::MAX)\n }\n}".to_string(),
|
||||
),
|
||||
("big_fn", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("rust".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("rust".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "code_rust_ast_v1.rs".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("rust".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-rust-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON serialization of the chunks produced for `fixed_doc()`
/// with the baseline `code-sample.chunks.snapshot.json` in `fixtures_dir()`.
///
/// When `UPDATE_SNAPSHOTS` is set (any value) the baseline is created or
/// rewritten instead of failing, including when the existing file cannot be
/// read or does not parse as JSON. Otherwise a missing, unreadable, malformed
/// or drifted baseline panics with a message naming which case occurred.
#[test]
fn code_rust_ast_chunks_snapshot() {
    let doc = fixed_doc();
    let policy = fixed_policy();

    let chunks = CodeRustAstV1Chunker.chunk(&doc, &policy).expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();
    // Rendered once; used both for (re)writing the baseline and for the drift report.
    let pretty = serde_json::to_string_pretty(&actual).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.chunks.snapshot.json");
    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();

    // Writes the current output as the baseline and logs what happened.
    let rebake = |verb: &str| {
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
        eprintln!("{verb} baseline {}", baseline_path.display());
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(_) if update_requested => {
            rebake("created");
            return;
        }
        // Only a genuinely absent file is reported as "missing".
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        ),
        Err(e) => panic!(
            "cannot read baseline {}; run with UPDATE_SNAPSHOTS=1 to re-create: {e}",
            baseline_path.display()
        ),
    };

    let expected: Value = match serde_json::from_str(&baseline_text) {
        Ok(value) => value,
        // A malformed baseline must not block an explicit re-bake.
        Err(_) if update_requested => {
            rebake("updated");
            return;
        }
        Err(e) => panic!(
            "baseline {} does not parse as json; run with UPDATE_SNAPSHOTS=1 to re-bake: {e}",
            baseline_path.display()
        ),
    };

    if actual == expected {
        return;
    }
    if update_requested {
        rebake("updated");
        return;
    }
    panic!(
        "code-rust-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism guard: chunking the same fixed document under the same policy
/// must produce an identical sequence of `chunk_id` strings on every run.
#[test]
fn code_rust_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run, reduced to the ordered list of chunk ids.
    let chunk_ids = || -> Vec<String> {
        let chunks = CodeRustAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap();
        let mut ids = Vec::new();
        for chunk in chunks {
            ids.push(chunk.chunk_id.0);
        }
        ids
    };

    let baseline = chunk_ids();
    let mut remaining = 5;
    while remaining > 0 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
        remaining -= 1;
    }
}
|
||||
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
270
crates/kebab-chunk/tests/code_text_paragraph_v1.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
//! Behavioural tests for `CodeTextParagraphV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing raw text into a single `Block::Code`, mirroring the pattern used
|
||||
//! in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTextParagraphV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `text`
|
||||
/// and the supplied `lang` label.
|
||||
fn text_doc(lang: &str, text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("scripts/sample.sh".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-text-paragraph-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some(lang.into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some(lang.into()),
|
||||
code: text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "sample.sh".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some(lang.into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-text-paragraph-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// The `sample_shell.sh` fixture holds four paragraphs separated by three
/// blank lines:
/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail)
/// - paragraph 2: lines 4-7 (env setup block)
/// - paragraph 3: lines 9-11 (ingest block)
/// - paragraph 4: lines 13-15 (report block)
///
/// Checked here:
/// - exactly 4 chunks come back, one per paragraph
/// - every span's symbol is None (Tier 3 spec §9.3)
/// - every span's lang is "shell"
/// - the line ranges equal the paragraph ranges above, so the blank lines
///   (3, 8, 12) belong to no chunk
#[test]
fn shell_multi_paragraph_splits_on_blank_lines() {
    let fixture_path = fixtures_dir().join("sample_shell.sh");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    let doc = text_doc("shell", &text);
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    let chunk_count = chunks.len();
    assert_eq!(
        chunk_count, 4,
        "expected 4 chunks (one per paragraph), got {}: {chunks:#?}",
        chunk_count
    );

    // First pass: no chunk may carry a symbol (Tier 3 requirement).
    for (i, chunk) in chunks.iter().enumerate() {
        let span = &chunk.source_spans[0];
        if let SourceSpan::Code { symbol, .. } = span {
            assert!(
                symbol.is_none(),
                "chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}"
            );
        } else {
            panic!("chunk[{i}]: expected Code span, got {span:?}");
        }
    }

    // Second pass: every chunk must be labelled "shell".
    for (i, chunk) in chunks.iter().enumerate() {
        let span = &chunk.source_spans[0];
        if let SourceSpan::Code { lang, .. } = span {
            assert_eq!(
                lang.as_deref(),
                Some("shell"),
                "chunk[{i}] lang must be 'shell', got {lang:?}"
            );
        } else {
            panic!("chunk[{i}]: expected Code span, got {span:?}");
        }
    }

    // The ranges must match the paragraphs exactly, which also rules out the
    // blank separator lines (3, 8, 12) appearing in any chunk.
    let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        let span = &chunk.source_spans[0];
        if let SourceSpan::Code {
            line_start,
            line_end,
            ..
        } = span
        {
            actual_ranges.push((*line_start, *line_end));
        } else {
            panic!("expected Code span, got {span:?}");
        }
    }

    assert_eq!(
        actual_ranges, expected_ranges,
        "line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );
}
|
||||
|
||||
/// The `sample_long_paragraph.txt` fixture is 200 non-blank lines with no
/// blank line, i.e. a single paragraph. Since 200 > 80
/// (FALLBACK_LINES_PER_CHUNK) the oversize window split applies, stride 60:
/// - window 1: lines 1-80
/// - window 2: lines 61-140
/// - window 3: lines 121-200
///
/// The resulting chunk_ids must all differ (the #L{window_start} split_key
/// suffix).
#[test]
fn single_long_paragraph_line_window_split() {
    let fixture_path = fixtures_dir().join("sample_long_paragraph.txt");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    // Guard the fixture itself before testing the chunker against it.
    let fixture_lines = text.lines().count();
    assert_eq!(fixture_lines, 200, "fixture must have exactly 200 lines");

    let doc = text_doc("shell", &text);
    let chunk_policy = policy();
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &chunk_policy)
        .expect("chunk");

    let chunk_count = chunks.len();
    assert_eq!(
        chunk_count, 3,
        "expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}",
        chunk_count
    );

    let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)];
    let mut actual_ranges: Vec<(u32, u32)> = Vec::with_capacity(chunks.len());
    for chunk in chunks.iter() {
        let span = &chunk.source_spans[0];
        if let SourceSpan::Code {
            line_start,
            line_end,
            ..
        } = span
        {
            actual_ranges.push((*line_start, *line_end));
        } else {
            panic!("expected Code span, got {span:?}");
        }
    }

    assert_eq!(
        actual_ranges, expected_ranges,
        "window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}"
    );

    // Distinctness check: a set of the ids is as large as the chunk list only
    // if no id repeats (#L{window_start} suffix differentiates them).
    let mut distinct_ids = std::collections::HashSet::new();
    for chunk in chunks.iter() {
        distinct_ids.insert(&chunk.chunk_id);
    }
    assert_eq!(
        distinct_ids.len(),
        chunks.len(),
        "oversize window chunks must have distinct chunk_ids"
    );
}
|
||||
|
||||
/// Chunking a document built from the empty string must succeed and produce
/// no chunks at all.
#[test]
fn empty_file_emits_zero_chunks() {
    let empty_doc = text_doc("shell", "");
    let chunk_policy = policy();

    let chunks = CodeTextParagraphV1Chunker
        .chunk(&empty_doc, &chunk_policy)
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        0,
        "empty file must yield 0 chunks, got {}: {chunks:#?}",
        chunks.len()
    );
}
|
||||
|
||||
/// The first emitted chunk's span must carry the same `lang` that was handed
/// to `text_doc` ("yaml" here), and its `symbol` must be `None`
/// (Tier 3 spec).
#[test]
fn lang_field_preserved_from_input_doc() {
    let doc = text_doc("yaml", "key1: value1\nkey2: value2\n");
    let chunks = CodeTextParagraphV1Chunker
        .chunk(&doc, &policy())
        .expect("chunk");

    assert!(!chunks.is_empty(), "expected at least one chunk");

    // Pull the two fields of interest out of the first span; any other span
    // variant is a test failure.
    let (lang, symbol) = match &chunks[0].source_spans[0] {
        SourceSpan::Code { lang, symbol, .. } => (lang, symbol),
        other => panic!("expected Code span, got {other:?}"),
    };

    assert_eq!(
        lang.as_deref(),
        Some("yaml"),
        "lang must be 'yaml', got {lang:?}"
    );
    assert!(
        symbol.is_none(),
        "symbol must be None for Tier 3 chunker, got {symbol:?}"
    );
}
|
||||
221
crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
Normal file
221
crates/kebab-chunk/tests/code_ts_ast_snapshot.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Snapshot test pinning the `Vec<Chunk>` JSON for a
|
||||
//! representative TypeScript code `CanonicalDocument`.
|
||||
//!
|
||||
//! This is an integration test. `kebab-parse-code` is intentionally NOT
|
||||
//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side).
|
||||
//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code`
|
||||
//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s
|
||||
//! internal `code_doc` test helper.
|
||||
//!
|
||||
//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::CodeTsAstV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
fn fixed_doc() -> CanonicalDocument {
|
||||
let wp = WorkspacePath("src/Foo.ts".into());
|
||||
let aid = AssetId("b".repeat(64));
|
||||
// Pin parser_version so doc_id / block_ids are reproducible.
|
||||
let pv = ParserVersion("code-ts-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
// Build a >200-line method body to force split_oversize.
|
||||
let big_body: String = {
|
||||
let header = "export class BigProcessor {\n process(items: string[]): string[] {\n";
|
||||
let body: String = (0..210u32)
|
||||
.map(|i| format!(" const v{i} = items[{i}] ?? '';\n"))
|
||||
.collect();
|
||||
let footer = " return items;\n }\n}";
|
||||
format!("{header}{body}{footer}")
|
||||
};
|
||||
let big_line_count = big_body.lines().count() as u32;
|
||||
let big_line_end = 48 + big_line_count - 1;
|
||||
|
||||
// Representative units:
|
||||
// 0. import block (lines 1–5, ≤200)
|
||||
// 1. free fn `parseInput` (lines 7–12, ≤200)
|
||||
// 2. interface `Frobable` (lines 14–20, ≤200)
|
||||
// 3. class `Foo` (lines 22–30, ≤200)
|
||||
// 4. method `Foo.double` (lines 32–38, ≤200)
|
||||
// 5. method `Foo.triple` (lines 40–46, ≤200)
|
||||
// 6. BigProcessor (>200 lines) to force split_oversize
|
||||
let raw_units: Vec<(&str, u32, u32, String)> = vec![
|
||||
(
|
||||
"imports",
|
||||
1,
|
||||
5,
|
||||
"import { readFileSync } from 'fs';\nimport { join } from 'path';\nimport type { Config } from './config';\nimport { Logger } from './logger';\nimport { EventEmitter } from 'events';".to_string(),
|
||||
),
|
||||
(
|
||||
"parseInput",
|
||||
7,
|
||||
12,
|
||||
"export function parseInput(raw: string): number | null {\n const trimmed = raw.trim();\n const n = Number(trimmed);\n if (isNaN(n)) return null;\n return n;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Frobable",
|
||||
14,
|
||||
20,
|
||||
"export interface Frobable {\n frob(): string;\n frobTwice(): string;\n readonly name: string;\n readonly tags: string[];\n count: number;\n reset(): void;\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo",
|
||||
22,
|
||||
30,
|
||||
"export class Foo implements Frobable {\n constructor(\n public readonly name: string,\n public value: number,\n public tags: string[] = [],\n ) {}\n frob(): string { return this.name; }\n frobTwice(): string { return this.name.repeat(2); }\n reset(): void { this.value = 0; }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo.double",
|
||||
32,
|
||||
38,
|
||||
"export class Foo {\n double(): number {\n const result = this.value * 2;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}".to_string(),
|
||||
),
|
||||
(
|
||||
"Foo.triple",
|
||||
40,
|
||||
46,
|
||||
"export class Foo {\n triple(): number {\n const result = this.value * 3;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}".to_string(),
|
||||
),
|
||||
("BigProcessor", 48, big_line_end, big_body),
|
||||
];
|
||||
|
||||
let blocks: Vec<Block> = raw_units
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, (sym, ls, le, code))| {
|
||||
let span = SourceSpan::Code {
|
||||
line_start: *ls,
|
||||
line_end: *le,
|
||||
symbol: Some((*sym).to_string()),
|
||||
lang: Some("typescript".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], i as u32, &span);
|
||||
Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("typescript".into()),
|
||||
code: code.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Foo.ts".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks,
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("typescript".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixed_policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("code-ts-ast-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares the JSON form of the chunks produced for `fixed_doc()` with the
/// baseline stored in `code-sample.ts.chunks.snapshot.json`.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, an unreadable
/// baseline is created and a drifted baseline is overwritten instead of
/// failing the test.
#[test]
fn code_ts_ast_chunks_snapshot() {
    let chunks = CodeTsAstV1Chunker
        .chunk(&fixed_doc(), &fixed_policy())
        .expect("chunk");
    let actual = serde_json::to_value(&chunks).unwrap();

    let dir = fixtures_dir();
    let baseline_path = dir.join("code-sample.ts.chunks.snapshot.json");

    // Only the presence of the variable matters, not its value.
    let rebake_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    // Writes the current output as pretty JSON followed by a newline.
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(contents) => contents,
        Err(e) => {
            if !rebake_requested() {
                panic!(
                    "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                    baseline_path.display()
                );
            }
            // First bake: make sure the fixtures directory exists, write, done.
            std::fs::create_dir_all(&dir).unwrap();
            write_baseline();
            return;
        }
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    if rebake_requested() {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }

    // Drift without a re-bake request: fail and show both sides.
    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "code-ts-ast-v1 chunks snapshot drift\n\
         --- expected ({}) ---\n{baseline_text}\n\
         --- actual ---\n{pretty}\n\
         If intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
|
||||
/// Determinism cross-check: chunking a freshly built `fixed_doc()` five more
/// times must reproduce exactly the chunk_id sequence of the first run.
#[test]
fn code_ts_ast_chunks_are_deterministic() {
    let policy = fixed_policy();

    // One full pipeline run, reduced to the ordered list of chunk_id strings.
    let chunk_ids = || -> Vec<String> {
        let mut ids = Vec::new();
        for chunk in CodeTsAstV1Chunker.chunk(&fixed_doc(), &policy).unwrap() {
            ids.push(chunk.chunk_id.0);
        }
        ids
    };

    let baseline = chunk_ids();
    for _ in 0..5 {
        let again = chunk_ids();
        assert_eq!(again, baseline);
    }
}
|
||||
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
138
crates/kebab-chunk/tests/dockerfile_file_v1.rs
Normal file
@@ -0,0 +1,138 @@
|
||||
//! Behavioural tests for `DockerfileFileV1Chunker`.
|
||||
//!
|
||||
//! Documents are constructed manually (no kebab-parse-code dependency) by
|
||||
//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the
|
||||
//! pattern used in `k8s_manifest_resource_v1.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kebab_chunk::DockerfileFileV1Chunker;
|
||||
use kebab_core::{
|
||||
AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock,
|
||||
CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel,
|
||||
WorkspacePath, id_for_block, id_for_doc,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests")
|
||||
.join("fixtures")
|
||||
}
|
||||
|
||||
/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`.
|
||||
fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument {
|
||||
let wp = WorkspacePath("build/Dockerfile".into());
|
||||
let aid = AssetId("d".repeat(64));
|
||||
let pv = ParserVersion("code-dockerfile-v1".into());
|
||||
let doc_id = id_for_doc(&wp, &aid, &pv);
|
||||
|
||||
let line_count = dockerfile_text.lines().count() as u32;
|
||||
let span = SourceSpan::Code {
|
||||
line_start: 1,
|
||||
line_end: line_count.max(1),
|
||||
symbol: None,
|
||||
lang: Some("dockerfile".into()),
|
||||
};
|
||||
let bid = id_for_block(&doc_id, "code", &[], 0, &span);
|
||||
let block = Block::Code(CodeBlock {
|
||||
common: CommonBlock {
|
||||
block_id: bid,
|
||||
heading_path: vec![],
|
||||
source_span: span,
|
||||
},
|
||||
lang: Some("dockerfile".into()),
|
||||
code: dockerfile_text.to_string(),
|
||||
});
|
||||
|
||||
CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: aid,
|
||||
workspace_path: wp,
|
||||
title: "Dockerfile".into(),
|
||||
lang: Lang("und".into()),
|
||||
blocks: vec![block],
|
||||
metadata: Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Note,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user: Default::default(),
|
||||
repo: Some("kebab".into()),
|
||||
git_branch: Some("main".into()),
|
||||
git_commit: Some("0".repeat(40)),
|
||||
code_lang: Some("dockerfile".into()),
|
||||
},
|
||||
provenance: Provenance { events: vec![] },
|
||||
parser_version: pv,
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
last_chunker_version: None,
|
||||
last_embedding_version: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn policy() -> ChunkPolicy {
|
||||
ChunkPolicy {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: false,
|
||||
chunker_version: ChunkerVersion("dockerfile-file-v1".into()),
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// The 5-line `sample.dockerfile` fixture must come back as exactly one
/// chunk whose first span covers lines 1–5, carries the symbol
/// `<dockerfile>` and the lang `dockerfile`, and whose chunker_version is
/// `dockerfile-file-v1`.
#[test]
fn dockerfile_emits_single_chunk() {
    let fixture_path = fixtures_dir().join("sample.dockerfile");
    let text = match std::fs::read_to_string(&fixture_path) {
        Ok(contents) => contents,
        Err(e) => panic!("cannot read fixture {}: {e}", fixture_path.display()),
    };

    let chunks = DockerfileFileV1Chunker
        .chunk(&dockerfile_doc(&text), &policy())
        .expect("chunk");

    assert_eq!(
        chunks.len(),
        1,
        "expected 1 chunk, got {}: {chunks:#?}",
        chunks.len()
    );
    let only_chunk = &chunks[0];

    // Destructure the first span up front; anything but a Code span fails.
    let first_span = only_chunk.source_spans.first().expect("at least one span");
    let (line_start, line_end, symbol, lang) = match first_span {
        SourceSpan::Code {
            line_start,
            line_end,
            symbol,
            lang,
        } => (*line_start, *line_end, symbol, lang),
        other => panic!("expected SourceSpan::Code, got {other:?}"),
    };

    assert_eq!(line_start, 1, "line_start must be 1");
    assert_eq!(line_end, 5, "line_end must be 5 (5-line fixture)");
    assert_eq!(
        symbol.as_deref(),
        Some("<dockerfile>"),
        "symbol must be '<dockerfile>'"
    );
    assert_eq!(
        lang.as_deref(),
        Some("dockerfile"),
        "lang must be 'dockerfile'"
    );

    // The chunk must be labelled with this chunker's version string.
    assert_eq!(only_chunk.chunker_version.0, "dockerfile-file-v1");
}
|
||||
90
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
90
crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,90 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"8149e12ca002489acb4a0f74c97a061a"
|
||||
],
|
||||
"chunk_id": "ec3cf06ae56c8e9796bbc9196438b7c5",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 18,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;",
|
||||
"token_estimate": 78,
|
||||
"tokenized_korean_text": "# include < stdio . h > # include < stdlib . h > # define MAX _ BUF 4096 typedef enum { OK = 0 , ERR _ PARSE , ERR _ IO , } status _ t ; typedef struct { int id ; char name [ 64 ]; status _ t status ; } record _ t ; static int counter = 0 ;"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"1baaa89f21a47b2f32d6396a24a85454"
|
||||
],
|
||||
"chunk_id": "c2d7a81c898106733ef2e703774a6a4a",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 23,
|
||||
"line_start": 20,
|
||||
"symbol": "parse_record"
|
||||
}
|
||||
],
|
||||
"text": "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}",
|
||||
"token_estimate": 41,
|
||||
"tokenized_korean_text": "int parse _ record ( const char * line , record _ t * out ) { if ( line == NULL || out == NULL ) return ERR _ PARSE ; return OK ; }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"8d0e14cbcc6d1e92d7878ab796ea68b8"
|
||||
],
|
||||
"chunk_id": "0e4d7b131ab64eba03b51903b5d8f96d",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 27,
|
||||
"line_start": 25,
|
||||
"symbol": "print_record"
|
||||
}
|
||||
],
|
||||
"text": "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}",
|
||||
"token_estimate": 35,
|
||||
"tokenized_korean_text": "void print _ record ( const record _ t * r ) { printf (\"[% d ] % s ( status =% d )\\ n \", r -> id , r -> name , r -> status ); }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"9c2ede84423871b615d48c38fefb1853"
|
||||
],
|
||||
"chunk_id": "e076f8edb2ff141d7e99b4106bb95157",
|
||||
"chunker_version": "code-c-ast-v1",
|
||||
"doc_id": "6bec42dd593920a060541db16c4e8e45",
|
||||
"heading_path": [],
|
||||
"policy_hash": "ecfad2ec1223662d",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "c",
|
||||
"line_end": 33,
|
||||
"line_start": 29,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}",
|
||||
"token_estimate": 38,
|
||||
"tokenized_korean_text": "int main ( void ) { record _ t r = { . id = 1 , . name = \" foo \", . status = OK }; print _ record (& r ); return 0 ; }"
|
||||
}
|
||||
]
|
||||
178
crates/kebab-chunk/tests/fixtures/code-sample.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
112
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
112
crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"53292605459065d170cd36c118e20546"
|
||||
],
|
||||
"chunk_id": "50a5b324300d9082eac4ce2a422810e1",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 4,
|
||||
"line_start": 1,
|
||||
"symbol": "<top-level>"
|
||||
}
|
||||
],
|
||||
"text": "#include <string>\n#include <vector>\n\nnamespace kebab {",
|
||||
"token_estimate": 18,
|
||||
"tokenized_korean_text": "# include < string > # include < vector > namespace kebab {"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"f349acad94c9fa4cf9ad1c0a93e83610"
|
||||
],
|
||||
"chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 20,
|
||||
"line_start": 6,
|
||||
"symbol": "kebab::chunk::MdHeadingV1Chunker"
|
||||
}
|
||||
],
|
||||
"text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};",
|
||||
"token_estimate": 95,
|
||||
"tokenized_korean_text": "class MdHeadingV 1 Chunker { public : MdHeadingV 1 Chunker ( ) = default ; ~ MdHeadingV 1 Chunker ( ) = default ; std : : string chunk _ doc ( const std : : string & doc ) { return doc ; } int operator ( ) ( int x ) const { return x * 2 ; } private : int counter _ = 0 ; };"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"8b9811387717d0bd4abf84abcc35b8b1"
|
||||
],
|
||||
"chunk_id": "d9326d252905b665b2adb9a416c20451",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 25,
|
||||
"line_start": 22,
|
||||
"symbol": "kebab::identity"
|
||||
}
|
||||
],
|
||||
"text": "template <typename T>\nT identity(T value) {\n return value;\n}",
|
||||
"token_estimate": 21,
|
||||
"tokenized_korean_text": "template < typename T > T identity ( T value ) { return value ; }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"1754cb6b971f6a4cb292f144a4f0570b"
|
||||
],
|
||||
"chunk_id": "56ee5f991de4a413c016da8dc4acfc35",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 29,
|
||||
"line_start": 27,
|
||||
"symbol": "kebab::global_helper"
|
||||
}
|
||||
],
|
||||
"text": "void global_helper() {\n // free function in kebab namespace\n}",
|
||||
"token_estimate": 22,
|
||||
"tokenized_korean_text": "void global _ helper ( ) { / / free function in kebab namespace }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"14b5f3393d6d25f822f5b70763d24acd"
|
||||
],
|
||||
"chunk_id": "c0d7c043cdd575c530db3909b54cc906",
|
||||
"chunker_version": "code-cpp-ast-v1",
|
||||
"doc_id": "fff1e1f0a7ff70ef682937470e5d1d28",
|
||||
"heading_path": [],
|
||||
"policy_hash": "71f3c07bb9ec1d09",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "cpp",
|
||||
"line_end": 34,
|
||||
"line_start": 31,
|
||||
"symbol": "main"
|
||||
}
|
||||
],
|
||||
"text": "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}",
|
||||
"token_estimate": 23,
|
||||
"tokenized_korean_text": "int main ( ) { kebab : : chunk : : MdHeadingV 1 Chunker c ; return 0 ; }"
|
||||
}
|
||||
]
|
||||
244
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
244
crates/kebab-chunk/tests/fixtures/code-sample.go.chunks.snapshot.json
vendored
Normal file
@@ -0,0 +1,244 @@
|
||||
[
|
||||
{
|
||||
"block_ids": [
|
||||
"c182bf37e32c7fc1b868bd617f8eaf66"
|
||||
],
|
||||
"chunk_id": "43de518d946dc18ec040ae20d74e0cff",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 5,
|
||||
"line_start": 1,
|
||||
"symbol": "imports"
|
||||
}
|
||||
],
|
||||
"text": "import (\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n)",
|
||||
"token_estimate": 12,
|
||||
"tokenized_korean_text": "import ( \" fmt \" \" os \" \" strings \" )"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"c9992cdcfdf3c2a7700a4abc4782a8a4"
|
||||
],
|
||||
"chunk_id": "af4c382a83f1e8cdea495d8b33c11abc",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 12,
|
||||
"line_start": 7,
|
||||
"symbol": "ComputeMRR"
|
||||
}
|
||||
],
|
||||
"text": "func ComputeMRR(scores []float64) float64 {\n\tif len(scores) == 0 {\n\t\treturn 0.0\n\t}\n\t_ = fmt.Sprintf(\"%v\", scores)\n\treturn 1.0 / float64(len(scores))\n}",
|
||||
"token_estimate": 50,
|
||||
"tokenized_korean_text": "func ComputeMRR ( scores [ ] float 64 ) float 64 { if len ( scores ) == 0 { return 0 . 0 } _ = fmt . Sprintf (\"% v \", scores ) return 1 . 0 / float 64 ( len ( scores ) ) }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5f18dc3e79fe946ba05d32c3bfc00684"
|
||||
],
|
||||
"chunk_id": "4be6d8f180bc19b8651877e5264852ac",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 20,
|
||||
"line_start": 14,
|
||||
"symbol": "MetricsCollector"
|
||||
}
|
||||
],
|
||||
"text": "type MetricsCollector struct {\n\tScores []float64\n\tLabels []string\n\tCounts map[string]int\n\tTotals map[string]float64\n\tTags []string\n}",
|
||||
"token_estimate": 45,
|
||||
"tokenized_korean_text": "type MetricsCollector struct { Scores [ ] float 64 Labels [ ] string Counts map [ string ] int Totals map [ string ] float 64 Tags [ ] string }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"3009cc022ca832c323393e4f9bcdb388"
|
||||
],
|
||||
"chunk_id": "3ae182f4c6d304ee7f0aaf447142f948",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 30,
|
||||
"line_start": 22,
|
||||
"symbol": "BaseEvaluator"
|
||||
}
|
||||
],
|
||||
"text": "type BaseEvaluator struct {\n\tName string\n}\n\nfunc (e *BaseEvaluator) Evaluate(data []string) error {\n\t_ = os.Stderr\n\t_ = strings.Join(data, \",\")\n\treturn nil\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "type BaseEvaluator struct { Name string } func ( e * BaseEvaluator ) Evaluate ( data [ ] string ) error { _ = os . Stderr _ = strings . Join ( data , \",\") return nil }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"e0e83d1d7f9327a1902ae9a8f67c1f1c"
|
||||
],
|
||||
"chunk_id": "b962f14980e756bb8ba514e2282756cd",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 38,
|
||||
"line_start": 32,
|
||||
"symbol": "MetricsCollector.Run"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Run(inputs []float64) {\n\tfor _, inp := range inputs {\n\t\tm.Scores = append(\n\t\t\tm.Scores,\n\t\t\tinp,\n\t\t)\n\t}\n}",
|
||||
"token_estimate": 44,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Run ( inputs [ ] float 64 ) { for _, inp := range inputs { m . Scores = append ( m . Scores , inp , ) } }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"0e6a572bc3fe2bd6d173fe614bd1b763"
|
||||
],
|
||||
"chunk_id": "441c695e990e7f49188068433e313e87",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 46,
|
||||
"line_start": 40,
|
||||
"symbol": "MetricsCollector.Report"
|
||||
}
|
||||
],
|
||||
"text": "func (m *MetricsCollector) Report() map[string]interface{} {\n\treturn map[string]interface{}{\n\t\t\"mean\": 0.0,\n\t\t\"count\": len(m.Scores),\n\t\t\"tags\": m.Tags,\n\t}\n}",
|
||||
"token_estimate": 53,
|
||||
"tokenized_korean_text": "func ( m * MetricsCollector ) Report ( ) map [ string ] interface {} { return map [ string ] interface {}{ \" mean \": 0 . 0 , \" count \": len ( m . Scores ) , \" tags \": m . Tags , } }"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "7a942d871c588ec69426290561f05179",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 247,
|
||||
"line_start": 48,
|
||||
"symbol": "BigCompute [part 1/5]"
|
||||
}
|
||||
],
|
||||
"text": "func BigCompute(data []int) int {\n\tv0 := 0\n\tif 0 < len(data) {\n\t\tv0 = data[0]\n\t}\n\tv1 := 0\n\tif 1 < len(data) {\n\t\tv1 = data[1]\n\t}\n\tv2 := 0\n\tif 2 < len(data) {\n\t\tv2 = data[2]\n\t}\n\tv3 := 0\n\tif 3 < len(data) {\n\t\tv3 = data[3]\n\t}\n\tv4 := 0\n\tif 4 < len(data) {\n\t\tv4 = data[4]\n\t}\n\tv5 := 0\n\tif 5 < len(data) {\n\t\tv5 = data[5]\n\t}\n\tv6 := 0\n\tif 6 < len(data) {\n\t\tv6 = data[6]\n\t}\n\tv7 := 0\n\tif 7 < len(data) {\n\t\tv7 = data[7]\n\t}\n\tv8 := 0\n\tif 8 < len(data) {\n\t\tv8 = data[8]\n\t}\n\tv9 := 0\n\tif 9 < len(data) {\n\t\tv9 = data[9]\n\t}\n\tv10 := 0\n\tif 10 < len(data) {\n\t\tv10 = data[10]\n\t}\n\tv11 := 0\n\tif 11 < len(data) {\n\t\tv11 = data[11]\n\t}\n\tv12 := 0\n\tif 12 < len(data) {\n\t\tv12 = data[12]\n\t}\n\tv13 := 0\n\tif 13 < len(data) {\n\t\tv13 = data[13]\n\t}\n\tv14 := 0\n\tif 14 < len(data) {\n\t\tv14 = data[14]\n\t}\n\tv15 := 0\n\tif 15 < len(data) {\n\t\tv15 = data[15]\n\t}\n\tv16 := 0\n\tif 16 < len(data) {\n\t\tv16 = data[16]\n\t}\n\tv17 := 0\n\tif 17 < len(data) {\n\t\tv17 = data[17]\n\t}\n\tv18 := 0\n\tif 18 < len(data) {\n\t\tv18 = data[18]\n\t}\n\tv19 := 0\n\tif 19 < len(data) {\n\t\tv19 = data[19]\n\t}\n\tv20 := 0\n\tif 20 < len(data) {\n\t\tv20 = data[20]\n\t}\n\tv21 := 0\n\tif 21 < len(data) {\n\t\tv21 = data[21]\n\t}\n\tv22 := 0\n\tif 22 < len(data) {\n\t\tv22 = data[22]\n\t}\n\tv23 := 0\n\tif 23 < len(data) {\n\t\tv23 = data[23]\n\t}\n\tv24 := 0\n\tif 24 < len(data) {\n\t\tv24 = data[24]\n\t}\n\tv25 := 0\n\tif 25 < len(data) {\n\t\tv25 = data[25]\n\t}\n\tv26 := 0\n\tif 26 < len(data) {\n\t\tv26 = data[26]\n\t}\n\tv27 := 0\n\tif 27 < len(data) {\n\t\tv27 = data[27]\n\t}\n\tv28 := 0\n\tif 28 < len(data) {\n\t\tv28 = data[28]\n\t}\n\tv29 := 0\n\tif 29 < len(data) {\n\t\tv29 = data[29]\n\t}\n\tv30 := 0\n\tif 30 < len(data) {\n\t\tv30 = data[30]\n\t}\n\tv31 := 0\n\tif 31 < len(data) {\n\t\tv31 = data[31]\n\t}\n\tv32 := 0\n\tif 32 < len(data) {\n\t\tv32 = data[32]\n\t}\n\tv33 := 
0\n\tif 33 < len(data) {\n\t\tv33 = data[33]\n\t}\n\tv34 := 0\n\tif 34 < len(data) {\n\t\tv34 = data[34]\n\t}\n\tv35 := 0\n\tif 35 < len(data) {\n\t\tv35 = data[35]\n\t}\n\tv36 := 0\n\tif 36 < len(data) {\n\t\tv36 = data[36]\n\t}\n\tv37 := 0\n\tif 37 < len(data) {\n\t\tv37 = data[37]\n\t}\n\tv38 := 0\n\tif 38 < len(data) {\n\t\tv38 = data[38]\n\t}\n\tv39 := 0\n\tif 39 < len(data) {\n\t\tv39 = data[39]\n\t}\n\tv40 := 0\n\tif 40 < len(data) {\n\t\tv40 = data[40]\n\t}\n\tv41 := 0\n\tif 41 < len(data) {\n\t\tv41 = data[41]\n\t}\n\tv42 := 0\n\tif 42 < len(data) {\n\t\tv42 = data[42]\n\t}\n\tv43 := 0\n\tif 43 < len(data) {\n\t\tv43 = data[43]\n\t}\n\tv44 := 0\n\tif 44 < len(data) {\n\t\tv44 = data[44]\n\t}\n\tv45 := 0\n\tif 45 < len(data) {\n\t\tv45 = data[45]\n\t}\n\tv46 := 0\n\tif 46 < len(data) {\n\t\tv46 = data[46]\n\t}\n\tv47 := 0\n\tif 47 < len(data) {\n\t\tv47 = data[47]\n\t}\n\tv48 := 0\n\tif 48 < len(data) {\n\t\tv48 = data[48]\n\t}\n\tv49 := 0\n\tif 49 < len(data) {\n\t\tv49 = data[49]",
|
||||
"token_estimate": 847,
|
||||
"tokenized_korean_text": "func BigCompute ( data [ ] int ) int { v 0 := 0 if 0 < len ( data ) { v 0 = data [ 0 ] } v 1 := 0 if 1 < len ( data ) { v 1 = data [ 1 ] } v 2 := 0 if 2 < len ( data ) { v 2 = data [ 2 ] } v 3 := 0 if 3 < len ( data ) { v 3 = data [ 3 ] } v 4 := 0 if 4 < len ( data ) { v 4 = data [ 4 ] } v 5 := 0 if 5 < len ( data ) { v 5 = data [ 5 ] } v 6 := 0 if 6 < len ( data ) { v 6 = data [ 6 ] } v 7 := 0 if 7 < len ( data ) { v 7 = data [ 7 ] } v 8 := 0 if 8 < len ( data ) { v 8 = data [ 8 ] } v 9 := 0 if 9 < len ( data ) { v 9 = data [ 9 ] } v 10 := 0 if 10 < len ( data ) { v 10 = data [ 10 ] } v 11 := 0 if 11 < len ( data ) { v 11 = data [ 11 ] } v 12 := 0 if 12 < len ( data ) { v 12 = data [ 12 ] } v 13 := 0 if 13 < len ( data ) { v 13 = data [ 13 ] } v 14 := 0 if 14 < len ( data ) { v 14 = data [ 14 ] } v 15 := 0 if 15 < len ( data ) { v 15 = data [ 15 ] } v 16 := 0 if 16 < len ( data ) { v 16 = data [ 16 ] } v 17 := 0 if 17 < len ( data ) { v 17 = data [ 17 ] } v 18 := 0 if 18 < len ( data ) { v 18 = data [ 18 ] } v 19 := 0 if 19 < len ( data ) { v 19 = data [ 19 ] } v 20 := 0 if 20 < len ( data ) { v 20 = data [ 20 ] } v 21 := 0 if 21 < len ( data ) { v 21 = data [ 21 ] } v 22 := 0 if 22 < len ( data ) { v 22 = data [ 22 ] } v 23 := 0 if 23 < len ( data ) { v 23 = data [ 23 ] } v 24 := 0 if 24 < len ( data ) { v 24 = data [ 24 ] } v 25 := 0 if 25 < len ( data ) { v 25 = data [ 25 ] } v 26 := 0 if 26 < len ( data ) { v 26 = data [ 26 ] } v 27 := 0 if 27 < len ( data ) { v 27 = data [ 27 ] } v 28 := 0 if 28 < len ( data ) { v 28 = data [ 28 ] } v 29 := 0 if 29 < len ( data ) { v 29 = data [ 29 ] } v 30 := 0 if 30 < len ( data ) { v 30 = data [ 30 ] } v 31 := 0 if 31 < len ( data ) { v 31 = data [ 31 ] } v 32 := 0 if 32 < len ( data ) { v 32 = data [ 32 ] } v 33 := 0 if 33 < len ( data ) { v 33 = data [ 33 ] } v 34 := 0 if 34 < len ( data ) { v 34 = data [ 34 ] } v 35 := 0 if 35 < len ( data ) { v 35 = data [ 35 ] } v 36 := 0 if 36 < len ( data ) 
{ v 36 = data [ 36 ] } v 37 := 0 if 37 < len ( data ) { v 37 = data [ 37 ] } v 38 := 0 if 38 < len ( data ) { v 38 = data [ 38 ] } v 39 := 0 if 39 < len ( data ) { v 39 = data [ 39 ] } v 40 := 0 if 40 < len ( data ) { v 40 = data [ 40 ] } v 41 := 0 if 41 < len ( data ) { v 41 = data [ 41 ] } v 42 := 0 if 42 < len ( data ) { v 42 = data [ 42 ] } v 43 := 0 if 43 < len ( data ) { v 43 = data [ 43 ] } v 44 := 0 if 44 < len ( data ) { v 44 = data [ 44 ] } v 45 := 0 if 45 < len ( data ) { v 45 = data [ 45 ] } v 46 := 0 if 46 < len ( data ) { v 46 = data [ 46 ] } v 47 := 0 if 47 < len ( data ) { v 47 = data [ 47 ] } v 48 := 0 if 48 < len ( data ) { v 48 = data [ 48 ] } v 49 := 0 if 49 < len ( data ) { v 49 = data [ 49 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "3f44ba43c9415652e2705bb667776e76",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 447,
|
||||
"line_start": 248,
|
||||
"symbol": "BigCompute [part 2/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv50 := 0\n\tif 50 < len(data) {\n\t\tv50 = data[50]\n\t}\n\tv51 := 0\n\tif 51 < len(data) {\n\t\tv51 = data[51]\n\t}\n\tv52 := 0\n\tif 52 < len(data) {\n\t\tv52 = data[52]\n\t}\n\tv53 := 0\n\tif 53 < len(data) {\n\t\tv53 = data[53]\n\t}\n\tv54 := 0\n\tif 54 < len(data) {\n\t\tv54 = data[54]\n\t}\n\tv55 := 0\n\tif 55 < len(data) {\n\t\tv55 = data[55]\n\t}\n\tv56 := 0\n\tif 56 < len(data) {\n\t\tv56 = data[56]\n\t}\n\tv57 := 0\n\tif 57 < len(data) {\n\t\tv57 = data[57]\n\t}\n\tv58 := 0\n\tif 58 < len(data) {\n\t\tv58 = data[58]\n\t}\n\tv59 := 0\n\tif 59 < len(data) {\n\t\tv59 = data[59]\n\t}\n\tv60 := 0\n\tif 60 < len(data) {\n\t\tv60 = data[60]\n\t}\n\tv61 := 0\n\tif 61 < len(data) {\n\t\tv61 = data[61]\n\t}\n\tv62 := 0\n\tif 62 < len(data) {\n\t\tv62 = data[62]\n\t}\n\tv63 := 0\n\tif 63 < len(data) {\n\t\tv63 = data[63]\n\t}\n\tv64 := 0\n\tif 64 < len(data) {\n\t\tv64 = data[64]\n\t}\n\tv65 := 0\n\tif 65 < len(data) {\n\t\tv65 = data[65]\n\t}\n\tv66 := 0\n\tif 66 < len(data) {\n\t\tv66 = data[66]\n\t}\n\tv67 := 0\n\tif 67 < len(data) {\n\t\tv67 = data[67]\n\t}\n\tv68 := 0\n\tif 68 < len(data) {\n\t\tv68 = data[68]\n\t}\n\tv69 := 0\n\tif 69 < len(data) {\n\t\tv69 = data[69]\n\t}\n\tv70 := 0\n\tif 70 < len(data) {\n\t\tv70 = data[70]\n\t}\n\tv71 := 0\n\tif 71 < len(data) {\n\t\tv71 = data[71]\n\t}\n\tv72 := 0\n\tif 72 < len(data) {\n\t\tv72 = data[72]\n\t}\n\tv73 := 0\n\tif 73 < len(data) {\n\t\tv73 = data[73]\n\t}\n\tv74 := 0\n\tif 74 < len(data) {\n\t\tv74 = data[74]\n\t}\n\tv75 := 0\n\tif 75 < len(data) {\n\t\tv75 = data[75]\n\t}\n\tv76 := 0\n\tif 76 < len(data) {\n\t\tv76 = data[76]\n\t}\n\tv77 := 0\n\tif 77 < len(data) {\n\t\tv77 = data[77]\n\t}\n\tv78 := 0\n\tif 78 < len(data) {\n\t\tv78 = data[78]\n\t}\n\tv79 := 0\n\tif 79 < len(data) {\n\t\tv79 = data[79]\n\t}\n\tv80 := 0\n\tif 80 < len(data) {\n\t\tv80 = data[80]\n\t}\n\tv81 := 0\n\tif 81 < len(data) {\n\t\tv81 = data[81]\n\t}\n\tv82 := 0\n\tif 82 < len(data) {\n\t\tv82 = data[82]\n\t}\n\tv83 
:= 0\n\tif 83 < len(data) {\n\t\tv83 = data[83]\n\t}\n\tv84 := 0\n\tif 84 < len(data) {\n\t\tv84 = data[84]\n\t}\n\tv85 := 0\n\tif 85 < len(data) {\n\t\tv85 = data[85]\n\t}\n\tv86 := 0\n\tif 86 < len(data) {\n\t\tv86 = data[86]\n\t}\n\tv87 := 0\n\tif 87 < len(data) {\n\t\tv87 = data[87]\n\t}\n\tv88 := 0\n\tif 88 < len(data) {\n\t\tv88 = data[88]\n\t}\n\tv89 := 0\n\tif 89 < len(data) {\n\t\tv89 = data[89]\n\t}\n\tv90 := 0\n\tif 90 < len(data) {\n\t\tv90 = data[90]\n\t}\n\tv91 := 0\n\tif 91 < len(data) {\n\t\tv91 = data[91]\n\t}\n\tv92 := 0\n\tif 92 < len(data) {\n\t\tv92 = data[92]\n\t}\n\tv93 := 0\n\tif 93 < len(data) {\n\t\tv93 = data[93]\n\t}\n\tv94 := 0\n\tif 94 < len(data) {\n\t\tv94 = data[94]\n\t}\n\tv95 := 0\n\tif 95 < len(data) {\n\t\tv95 = data[95]\n\t}\n\tv96 := 0\n\tif 96 < len(data) {\n\t\tv96 = data[96]\n\t}\n\tv97 := 0\n\tif 97 < len(data) {\n\t\tv97 = data[97]\n\t}\n\tv98 := 0\n\tif 98 < len(data) {\n\t\tv98 = data[98]\n\t}\n\tv99 := 0\n\tif 99 < len(data) {\n\t\tv99 = data[99]",
|
||||
"token_estimate": 850,
|
||||
"tokenized_korean_text": "} v 50 := 0 if 50 < len ( data ) { v 50 = data [ 50 ] } v 51 := 0 if 51 < len ( data ) { v 51 = data [ 51 ] } v 52 := 0 if 52 < len ( data ) { v 52 = data [ 52 ] } v 53 := 0 if 53 < len ( data ) { v 53 = data [ 53 ] } v 54 := 0 if 54 < len ( data ) { v 54 = data [ 54 ] } v 55 := 0 if 55 < len ( data ) { v 55 = data [ 55 ] } v 56 := 0 if 56 < len ( data ) { v 56 = data [ 56 ] } v 57 := 0 if 57 < len ( data ) { v 57 = data [ 57 ] } v 58 := 0 if 58 < len ( data ) { v 58 = data [ 58 ] } v 59 := 0 if 59 < len ( data ) { v 59 = data [ 59 ] } v 60 := 0 if 60 < len ( data ) { v 60 = data [ 60 ] } v 61 := 0 if 61 < len ( data ) { v 61 = data [ 61 ] } v 62 := 0 if 62 < len ( data ) { v 62 = data [ 62 ] } v 63 := 0 if 63 < len ( data ) { v 63 = data [ 63 ] } v 64 := 0 if 64 < len ( data ) { v 64 = data [ 64 ] } v 65 := 0 if 65 < len ( data ) { v 65 = data [ 65 ] } v 66 := 0 if 66 < len ( data ) { v 66 = data [ 66 ] } v 67 := 0 if 67 < len ( data ) { v 67 = data [ 67 ] } v 68 := 0 if 68 < len ( data ) { v 68 = data [ 68 ] } v 69 := 0 if 69 < len ( data ) { v 69 = data [ 69 ] } v 70 := 0 if 70 < len ( data ) { v 70 = data [ 70 ] } v 71 := 0 if 71 < len ( data ) { v 71 = data [ 71 ] } v 72 := 0 if 72 < len ( data ) { v 72 = data [ 72 ] } v 73 := 0 if 73 < len ( data ) { v 73 = data [ 73 ] } v 74 := 0 if 74 < len ( data ) { v 74 = data [ 74 ] } v 75 := 0 if 75 < len ( data ) { v 75 = data [ 75 ] } v 76 := 0 if 76 < len ( data ) { v 76 = data [ 76 ] } v 77 := 0 if 77 < len ( data ) { v 77 = data [ 77 ] } v 78 := 0 if 78 < len ( data ) { v 78 = data [ 78 ] } v 79 := 0 if 79 < len ( data ) { v 79 = data [ 79 ] } v 80 := 0 if 80 < len ( data ) { v 80 = data [ 80 ] } v 81 := 0 if 81 < len ( data ) { v 81 = data [ 81 ] } v 82 := 0 if 82 < len ( data ) { v 82 = data [ 82 ] } v 83 := 0 if 83 < len ( data ) { v 83 = data [ 83 ] } v 84 := 0 if 84 < len ( data ) { v 84 = data [ 84 ] } v 85 := 0 if 85 < len ( data ) { v 85 = data [ 85 ] } v 86 := 0 if 86 < len ( 
data ) { v 86 = data [ 86 ] } v 87 := 0 if 87 < len ( data ) { v 87 = data [ 87 ] } v 88 := 0 if 88 < len ( data ) { v 88 = data [ 88 ] } v 89 := 0 if 89 < len ( data ) { v 89 = data [ 89 ] } v 90 := 0 if 90 < len ( data ) { v 90 = data [ 90 ] } v 91 := 0 if 91 < len ( data ) { v 91 = data [ 91 ] } v 92 := 0 if 92 < len ( data ) { v 92 = data [ 92 ] } v 93 := 0 if 93 < len ( data ) { v 93 = data [ 93 ] } v 94 := 0 if 94 < len ( data ) { v 94 = data [ 94 ] } v 95 := 0 if 95 < len ( data ) { v 95 = data [ 95 ] } v 96 := 0 if 96 < len ( data ) { v 96 = data [ 96 ] } v 97 := 0 if 97 < len ( data ) { v 97 = data [ 97 ] } v 98 := 0 if 98 < len ( data ) { v 98 = data [ 98 ] } v 99 := 0 if 99 < len ( data ) { v 99 = data [ 99 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "e4763e10f059d97f40c2932761b56c3e",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 647,
|
||||
"line_start": 448,
|
||||
"symbol": "BigCompute [part 3/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv100 := 0\n\tif 100 < len(data) {\n\t\tv100 = data[100]\n\t}\n\tv101 := 0\n\tif 101 < len(data) {\n\t\tv101 = data[101]\n\t}\n\tv102 := 0\n\tif 102 < len(data) {\n\t\tv102 = data[102]\n\t}\n\tv103 := 0\n\tif 103 < len(data) {\n\t\tv103 = data[103]\n\t}\n\tv104 := 0\n\tif 104 < len(data) {\n\t\tv104 = data[104]\n\t}\n\tv105 := 0\n\tif 105 < len(data) {\n\t\tv105 = data[105]\n\t}\n\tv106 := 0\n\tif 106 < len(data) {\n\t\tv106 = data[106]\n\t}\n\tv107 := 0\n\tif 107 < len(data) {\n\t\tv107 = data[107]\n\t}\n\tv108 := 0\n\tif 108 < len(data) {\n\t\tv108 = data[108]\n\t}\n\tv109 := 0\n\tif 109 < len(data) {\n\t\tv109 = data[109]\n\t}\n\tv110 := 0\n\tif 110 < len(data) {\n\t\tv110 = data[110]\n\t}\n\tv111 := 0\n\tif 111 < len(data) {\n\t\tv111 = data[111]\n\t}\n\tv112 := 0\n\tif 112 < len(data) {\n\t\tv112 = data[112]\n\t}\n\tv113 := 0\n\tif 113 < len(data) {\n\t\tv113 = data[113]\n\t}\n\tv114 := 0\n\tif 114 < len(data) {\n\t\tv114 = data[114]\n\t}\n\tv115 := 0\n\tif 115 < len(data) {\n\t\tv115 = data[115]\n\t}\n\tv116 := 0\n\tif 116 < len(data) {\n\t\tv116 = data[116]\n\t}\n\tv117 := 0\n\tif 117 < len(data) {\n\t\tv117 = data[117]\n\t}\n\tv118 := 0\n\tif 118 < len(data) {\n\t\tv118 = data[118]\n\t}\n\tv119 := 0\n\tif 119 < len(data) {\n\t\tv119 = data[119]\n\t}\n\tv120 := 0\n\tif 120 < len(data) {\n\t\tv120 = data[120]\n\t}\n\tv121 := 0\n\tif 121 < len(data) {\n\t\tv121 = data[121]\n\t}\n\tv122 := 0\n\tif 122 < len(data) {\n\t\tv122 = data[122]\n\t}\n\tv123 := 0\n\tif 123 < len(data) {\n\t\tv123 = data[123]\n\t}\n\tv124 := 0\n\tif 124 < len(data) {\n\t\tv124 = data[124]\n\t}\n\tv125 := 0\n\tif 125 < len(data) {\n\t\tv125 = data[125]\n\t}\n\tv126 := 0\n\tif 126 < len(data) {\n\t\tv126 = data[126]\n\t}\n\tv127 := 0\n\tif 127 < len(data) {\n\t\tv127 = data[127]\n\t}\n\tv128 := 0\n\tif 128 < len(data) {\n\t\tv128 = data[128]\n\t}\n\tv129 := 0\n\tif 129 < len(data) {\n\t\tv129 = data[129]\n\t}\n\tv130 := 0\n\tif 130 < len(data) {\n\t\tv130 = 
data[130]\n\t}\n\tv131 := 0\n\tif 131 < len(data) {\n\t\tv131 = data[131]\n\t}\n\tv132 := 0\n\tif 132 < len(data) {\n\t\tv132 = data[132]\n\t}\n\tv133 := 0\n\tif 133 < len(data) {\n\t\tv133 = data[133]\n\t}\n\tv134 := 0\n\tif 134 < len(data) {\n\t\tv134 = data[134]\n\t}\n\tv135 := 0\n\tif 135 < len(data) {\n\t\tv135 = data[135]\n\t}\n\tv136 := 0\n\tif 136 < len(data) {\n\t\tv136 = data[136]\n\t}\n\tv137 := 0\n\tif 137 < len(data) {\n\t\tv137 = data[137]\n\t}\n\tv138 := 0\n\tif 138 < len(data) {\n\t\tv138 = data[138]\n\t}\n\tv139 := 0\n\tif 139 < len(data) {\n\t\tv139 = data[139]\n\t}\n\tv140 := 0\n\tif 140 < len(data) {\n\t\tv140 = data[140]\n\t}\n\tv141 := 0\n\tif 141 < len(data) {\n\t\tv141 = data[141]\n\t}\n\tv142 := 0\n\tif 142 < len(data) {\n\t\tv142 = data[142]\n\t}\n\tv143 := 0\n\tif 143 < len(data) {\n\t\tv143 = data[143]\n\t}\n\tv144 := 0\n\tif 144 < len(data) {\n\t\tv144 = data[144]\n\t}\n\tv145 := 0\n\tif 145 < len(data) {\n\t\tv145 = data[145]\n\t}\n\tv146 := 0\n\tif 146 < len(data) {\n\t\tv146 = data[146]\n\t}\n\tv147 := 0\n\tif 147 < len(data) {\n\t\tv147 = data[147]\n\t}\n\tv148 := 0\n\tif 148 < len(data) {\n\t\tv148 = data[148]\n\t}\n\tv149 := 0\n\tif 149 < len(data) {\n\t\tv149 = data[149]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 100 := 0 if 100 < len ( data ) { v 100 = data [ 100 ] } v 101 := 0 if 101 < len ( data ) { v 101 = data [ 101 ] } v 102 := 0 if 102 < len ( data ) { v 102 = data [ 102 ] } v 103 := 0 if 103 < len ( data ) { v 103 = data [ 103 ] } v 104 := 0 if 104 < len ( data ) { v 104 = data [ 104 ] } v 105 := 0 if 105 < len ( data ) { v 105 = data [ 105 ] } v 106 := 0 if 106 < len ( data ) { v 106 = data [ 106 ] } v 107 := 0 if 107 < len ( data ) { v 107 = data [ 107 ] } v 108 := 0 if 108 < len ( data ) { v 108 = data [ 108 ] } v 109 := 0 if 109 < len ( data ) { v 109 = data [ 109 ] } v 110 := 0 if 110 < len ( data ) { v 110 = data [ 110 ] } v 111 := 0 if 111 < len ( data ) { v 111 = data [ 111 ] } v 112 := 0 if 112 < len ( data ) { v 112 = data [ 112 ] } v 113 := 0 if 113 < len ( data ) { v 113 = data [ 113 ] } v 114 := 0 if 114 < len ( data ) { v 114 = data [ 114 ] } v 115 := 0 if 115 < len ( data ) { v 115 = data [ 115 ] } v 116 := 0 if 116 < len ( data ) { v 116 = data [ 116 ] } v 117 := 0 if 117 < len ( data ) { v 117 = data [ 117 ] } v 118 := 0 if 118 < len ( data ) { v 118 = data [ 118 ] } v 119 := 0 if 119 < len ( data ) { v 119 = data [ 119 ] } v 120 := 0 if 120 < len ( data ) { v 120 = data [ 120 ] } v 121 := 0 if 121 < len ( data ) { v 121 = data [ 121 ] } v 122 := 0 if 122 < len ( data ) { v 122 = data [ 122 ] } v 123 := 0 if 123 < len ( data ) { v 123 = data [ 123 ] } v 124 := 0 if 124 < len ( data ) { v 124 = data [ 124 ] } v 125 := 0 if 125 < len ( data ) { v 125 = data [ 125 ] } v 126 := 0 if 126 < len ( data ) { v 126 = data [ 126 ] } v 127 := 0 if 127 < len ( data ) { v 127 = data [ 127 ] } v 128 := 0 if 128 < len ( data ) { v 128 = data [ 128 ] } v 129 := 0 if 129 < len ( data ) { v 129 = data [ 129 ] } v 130 := 0 if 130 < len ( data ) { v 130 = data [ 130 ] } v 131 := 0 if 131 < len ( data ) { v 131 = data [ 131 ] } v 132 := 0 if 132 < len ( data ) { v 132 = data [ 132 ] } v 133 := 0 if 133 < len ( data ) { v 133 = data [ 133 ] } 
v 134 := 0 if 134 < len ( data ) { v 134 = data [ 134 ] } v 135 := 0 if 135 < len ( data ) { v 135 = data [ 135 ] } v 136 := 0 if 136 < len ( data ) { v 136 = data [ 136 ] } v 137 := 0 if 137 < len ( data ) { v 137 = data [ 137 ] } v 138 := 0 if 138 < len ( data ) { v 138 = data [ 138 ] } v 139 := 0 if 139 < len ( data ) { v 139 = data [ 139 ] } v 140 := 0 if 140 < len ( data ) { v 140 = data [ 140 ] } v 141 := 0 if 141 < len ( data ) { v 141 = data [ 141 ] } v 142 := 0 if 142 < len ( data ) { v 142 = data [ 142 ] } v 143 := 0 if 143 < len ( data ) { v 143 = data [ 143 ] } v 144 := 0 if 144 < len ( data ) { v 144 = data [ 144 ] } v 145 := 0 if 145 < len ( data ) { v 145 = data [ 145 ] } v 146 := 0 if 146 < len ( data ) { v 146 = data [ 146 ] } v 147 := 0 if 147 < len ( data ) { v 147 = data [ 147 ] } v 148 := 0 if 148 < len ( data ) { v 148 = data [ 148 ] } v 149 := 0 if 149 < len ( data ) { v 149 = data [ 149 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "24176c911d0bacf9a29fa7f8251f5036",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 847,
|
||||
"line_start": 648,
|
||||
"symbol": "BigCompute [part 4/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv150 := 0\n\tif 150 < len(data) {\n\t\tv150 = data[150]\n\t}\n\tv151 := 0\n\tif 151 < len(data) {\n\t\tv151 = data[151]\n\t}\n\tv152 := 0\n\tif 152 < len(data) {\n\t\tv152 = data[152]\n\t}\n\tv153 := 0\n\tif 153 < len(data) {\n\t\tv153 = data[153]\n\t}\n\tv154 := 0\n\tif 154 < len(data) {\n\t\tv154 = data[154]\n\t}\n\tv155 := 0\n\tif 155 < len(data) {\n\t\tv155 = data[155]\n\t}\n\tv156 := 0\n\tif 156 < len(data) {\n\t\tv156 = data[156]\n\t}\n\tv157 := 0\n\tif 157 < len(data) {\n\t\tv157 = data[157]\n\t}\n\tv158 := 0\n\tif 158 < len(data) {\n\t\tv158 = data[158]\n\t}\n\tv159 := 0\n\tif 159 < len(data) {\n\t\tv159 = data[159]\n\t}\n\tv160 := 0\n\tif 160 < len(data) {\n\t\tv160 = data[160]\n\t}\n\tv161 := 0\n\tif 161 < len(data) {\n\t\tv161 = data[161]\n\t}\n\tv162 := 0\n\tif 162 < len(data) {\n\t\tv162 = data[162]\n\t}\n\tv163 := 0\n\tif 163 < len(data) {\n\t\tv163 = data[163]\n\t}\n\tv164 := 0\n\tif 164 < len(data) {\n\t\tv164 = data[164]\n\t}\n\tv165 := 0\n\tif 165 < len(data) {\n\t\tv165 = data[165]\n\t}\n\tv166 := 0\n\tif 166 < len(data) {\n\t\tv166 = data[166]\n\t}\n\tv167 := 0\n\tif 167 < len(data) {\n\t\tv167 = data[167]\n\t}\n\tv168 := 0\n\tif 168 < len(data) {\n\t\tv168 = data[168]\n\t}\n\tv169 := 0\n\tif 169 < len(data) {\n\t\tv169 = data[169]\n\t}\n\tv170 := 0\n\tif 170 < len(data) {\n\t\tv170 = data[170]\n\t}\n\tv171 := 0\n\tif 171 < len(data) {\n\t\tv171 = data[171]\n\t}\n\tv172 := 0\n\tif 172 < len(data) {\n\t\tv172 = data[172]\n\t}\n\tv173 := 0\n\tif 173 < len(data) {\n\t\tv173 = data[173]\n\t}\n\tv174 := 0\n\tif 174 < len(data) {\n\t\tv174 = data[174]\n\t}\n\tv175 := 0\n\tif 175 < len(data) {\n\t\tv175 = data[175]\n\t}\n\tv176 := 0\n\tif 176 < len(data) {\n\t\tv176 = data[176]\n\t}\n\tv177 := 0\n\tif 177 < len(data) {\n\t\tv177 = data[177]\n\t}\n\tv178 := 0\n\tif 178 < len(data) {\n\t\tv178 = data[178]\n\t}\n\tv179 := 0\n\tif 179 < len(data) {\n\t\tv179 = data[179]\n\t}\n\tv180 := 0\n\tif 180 < len(data) {\n\t\tv180 = 
data[180]\n\t}\n\tv181 := 0\n\tif 181 < len(data) {\n\t\tv181 = data[181]\n\t}\n\tv182 := 0\n\tif 182 < len(data) {\n\t\tv182 = data[182]\n\t}\n\tv183 := 0\n\tif 183 < len(data) {\n\t\tv183 = data[183]\n\t}\n\tv184 := 0\n\tif 184 < len(data) {\n\t\tv184 = data[184]\n\t}\n\tv185 := 0\n\tif 185 < len(data) {\n\t\tv185 = data[185]\n\t}\n\tv186 := 0\n\tif 186 < len(data) {\n\t\tv186 = data[186]\n\t}\n\tv187 := 0\n\tif 187 < len(data) {\n\t\tv187 = data[187]\n\t}\n\tv188 := 0\n\tif 188 < len(data) {\n\t\tv188 = data[188]\n\t}\n\tv189 := 0\n\tif 189 < len(data) {\n\t\tv189 = data[189]\n\t}\n\tv190 := 0\n\tif 190 < len(data) {\n\t\tv190 = data[190]\n\t}\n\tv191 := 0\n\tif 191 < len(data) {\n\t\tv191 = data[191]\n\t}\n\tv192 := 0\n\tif 192 < len(data) {\n\t\tv192 = data[192]\n\t}\n\tv193 := 0\n\tif 193 < len(data) {\n\t\tv193 = data[193]\n\t}\n\tv194 := 0\n\tif 194 < len(data) {\n\t\tv194 = data[194]\n\t}\n\tv195 := 0\n\tif 195 < len(data) {\n\t\tv195 = data[195]\n\t}\n\tv196 := 0\n\tif 196 < len(data) {\n\t\tv196 = data[196]\n\t}\n\tv197 := 0\n\tif 197 < len(data) {\n\t\tv197 = data[197]\n\t}\n\tv198 := 0\n\tif 198 < len(data) {\n\t\tv198 = data[198]\n\t}\n\tv199 := 0\n\tif 199 < len(data) {\n\t\tv199 = data[199]",
|
||||
"token_estimate": 917,
|
||||
"tokenized_korean_text": "} v 150 := 0 if 150 < len ( data ) { v 150 = data [ 150 ] } v 151 := 0 if 151 < len ( data ) { v 151 = data [ 151 ] } v 152 := 0 if 152 < len ( data ) { v 152 = data [ 152 ] } v 153 := 0 if 153 < len ( data ) { v 153 = data [ 153 ] } v 154 := 0 if 154 < len ( data ) { v 154 = data [ 154 ] } v 155 := 0 if 155 < len ( data ) { v 155 = data [ 155 ] } v 156 := 0 if 156 < len ( data ) { v 156 = data [ 156 ] } v 157 := 0 if 157 < len ( data ) { v 157 = data [ 157 ] } v 158 := 0 if 158 < len ( data ) { v 158 = data [ 158 ] } v 159 := 0 if 159 < len ( data ) { v 159 = data [ 159 ] } v 160 := 0 if 160 < len ( data ) { v 160 = data [ 160 ] } v 161 := 0 if 161 < len ( data ) { v 161 = data [ 161 ] } v 162 := 0 if 162 < len ( data ) { v 162 = data [ 162 ] } v 163 := 0 if 163 < len ( data ) { v 163 = data [ 163 ] } v 164 := 0 if 164 < len ( data ) { v 164 = data [ 164 ] } v 165 := 0 if 165 < len ( data ) { v 165 = data [ 165 ] } v 166 := 0 if 166 < len ( data ) { v 166 = data [ 166 ] } v 167 := 0 if 167 < len ( data ) { v 167 = data [ 167 ] } v 168 := 0 if 168 < len ( data ) { v 168 = data [ 168 ] } v 169 := 0 if 169 < len ( data ) { v 169 = data [ 169 ] } v 170 := 0 if 170 < len ( data ) { v 170 = data [ 170 ] } v 171 := 0 if 171 < len ( data ) { v 171 = data [ 171 ] } v 172 := 0 if 172 < len ( data ) { v 172 = data [ 172 ] } v 173 := 0 if 173 < len ( data ) { v 173 = data [ 173 ] } v 174 := 0 if 174 < len ( data ) { v 174 = data [ 174 ] } v 175 := 0 if 175 < len ( data ) { v 175 = data [ 175 ] } v 176 := 0 if 176 < len ( data ) { v 176 = data [ 176 ] } v 177 := 0 if 177 < len ( data ) { v 177 = data [ 177 ] } v 178 := 0 if 178 < len ( data ) { v 178 = data [ 178 ] } v 179 := 0 if 179 < len ( data ) { v 179 = data [ 179 ] } v 180 := 0 if 180 < len ( data ) { v 180 = data [ 180 ] } v 181 := 0 if 181 < len ( data ) { v 181 = data [ 181 ] } v 182 := 0 if 182 < len ( data ) { v 182 = data [ 182 ] } v 183 := 0 if 183 < len ( data ) { v 183 = data [ 183 ] } 
v 184 := 0 if 184 < len ( data ) { v 184 = data [ 184 ] } v 185 := 0 if 185 < len ( data ) { v 185 = data [ 185 ] } v 186 := 0 if 186 < len ( data ) { v 186 = data [ 186 ] } v 187 := 0 if 187 < len ( data ) { v 187 = data [ 187 ] } v 188 := 0 if 188 < len ( data ) { v 188 = data [ 188 ] } v 189 := 0 if 189 < len ( data ) { v 189 = data [ 189 ] } v 190 := 0 if 190 < len ( data ) { v 190 = data [ 190 ] } v 191 := 0 if 191 < len ( data ) { v 191 = data [ 191 ] } v 192 := 0 if 192 < len ( data ) { v 192 = data [ 192 ] } v 193 := 0 if 193 < len ( data ) { v 193 = data [ 193 ] } v 194 := 0 if 194 < len ( data ) { v 194 = data [ 194 ] } v 195 := 0 if 195 < len ( data ) { v 195 = data [ 195 ] } v 196 := 0 if 196 < len ( data ) { v 196 = data [ 196 ] } v 197 := 0 if 197 < len ( data ) { v 197 = data [ 197 ] } v 198 := 0 if 198 < len ( data ) { v 198 = data [ 198 ] } v 199 := 0 if 199 < len ( data ) { v 199 = data [ 199 ]"
|
||||
},
|
||||
{
|
||||
"block_ids": [
|
||||
"5d269745b2e5dbdcbef0c09ba54b0bd6"
|
||||
],
|
||||
"chunk_id": "438127626378632c03780d10603de32c",
|
||||
"chunker_version": "code-go-ast-v1",
|
||||
"doc_id": "83daba5fbb026e7a400d68a1c4bd36db",
|
||||
"heading_path": [],
|
||||
"policy_hash": "6cfe77abe2b0e5c3",
|
||||
"source_spans": [
|
||||
{
|
||||
"kind": "code",
|
||||
"lang": "go",
|
||||
"line_end": 890,
|
||||
"line_start": 848,
|
||||
"symbol": "BigCompute [part 5/5]"
|
||||
}
|
||||
],
|
||||
"text": "\t}\n\tv200 := 0\n\tif 200 < len(data) {\n\t\tv200 = data[200]\n\t}\n\tv201 := 0\n\tif 201 < len(data) {\n\t\tv201 = data[201]\n\t}\n\tv202 := 0\n\tif 202 < len(data) {\n\t\tv202 = data[202]\n\t}\n\tv203 := 0\n\tif 203 < len(data) {\n\t\tv203 = data[203]\n\t}\n\tv204 := 0\n\tif 204 < len(data) {\n\t\tv204 = data[204]\n\t}\n\tv205 := 0\n\tif 205 < len(data) {\n\t\tv205 = data[205]\n\t}\n\tv206 := 0\n\tif 206 < len(data) {\n\t\tv206 = data[206]\n\t}\n\tv207 := 0\n\tif 207 < len(data) {\n\t\tv207 = data[207]\n\t}\n\tv208 := 0\n\tif 208 < len(data) {\n\t\tv208 = data[208]\n\t}\n\tv209 := 0\n\tif 209 < len(data) {\n\t\tv209 = data[209]\n\t}\n\treturn len(data)\n}",
|
||||
"token_estimate": 191,
|
||||
"tokenized_korean_text": "} v 200 := 0 if 200 < len ( data ) { v 200 = data [ 200 ] } v 201 := 0 if 201 < len ( data ) { v 201 = data [ 201 ] } v 202 := 0 if 202 < len ( data ) { v 202 = data [ 202 ] } v 203 := 0 if 203 < len ( data ) { v 203 = data [ 203 ] } v 204 := 0 if 204 < len ( data ) { v 204 = data [ 204 ] } v 205 := 0 if 205 < len ( data ) { v 205 = data [ 205 ] } v 206 := 0 if 206 < len ( data ) { v 206 = data [ 206 ] } v 207 := 0 if 207 < len ( data ) { v 207 = data [ 207 ] } v 208 := 0 if 208 < len ( data ) { v 208 = data [ 208 ] } v 209 := 0 if 209 < len ( data ) { v 209 = data [ 209 ] } return len ( data ) }"
|
||||
}
|
||||
]
|
||||
178
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.java.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
178
crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
178
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.kt.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
178
crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
178
crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json
vendored
Normal file
178
crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json
vendored
Normal file
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user