diff --git a/Cargo.lock b/Cargo.lock index 1899cf3..0a4deaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,6 +15,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", "getrandom 0.3.4", "once_cell", "serde", @@ -49,6 +50,36 @@ dependencies = [ "equator", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "1.0.0" @@ -105,12 +136,39 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", 
+] + [[package]] name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +[[package]] +name = "arc-swap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +dependencies = [ + "rustversion", +] + [[package]] name = "arg_enum_proc_macro" version = "0.3.4" @@ -119,7 +177,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -134,6 +192,223 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num", +] + +[[package]] +name = "arrow-buffer" +version = 
"56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.14.0", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + 
+[[package]] +name = "arrow-ord" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +dependencies = [ + "bitflags", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "as-slice" version = "0.2.1" @@ -143,6 +418,57 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "async-channel" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = 
"async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2 0.5.2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -151,7 +477,25 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "async_cell" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447ab28afbb345f5408b120702a44e5529ebf90b1796ec76e9528df8e288e6c2" +dependencies = [ + "loom", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", ] [[package]] @@ -221,6 +565,19 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + 
"autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -248,6 +605,15 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "bitpacking" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" +dependencies = [ + "crunchy", +] + [[package]] name = "bitstream-io" version = "4.10.0" @@ -257,6 +623,27 @@ dependencies = [ "no_std_io2", ] +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + [[package]] name = "blake3" version = "1.8.5" @@ -280,6 +667,52 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +dependencies = [ + "darling 0.20.11", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.117", +] + +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bstr" version = "1.12.1" @@ -326,6 +759,34 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "castaway" version = "0.2.4" @@ -347,12 +808,48 @@ dependencies = [ "shlex", ] +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = "clap" version = "4.6.1" @@ -384,7 +881,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -405,6 +902,17 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "comfy-table" +version = "7.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +dependencies = [ + "strum 0.26.3", + "strum_macros 0.26.4", + "unicode-width", +] + [[package]] name = "compact_str" version = "0.9.0" @@ -420,6 +928,15 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.11" @@ -433,6 +950,26 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = 
"const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -529,6 +1066,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -551,14 +1097,45 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "darling" version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", ] [[package]] @@ -572,7 +1149,21 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", ] [[package]] @@ -581,9 +1172,20 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core", + "darling_core 0.20.11", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", + "quote", + "syn 2.0.117", ] [[package]] @@ -609,6 +1211,669 @@ dependencies = [ "parking_lot_core", ] +[[package]] +name = "datafusion" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af15bb3c6ffa33011ef579f6b0bcbe7c26584688bd6c994f548e44df67f011a" +dependencies = [ + "arrow", + "arrow-ipc", + "arrow-schema", + "async-trait", + "bytes", + "bzip2 0.6.1", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "log", + "object_store", + 
"parking_lot", + "parquet", + "rand 0.9.4", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "187622262ad8f7d16d3be9202b4c1e0116f1c9aa387e5074245538b755261621" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9657314f0a32efd0382b9a46fdeb2d233273ece64baa68a7c45f5a192daf0f83" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc", + "base64 0.22.1", + "chrono", + "half", + "hashbrown 0.14.5", + "indexmap 2.14.0", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b6234a6c7173fe5db1c6c35c01a12b2aa0f803a3007feee53483218817f8b1e" +dependencies = [ + 
"futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7256c9cb27a78709dd42d0c80f0178494637209cac6e29d5c93edd09b6721b86" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2 0.6.1", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet", + "rand 0.9.4", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64533a90f78e1684bfb113d200b540f18f268134622d7c96bbebc91354d04825" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7ebeb12c77df0aacad26f21b0d033aeede423a64b2b352f53048a75bf1d6e6" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + 
"tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e783c4c7d7faa1199af2df4761c68530634521b176a8d1331ddbc5a5c75133" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet", + "rand 0.9.4", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ee6b1d9a80d13f9deb2291f45c07044b8e62fb540dbde2453a18be17a36429" + +[[package]] +name = "datafusion-execution" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4cec0a57653bec7b933fb248d3ffa3fa3ab3bd33bd140dc917f714ac036f531" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.4", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef76910bdca909722586389156d0aa4da4020e1631994d50fadd8ad4b1aa05fe" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.14.0", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = 
"datafusion-expr-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d155ccbda29591ca71a1344dd6bed26c65a4438072b400df9db59447f590bb6" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.14.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" +dependencies = [ + "arrow", + "arrow-buffer", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.4", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07331fc13603a9da97b74fd8a273f4238222943dffdbbed1c4c6f862a30105bf" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5951e572a8610b89968a09b5420515a121fbc305c0258651f318dc07c97ab17" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdacca9302c3d8fc03f3e94f338767e786a88a33f5ebad6ffc0e7b50364b9ea3" +dependencies = [ + 
"arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37ff8a99434fbbad604a7e0669717c58c7c4f14c472d45067c4b016621d981" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e2aea7c79c926cffabb13dc27309d4eaeb130f4a21c8ba91cdd241c813652b" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fead257ab5fd2ffc3b40fda64da307e20de0040fe43d49197241d9de82a487f" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec6f637bce95efac05cdfb9b6c19579ed4aa5f6b94d951cfa5bb054b7bb4f730" +dependencies = [ + "datafusion-expr", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-optimizer" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6583ef666ae000a613a837e69e456681a9faa96347bf3877661e9e89e141d8a" +dependencies = [ + 
"arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap 2.14.0", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8668103361a272cbbe3a61f72eca60c9b7c706e87cc3565bcf21e2b277b84f6" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap 2.14.0", + "itertools 0.14.0", + "log", + "parking_lot", + "paste", + "petgraph 0.8.3", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "815acced725d30601b397e39958e0e55630e0a10d66ef7769c14ae6597298bb0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6652fe7b5bf87e85ed175f571745305565da2c0b599d98e697bcbedc7baa47c3" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b7d623eb6162a3332b564a0907ba00895c505d101b99af78345f1acf929b5c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + 
"datafusion-pruning", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f7f778a1a838dec124efb96eae6144237d546945587557c9e6936b3414558c" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.14.0", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-pruning" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1e59e2ca14fe3c30f141600b10ad8815e2856caa59ebbd0e3e07cd3d127a65" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", +] + +[[package]] +name = "datafusion-session" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21ef8e2745583619bd7a49474e8f45fbe98ebb31a133f27802217125a7b3d58d" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"89abd9868770386fede29e5a4b14f49c0bf48d652c3b9d7a8a0332329b87d50b" +dependencies = [ + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap 2.14.0", + "log", + "recursive", + "regex", + "sqlparser", +] + +[[package]] +name = "deepsize" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" +dependencies = [ + "deepsize_derive", +] + +[[package]] +name = "deepsize_derive" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "deranged" version = "0.5.8" @@ -634,10 +1899,10 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling", + "darling 0.20.11", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -647,7 +1912,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn", + "syn 2.0.117", ] [[package]] @@ -658,6 +1923,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -710,7 +1976,29 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "downcast-rs" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "earcutr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79127ed59a85d7687c409e9978547cffb7dc79675355ed22da6b66fd5f6ead01" +dependencies = [ + "itertools 0.11.0", + "num-traits", ] [[package]] @@ -751,7 +2039,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -776,6 +2064,33 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +[[package]] +name = "ethnum" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40404c3f5f511ec4da6fe866ddf6a717c309fdbb69fbbad7b0f3edab8f2e835f" + +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "exr" version = "1.74.0" @@ -803,6 +2118,18 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + +[[package]] +name = "fastdivide" +version = 
"0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + [[package]] name = "fastembed" version = "4.9.1" @@ -861,6 +2188,22 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + [[package]] name = "flate2" version = "1.1.9" @@ -869,8 +2212,15 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", + "zlib-rs", ] +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + [[package]] name = "fnv" version = "1.0.7" @@ -883,6 +2233,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -907,11 +2263,55 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" 
+dependencies = [ + "rustix 0.38.44", + "windows-sys 0.52.0", +] + +[[package]] +name = "fsst" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ffdff7a2d68d22afc0657eddde3e946371ce7cfe730a3f78a5ed44ea5b1cb2e" +dependencies = [ + "arrow-array", + "rand 0.9.4", +] + [[package]] name = "fst" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" +dependencies = [ + "utf8-ranges", +] + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] [[package]] name = "futures-channel" @@ -920,6 +2320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -928,6 +2329,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.32" @@ -942,7 +2354,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", 
"quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -963,6 +2375,7 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ + "futures-channel", "futures-core", "futures-io", "futures-macro", @@ -973,6 +2386,21 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -983,6 +2411,128 @@ dependencies = [ "version_check", ] +[[package]] +name = "geo" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fc1a1678e54befc9b4bcab6cd43b8e7f834ae8ea121118b0fd8c42747675b4a" +dependencies = [ + "earcutr", + "float_next_after", + "geo-types", + "geographiclib-rs", + "i_overlay", + "log", + "num-traits", + "robust", + "rstar", + "spade", +] + +[[package]] +name = "geo-traits" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7c353d12a704ccfab1ba8bfb1a7fe6cb18b665bf89d37f4f7890edcd260206" +dependencies = [ + "geo-types", +] + +[[package]] +name = "geo-types" +version = "0.7.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5" +dependencies = [ + "approx", + "num-traits", + "rayon", + "rstar", + "serde", +] + +[[package]] +name = "geoarrow-array" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d1884b17253d8572e88833c282fcbb442365e4ae5f9052ced2831608253436c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "geo-traits", + "geoarrow-schema", + "num-traits", + 
"wkb", + "wkt", +] + +[[package]] +name = "geoarrow-expr-geo" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a67d3b543bc3ebeffdc204b67d69b8f9fcd33d76269ddd4a4618df99f053a934" +dependencies = [ + "arrow-array", + "arrow-buffer", + "geo", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", +] + +[[package]] +name = "geoarrow-schema" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02f1b18b1c9a44ecd72be02e53d6e63bbccfdc8d1765206226af227327e2be6e" +dependencies = [ + "arrow-schema", + "geo-traits", + "serde", + "serde_json", + "thiserror 1.0.69", +] + +[[package]] +name = "geodatafusion" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83d676b8d8b5f391ab4270ba31e9b599ee2c3d780405a38e272a0a7565ea189c" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-schema", + "datafusion", + "geo", + "geo-traits", + "geoarrow-array", + "geoarrow-expr-geo", + "geoarrow-schema", + "geohash", + "thiserror 1.0.69", + "wkt", +] + +[[package]] +name = "geographiclib-rs" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5a7f08910fd98737a6eda7568e7c5e645093e073328eeef49758cfe8b0489c7" +dependencies = [ + "libm", +] + +[[package]] +name = "geohash" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fb94b1a65401d6cbf22958a9040aa364812c26674f841bee538b12c135db1e6" +dependencies = [ + "geo-types", + "libm", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -990,8 +2540,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -1031,6 +2583,12 @@ dependencies = [ "weezl", ] +[[package]] +name = "glob" +version = "0.3.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "globset" version = "0.4.18" @@ -1056,7 +2614,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -1071,9 +2629,25 @@ checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "num-traits", "zerocopy", ] +[[package]] +name = "hash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +dependencies = [ + "byteorder", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" @@ -1081,6 +2655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", + "allocator-api2", ] [[package]] @@ -1089,7 +2664,20 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "allocator-api2", + "equivalent", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] @@ -1107,12 +2695,34 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "heapless" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" +dependencies = [ + "hash32", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hf-hub" version = "0.4.3" @@ -1125,7 +2735,7 @@ dependencies = [ "libc", "log", "native-tls", - "rand", + "rand 0.9.4", "reqwest", "serde", "serde_json", @@ -1134,6 +2744,12 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + [[package]] name = "http" version = "1.4.0" @@ -1173,6 +2789,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + [[package]] name = "hyper" version = "1.9.0" @@ -1204,6 +2826,7 @@ dependencies = [ "hyper", "hyper-util", "rustls", + "rustls-native-certs", "tokio", "tokio-rustls", "tower-service", @@ -1250,6 +2873,82 @@ dependencies = [ "windows-registry", ] +[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + +[[package]] +name = "i_float" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "010025c2c532c8d82e42d0b8bb5184afa449fa6f06c709ea9adcb16c49ae405b" +dependencies = [ + "libm", +] + +[[package]] +name = "i_key_sort" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9190f86706ca38ac8add223b2aed8b1330002b5cdbbce28fb58b10914d38fc27" + +[[package]] +name = "i_overlay" +version = "4.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413183068e6e0289e18d7d0a1f661b81546e6918d5453a44570b9ab30cbed1b3" +dependencies = [ + "i_float", + "i_key_sort", + "i_shape", + "i_tree", + "rayon", +] + +[[package]] +name = "i_shape" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ea154b742f7d43dae2897fcd5ead86bc7b5eefcedd305a7ebf9f69d44d61082" +dependencies = [ + "i_float", +] + +[[package]] +name = "i_tree" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e6d558e6d4c7b82bc51d9c771e7a927862a161a7d87bf2b0541450e0e20915" + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "2.1.1" @@ -1439,6 +3138,17 @@ dependencies = [ "quote", ] +[[package]] +name = "indexmap" +version = 
"1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -1464,6 +3174,12 @@ dependencies = [ "web-time", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "interpolate_name" version = "0.2.4" @@ -1472,7 +3188,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1497,6 +3213,24 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1512,6 +3246,47 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jiff" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 
0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -1534,6 +3309,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonb" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb98fb29636087c40ad0d1274d9a30c0c1e83e03ae93f6e7e89247b37fcc6953" +dependencies = [ + "byteorder", + "ethnum", + "fast-float2", + "itoa", + "jiff", + "nom 8.0.0", + "num-traits", + "ordered-float 5.3.0", + "rand 0.9.4", + "serde", + "serde_json", + "zmij", +] + [[package]] name = "kb-app" version = "0.1.0" @@ -1709,6 +3504,7 @@ version = "0.1.0" dependencies = [ "anyhow", "blake3", + "globset", "kb-chunk", "kb-config", "kb-core", @@ -1723,6 +3519,584 @@ dependencies = [ "tracing", ] +[[package]] +name = "kb-store-vector" +version = "0.1.0" +dependencies = [ + "anyhow", + "arrow", + "arrow-array", + "arrow-schema", + "blake3", + "futures", + "kb-config", + "kb-core", + "kb-store-sqlite", + "lancedb", + "rusqlite", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.18", + "time", + "tokio", + "tracing", +] + +[[package]] +name = "lance" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8c439decbc304e180748e34bb6d3df729069a222e83e74e2185c38f107136e9" +dependencies = [ + 
"arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "async_cell", + "byteorder", + "bytes", + "chrono", + "dashmap", + "datafusion", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-plan", + "deepsize", + "either", + "futures", + "half", + "humantime", + "itertools 0.13.0", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-encoding", + "lance-file", + "lance-geo", + "lance-index", + "lance-io", + "lance-linalg", + "lance-namespace", + "lance-table", + "log", + "moka", + "object_store", + "permutation", + "pin-project", + "prost", + "prost-types", + "rand 0.9.4", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tantivy", + "tokio", + "tokio-stream", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "lance-arrow" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ee5508b225456d3d56998eaeef0d8fbce5ea93856df47b12a94d2e74153210" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "bytes", + "getrandom 0.2.17", + "half", + "jsonb", + "num-traits", + "rand 0.9.4", +] + +[[package]] +name = "lance-bitpacking" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c065fb3bd4a8cc4f78428443e990d4921aa08f707b676753db740e0b402a21" +dependencies = [ + "arrayref", + "paste", + "seq-macro", +] + +[[package]] +name = "lance-core" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8856abad92e624b75cd57a04703f6441948a239463bdf973f2ac1924b0bcdbe" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "async-trait", + "byteorder", + "bytes", + "chrono", + "datafusion-common", + "datafusion-sql", + "deepsize", + "futures", + "lance-arrow", + 
"libc", + "log", + "mock_instant", + "moka", + "num_cpus", + "object_store", + "pin-project", + "prost", + "rand 0.9.4", + "roaring", + "serde_json", + "snafu", + "tempfile", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", +] + +[[package]] +name = "lance-datafusion" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8835308044cef5467d7751be87fcbefc2db01c22370726a8704bd62991693f" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-trait", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-functions", + "datafusion-physical-expr", + "futures", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datagen", + "lance-geo", + "log", + "pin-project", + "prost", + "snafu", + "tokio", + "tracing", +] + +[[package]] +name = "lance-datagen" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612de1e888bb36f6bf51196a6eb9574587fdf256b1759a4c50e643e00d5f96d0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "futures", + "half", + "hex", + "rand 0.9.4", + "rand_xoshiro", + "random_word", +] + +[[package]] +name = "lance-encoding" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b456b29b135d3c7192602e516ccade38b5483986e121895fa43cf1fdb38bf60" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "bytemuck", + "byteorder", + "bytes", + "fsst", + "futures", + "hex", + "hyperloglogplus", + "itertools 0.13.0", + "lance-arrow", + "lance-bitpacking", + "lance-core", + "log", + "lz4", + "num-traits", + "prost", + "prost-build", + "prost-types", + "rand 0.9.4", + "snafu", + "strum 0.26.3", + "tokio", + "tracing", + "xxhash-rust", + "zstd", +] + +[[package]] +name = "lance-file" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab1538d14d5bb3735b4222b3f5aff83cfa59cc6ef7cdd3dd9139e4c77193c80b" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "byteorder", + "bytes", + "datafusion-common", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-encoding", + "lance-io", + "log", + "num-traits", + "object_store", + "prost", + "prost-build", + "prost-types", + "snafu", + "tokio", + "tracing", +] + +[[package]] +name = "lance-geo" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5a69a2f3b55703d9c240ad7c5ffa2c755db69e9cf8aa05efe274a212910472d" +dependencies = [ + "datafusion", + "geo-types", + "geoarrow-array", + "geoarrow-schema", + "geodatafusion", +] + +[[package]] +name = "lance-index" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea84613df6fa6b9168a1f056ba4f9cb73b90a1b452814c6fd4b3529bcdbfc78" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-channel", + "async-recursion", + "async-trait", + "bitpacking", + "bitvec", + "bytes", + "crossbeam-queue", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-sql", + "deepsize", + "dirs 6.0.0", + "fst", + "futures", + "half", + "itertools 0.13.0", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-datagen", + "lance-encoding", + "lance-file", + "lance-io", + "lance-linalg", + "lance-table", + "libm", + "log", + "ndarray", + "num-traits", + "object_store", + "prost", + "prost-build", + "prost-types", + "rand 0.9.4", + "rand_distr 0.5.1", + "rayon", + "roaring", + "serde", + "serde_json", + "snafu", + "tantivy", + "tempfile", + "tokio", + "tracing", + "twox-hash", + "uuid", +] + +[[package]] +name = "lance-io" 
+version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b3fc4c1d941fceef40a0edbd664dbef108acfc5d559bb9e7f588d0c733cbc35" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-namespace", + "log", + "object_store", + "path_abs", + "pin-project", + "prost", + "rand 0.9.4", + "serde", + "shellexpand", + "snafu", + "tokio", + "tracing", + "url", +] + +[[package]] +name = "lance-linalg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ffbc5ce367fbf700a69de3fe0612ee1a11191a64a632888610b6bacfa0f63" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "cc", + "deepsize", + "half", + "lance-arrow", + "lance-core", + "num-traits", + "rand 0.9.4", +] + +[[package]] +name = "lance-namespace" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "791bbcd868ee758123a34e07d320a1fb99379432b5ecc0e78d6b4686e999b629" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "lance-core", + "lance-namespace-reqwest-client", + "snafu", +] + +[[package]] +name = "lance-namespace-impls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee713505576f6b1988a491f77c7ca8b0cf7090a393598e63c85079fa70a53ebf" +dependencies = [ + "arrow", + "arrow-ipc", + "arrow-schema", + "async-trait", + "bytes", + "futures", + "lance", + "lance-core", + "lance-index", + "lance-io", + "lance-namespace", + "log", + "object_store", + "rand 0.9.4", + "serde_json", + "snafu", + "tokio", + "url", +] + +[[package]] +name = "lance-namespace-reqwest-client" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +dependencies = [ + "reqwest", + "serde", + "serde_json", + "serde_repr", + "url", +] + +[[package]] +name = "lance-table" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fdb2d56bfa4d1511c765fa0cc00fdaa37e5d2d1cd2f57b3c6355d9072177052" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", + "async-trait", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-file", + "lance-io", + "log", + "object_store", + "prost", + "prost-build", + "prost-types", + "rand 0.9.4", + "rangemap", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "lance-testing" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8ccb1a4a9284435c6a8c02c8c06e7e041bece0d7f722152159353cf55dc51e3" +dependencies = [ + "arrow-array", + "arrow-schema", + "lance-arrow", + "num-traits", + "rand 0.9.4", +] + +[[package]] +name = "lancedb" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9217d7d3a1f4e088bdedaad9b4fa79045b077e07f961f1cd3ec6f90850c425f2" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-trait", + "bytes", + "chrono", + "datafusion", + "datafusion-catalog", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "futures", + "half", + "lance", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-datagen", + "lance-encoding", + "lance-file", + "lance-index", + "lance-io", + "lance-linalg", + "lance-namespace", + "lance-namespace-impls", + "lance-table", + "lance-testing", + "lazy_static", + "log", + "moka", + "num-traits", + "object_store", + 
"pin-project", + "rand 0.9.4", + "regex", + "semver", + "serde", + "serde_json", + "serde_with", + "snafu", + "tempfile", + "tokio", + "url", + "uuid", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1741,6 +4115,75 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" 
+version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libbz2-rs-sys" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" + [[package]] name = "libc" version = "0.2.186" @@ -1757,6 +4200,12 @@ dependencies = [ "cc", ] +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + [[package]] name = "libredox" version = "0.1.16" @@ -1791,7 +4240,7 @@ dependencies = [ "fastrand", "fst", "include_dir", - "itertools", + "itertools 0.14.0", "lingua-chinese-language-model", "lingua-english-language-model", "lingua-japanese-language-model", @@ -1801,8 +4250,8 @@ dependencies = [ "regex", "serde", "serde-wasm-bindgen", - "strum", - "strum_macros", + "strum 0.27.2", + "strum_macros 0.27.2", "wasm-bindgen", ] @@ -1842,6 +4291,12 @@ dependencies = [ "include_dir", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1869,6 +4324,19 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "loop9" version = "0.1.5" @@ -1878,6 +4346,60 @@ 
dependencies = [ "imgref", ] +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "lz4" +version = "1.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20b523e860d03443e98350ceaac5e71c6ba89aea7d960769ec3ce37f4de5af4" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "macro_rules_attribute" version = "0.2.2" @@ -1916,7 +4438,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" dependencies = [ "autocfg", + "num_cpus", + "once_cell", "rawpointer", + "thread-tree", ] [[package]] @@ -1929,18 +4454,56 @@ dependencies = [ "rayon", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "measure_time" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" +dependencies = [ + "log", +] + [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "mime" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1968,6 +4531,32 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "mock_instant" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" + +[[package]] +name = "moka" +version = "0.12.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" +dependencies = [ + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "equivalent", + "event-listener", + "futures-util", + "parking_lot", + "portable-atomic", + "smallvec", + "tagptr", + "uuid", +] + [[package]] name = "monostate" version = "0.1.18" @@ 
-1987,7 +4576,7 @@ checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2000,6 +4589,18 @@ dependencies = [ "pxfm", ] +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + [[package]] name = "native-tls" version = "0.2.18" @@ -2081,6 +4682,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -2114,7 +4729,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2126,6 +4741,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-rational" version = "0.4.2" @@ -2144,6 +4770,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_enum" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" +dependencies = [ + "num_enum_derive", + "rustversion", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -2152,6 +4811,39 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools 0.14.0", + "parking_lot", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -2164,6 +4856,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + 
[[package]] name = "onig" version = "6.5.3" @@ -2209,7 +4907,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2236,6 +4934,24 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e" +dependencies = [ + "num-traits", +] + [[package]] name = "ort" version = "2.0.0-rc.9" @@ -2260,6 +4976,31 @@ dependencies = [ "ureq", ] +[[package]] +name = "ownedbytes" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + [[package]] name = "parking_lot_core" version = "0.9.12" @@ -2273,6 +5014,43 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" 
+dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64 0.22.1", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "ring", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "paste" version = "1.0.15" @@ -2285,12 +5063,90 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" +[[package]] +name = "path_abs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ef02f6342ac01d8a93b65f96db53fe68a92a15f41144f97fb00a9e669633c3" +dependencies = [ + "serde", + "serde_derive", + "std_prelude", + "stfu8", +] + [[package]] name = "percent-encoding" version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "permutation" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap 2.14.0", +] + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap 2.14.0", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -2368,7 +5224,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "proc-macro-crate" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" +dependencies = [ + "toml_edit 0.25.11+spec-1.1.0", ] [[package]] @@ -2396,7 +5261,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" dependencies = [ "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2409,8 +5274,8 @@ dependencies = [ "bit-vec", "bitflags", "num-traits", - "rand", - "rand_chacha", + "rand 0.9.4", + "rand_chacha 0.9.0", "rand_xorshift", "regex-syntax", "rusty-fork", @@ -2418,6 +5283,68 @@ dependencies = [ "unarray", ] +[[package]] +name = "prost" +version = "0.13.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "once_cell", + "petgraph 0.7.1", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.117", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost", +] + +[[package]] +name = "psm" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "pulldown-cmark" version = "0.13.3" @@ -2456,6 +5383,61 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + 
"web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.4", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" version = "1.0.45" @@ -2477,14 +5459,41 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ 
-2494,7 +5503,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", ] [[package]] @@ -2506,15 +5524,63 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.6", +] + +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.4", +] + [[package]] name = "rand_xorshift" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" dependencies = [ - "rand_core", + "rand_core 0.9.5", ] +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "random_word" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" +dependencies = [ + "ahash", + "brotli", + "paste", + "rand 0.9.4", + "unicase", +] + +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + [[package]] name = "rav1e" version = "0.8.1" @@ -2531,7 +5597,7 @@ dependencies = [ "built", "cfg-if", "interpolate_name", - "itertools", + "itertools 0.14.0", "libc", "libfuzzer-sys", "log", @@ -2542,8 +5608,8 @@ dependencies = [ "num-traits", "paste", "profiling", - "rand", - "rand_chacha", + "rand 0.9.4", + "rand_chacha 0.9.0", "simd_helpers", "thiserror 2.0.18", "v_frame", @@ -2588,7 +5654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools", + "itertools 0.14.0", "rayon", ] @@ -2602,6 +5668,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2642,6 +5728,26 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "refinery" version = "0.8.16" @@ -2683,7 +5789,7 @@ dependencies = [ "quote", 
"refinery-core", "regex", - "syn", + "syn 2.0.117", ] [[package]] @@ -2737,9 +5843,13 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "native-tls", "percent-encoding", "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", "rustls-pki-types", "serde", "serde_json", @@ -2747,6 +5857,7 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", + "tokio-rustls", "tokio-util", "tower", "tower-http", @@ -2778,6 +5889,33 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "roaring" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +dependencies = [ + "bytemuck", + "byteorder", +] + +[[package]] +name = "robust" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e27ee8bb91ca0adcf0ecb116293afa12d393f9c2b9b9cd54d33e8078fe19839" + +[[package]] +name = "rstar" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rusqlite" version = "0.32.1" @@ -2792,6 +5930,44 @@ dependencies = [ "smallvec", ] +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + 
+[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.1.4" @@ -2801,7 +5977,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.12.1", "windows-sys 0.61.2", ] @@ -2820,12 +5996,25 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pki-types" version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" dependencies = [ + "web-time", "zeroize", ] @@ -2888,6 +6077,36 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -2923,6 +6142,12 @@ version = "1.0.28" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -2961,7 +6186,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2988,6 +6213,17 @@ dependencies = [ "serde_json", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "serde_spanned" version = "0.6.9" @@ -3009,13 +6245,44 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.14.0", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" +dependencies = [ + "darling 0.21.3", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "serde_yaml_ng" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f" dependencies = [ - "indexmap", + "indexmap 2.14.0", "itoa", "ryu", "serde", @@ -3042,12 +6309,31 @@ 
dependencies = [ "lazy_static", ] +[[package]] +name = "shellexpand" +version = "3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32824fab5e16e6c4d86dc1ba84489390419a39f97699852b66480bb87d297ed8" +dependencies = [ + "dirs 6.0.0", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + [[package]] name = "simd-adler32" version = "0.3.9" @@ -3063,12 +6349,27 @@ dependencies = [ "quote", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.12" @@ -3081,6 +6382,33 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "snafu" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e84b3f4eacbf3a1ce05eac6763b4d629d60cbc94d632e4092c54ade71f1e1a2" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.8.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.6.3" @@ -3102,6 +6430,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "spade" +version = "2.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9699399fd9349b00b184f5635b074f9ec93afffef30c853f8c875b32c0f8c7fa" +dependencies = [ + "hashbrown 0.16.1", + "num-traits", + "robust", + "smallvec", +] + [[package]] name = "spm_precompiled" version = "0.1.4" @@ -3114,30 +6454,99 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "sqlparser" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.61.2", +] + [[package]] name = "static_assertions" version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "std_prelude" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8207e78455ffdf55661170876f88daf85356e4edd54e0a3dbc79586ca1e50cbe" + +[[package]] +name = "stfu8" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros 0.26.4", +] + [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.117", +] + [[package]] name = "strum_macros" version = "0.27.2" @@ -3147,7 +6556,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3162,6 +6571,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 
+dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.117" @@ -3190,7 +6610,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3214,6 +6634,164 @@ dependencies = [ "libc", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tantivy" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64 0.22.1", + "bitpacking", + "bon", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "hyperloglogplus", + "itertools 0.14.0", + "levenshtein_automata", + "log", + "lru", + "lz4_flex", + "measure_time", + "memmap2", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror 2.0.18", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools 0.14.0", + 
"serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a" +dependencies = [ + "nom 7.1.3", + "serde", + "serde_json", +] + +[[package]] +name = "tantivy-sstable" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416" +dependencies = [ + "futures-util", + "itertools 0.14.0", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1" +dependencies = [ + "murmurhash32", + "rand_distr 0.4.3", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d" +dependencies = [ + "serde", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + 
[[package]] name = "tar" version = "0.4.45" @@ -3234,7 +6812,7 @@ dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", - "rustix", + "rustix 1.1.4", "windows-sys 0.61.2", ] @@ -3264,7 +6842,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3275,7 +6853,16 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "thread-tree" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbd370cb847953a25954d9f63e14824a36113f8c72eecf6eccef5dc4b45d630" +dependencies = [ + "crossbeam-channel", ] [[package]] @@ -3287,6 +6874,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "tiff" version = "0.10.3" @@ -3332,6 +6930,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinystr" version = "0.8.3" @@ -3370,13 +6977,13 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom 0.3.4", - "itertools", + "itertools 0.14.0", "log", "macro_rules_attribute", "monostate", "onig", "paste", - "rand", + "rand 0.9.4", "rayon", "rayon-cond", "regex", @@ -3399,11 +7006,25 @@ dependencies = [ "bytes", "libc", "mio", + "parking_lot", "pin-project-lite", + "signal-hook-registry", "socket2", + "tokio-macros", "windows-sys 0.61.2", ] +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "tokio-native-tls" version = "0.3.1" @@ -3424,6 +7045,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -3445,8 +7077,8 @@ checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", - "toml_datetime", - "toml_edit", + "toml_datetime 0.6.11", + "toml_edit 0.22.27", ] [[package]] @@ -3458,18 +7090,48 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +dependencies = [ + "serde_core", +] + [[package]] name = "toml_edit" version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ - "indexmap", + "indexmap 2.14.0", "serde", "serde_spanned", - "toml_datetime", + "toml_datetime 0.6.11", "toml_write", - "winnow", + "winnow 0.7.15", +] + +[[package]] +name = "toml_edit" +version = "0.25.11+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" +dependencies = [ + "indexmap 2.14.0", + "toml_datetime 1.1.1+spec-1.1.0", + "toml_parser", + "winnow 1.0.2", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow 1.0.2", ] [[package]] @@ -3499,13 +7161,18 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ + "async-compression", "bitflags", "bytes", + "futures-core", "futures-util", "http", "http-body", + "http-body-util", "iri-string", "pin-project-lite", + "tokio", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -3555,7 +7222,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3616,6 +7283,15 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +dependencies = [ + "rand 0.9.4", +] + [[package]] name = "typenum" version = "1.20.0" @@ -3726,6 +7402,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -3738,6 +7420,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + 
"serde_core", + "wasm-bindgen", +] + [[package]] name = "v_frame" version = "0.3.9" @@ -3861,7 +7555,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -3891,7 +7585,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap", + "indexmap 2.14.0", "wasm-encoder", "wasmparser", ] @@ -3917,7 +7611,7 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags", "hashbrown 0.15.5", - "indexmap", + "indexmap 2.14.0", "semver", ] @@ -3996,6 +7690,41 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -4271,6 +8000,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "winnow" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.46.0" @@ -4305,9 +8043,9 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap", + "indexmap 2.14.0", "prettyplease", - "syn", + "syn 2.0.117", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -4323,7 +8061,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -4336,7 +8074,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", "bitflags", - "indexmap", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -4355,7 +8093,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap", + "indexmap 2.14.0", "log", "semver", "serde", @@ -4365,12 +8103,46 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wkb" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a120b336c7ad17749026d50427c23d838ecb50cd64aaea6254b5030152f890a9" +dependencies = [ + "byteorder", + "geo-traits", + "num_enum", + "thiserror 1.0.69", +] + +[[package]] +name = "wkt" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb2b923ccc882312e559ffaa832a055ba9d1ac0cc8e86b3e25453247e4b81d7" +dependencies = [ + "geo-traits", + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "writeable" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" 
+dependencies = [ + "tap", +] + [[package]] name = "xattr" version = "1.6.1" @@ -4378,7 +8150,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix", + "rustix 1.1.4", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", ] [[package]] @@ -4406,7 +8193,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -4427,7 +8214,7 @@ checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -4447,7 +8234,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -4487,15 +8274,49 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + [[package]] name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "zune-core" version = "0.4.12" diff --git a/Cargo.toml b/Cargo.toml index dad33db..da6b4f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "crates/kb-normalize", "crates/kb-chunk", "crates/kb-store-sqlite", + "crates/kb-store-vector", "crates/kb-search", "crates/kb-embed", "crates/kb-embed-local", @@ -43,3 +44,13 @@ proptest = "1" # downloads). Pinned to the 4.x line per task p3-2 (current 5.x release # remains untested for this workspace). fastembed = "4.9" +# LanceDB embedded vector store (P3-3). 0.23.x pulls arrow / arrow-array / +# arrow-schema 56.x transitively (via lance 1.0); the kb-store-vector +# crate matches that major to share the same Arrow types without a +# re-export adapter. +lancedb = { version = "0.23", default-features = false } +arrow = "56" +arrow-array = "56" +arrow-schema = "56" +tokio = { version = "1", features = ["rt", "macros"] } +futures = "0.3" diff --git a/crates/kb-store-sqlite/Cargo.toml b/crates/kb-store-sqlite/Cargo.toml index 45cf7ea..7569405 100644 --- a/crates/kb-store-sqlite/Cargo.toml +++ b/crates/kb-store-sqlite/Cargo.toml @@ -14,6 +14,11 @@ kb-config = { path = "../kb-config" } # Explicitly NOT `bundled-sqlcipher` per task allowed-deps list. 
rusqlite = { version = "0.32", features = ["bundled"] } refinery = { version = "0.8", features = ["rusqlite"] } +# Used by `filter_chunks` for the optional `path_glob` post-filter. +# The SQL prefilter handles tags / lang / trust / committed-status; the +# Rust-side glob keeps the SQL surface small (no LIKE-vs-glob impedance +# mismatch) and matches the pattern kb-search/src/lexical.rs uses. +globset = { workspace = true } serde_json = { workspace = true } time = { workspace = true } blake3 = { workspace = true } diff --git a/crates/kb-store-sqlite/src/embeddings.rs b/crates/kb-store-sqlite/src/embeddings.rs new file mode 100644 index 0000000..3fd51f1 --- /dev/null +++ b/crates/kb-store-sqlite/src/embeddings.rs @@ -0,0 +1,317 @@ +//! Embedding-records writers used by `kb-store-vector` (P3-3). +//! +//! The `VectorStore` impl in `kb-store-vector` performs a two-phase write: +//! phase 1 stages an `embedding_records` row at `status='pending'` before +//! issuing the Lance write, and phase 3 promotes those same rows to +//! `status='committed'` after the Lance commit lands. We surface those +//! two SQL statements here (rather than expose a generic write +//! connection) so the SQL stays inside the crate that owns the schema — +//! kb-store-vector consumes a typed, narrowly-scoped API and never +//! touches the connection mutex itself. +//! +//! Both helpers wrap a single `INSERT OR REPLACE` / `UPDATE` per row +//! inside a single SQLite transaction, so a partial failure leaves +//! either all rows pending (phase 1) or all rows committed (phase 3), +//! never a mixed batch. + +use anyhow::{Context, Result}; +use rusqlite::{params, params_from_iter}; +use time::OffsetDateTime; +use time::format_description::well_known::Rfc3339; + +use crate::error::StoreError; +use crate::store::SqliteStore; + +/// Row payload for [`SqliteStore::put_embedding_records_pending`]. 
+/// +/// Mirrors the columns of `embedding_records` minus the lifecycle markers +/// (`status` and `vector_committed`) — those are forced to `'pending'` +/// and `0` by phase 1. +/// +/// `created_at` is `OffsetDateTime` rather than a pre-formatted string so +/// the helper owns the RFC3339 formatting (the same formatting choice +/// the asset / document / job writers make). +#[derive(Clone, Debug)] +pub struct EmbeddingRecordRow { + pub embedding_id: String, + pub chunk_id: String, + pub model_id: String, + pub model_version: String, + pub dimensions: usize, + pub lance_table: String, + pub created_at: OffsetDateTime, +} + +impl SqliteStore { + /// Phase 1 of the kb-store-vector two-phase write: stage every + /// `embedding_records` row with `status='pending'`, + /// `vector_committed=0`. `INSERT OR REPLACE` (rather than UPSERT) is + /// the right shape here because re-running phase 1 for an + /// already-pending row resets `vector_committed` to 0 and the + /// `created_at` to the new attempt's timestamp — both desired, + /// because a retry should look like a fresh attempt to the GC pass. + /// + /// All rows are written in a single transaction; if any row fails + /// the entire batch is rolled back and the caller can retry without + /// worrying about partial pending state. 
+ pub fn put_embedding_records_pending( + &self, + rows: &[EmbeddingRecordRow], + ) -> Result<()> { + if rows.is_empty() { + return Ok(()); + } + let mut conn = self.lock_conn(); + let tx = conn.transaction().map_err(StoreError::from)?; + { + let mut stmt = tx + .prepare( + "INSERT OR REPLACE INTO embedding_records ( + embedding_id, chunk_id, model_id, model_version, + dimensions, lance_table, created_at, + status, vector_committed + ) VALUES (?, ?, ?, ?, ?, ?, ?, 'pending', 0)", + ) + .map_err(StoreError::from)?; + for row in rows { + let created_at = row + .created_at + .format(&Rfc3339) + .context("format embedding_records.created_at")?; + stmt.execute(params![ + row.embedding_id, + row.chunk_id, + row.model_id, + row.model_version, + row.dimensions as i64, + row.lance_table, + created_at, + ]) + .map_err(StoreError::from)?; + } + } + tx.commit().map_err(StoreError::from)?; + Ok(()) + } + + /// Phase 3 of the kb-store-vector two-phase write: after the Lance + /// MergeInsert commits, flip the listed embedding rows to + /// `status='committed'`, `vector_committed=1`. Rows that aren't + /// currently `pending` (e.g. already committed by a duplicate batch, + /// or tombstoned by a chunks DELETE between phase 1 and phase 3) + /// are deliberately left alone via `WHERE status='pending'` — we + /// never resurrect a tombstone, and we never blindly re-mark a + /// committed row. + /// + /// All updates run in a single statement (single SQL `UPDATE … + /// WHERE embedding_id IN (?, ?, …)`) inside one transaction — + /// avoids the per-row `execute()` round-trip the previous + /// implementation paid. 
+ pub fn mark_embedding_records_committed( + &self, + embedding_ids: &[String], + ) -> Result<()> { + if embedding_ids.is_empty() { + return Ok(()); + } + let mut conn = self.lock_conn(); + let tx = conn.transaction().map_err(StoreError::from)?; + { + let placeholders = std::iter::repeat_n("?", embedding_ids.len()) + .collect::>() + .join(","); + let sql = format!( + "UPDATE embedding_records + SET status='committed', vector_committed=1 + WHERE status='pending' + AND embedding_id IN ({placeholders})" + ); + tx.execute(&sql, params_from_iter(embedding_ids.iter())) + .map_err(StoreError::from)?; + } + tx.commit().map_err(StoreError::from)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_config::Config; + use tempfile::TempDir; + use time::OffsetDateTime; + + /// Minimal config pointing at a tempdir for the SQLite file. + fn config_for(tmp: &TempDir) -> Config { + let mut c = Config::defaults(); + c.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + c + } + + /// Seed a chunks row + the doc / asset rows it FKs to. The minimum + /// needed for embedding_records inserts not to fail the FK to + /// chunks. + fn seed_chunk(store: &SqliteStore, chunk_id: &str) { + let conn = store.lock_conn(); + // Asset, document, chunk — all hand-rolled at the SQL layer to + // keep the test self-contained (no kb-parse/kb-chunk dep). 
+ conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, ?, ?, 'reference', '/tmp/x', ?)", + params![ + "0123456789abcdef0123456789abcdef", + "file:///tmp/x", + "x.md", + "{}", + 0_i64, + "deadbeef", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, NULL, 'fs', 'unverified', 'v1', 1, 1, '{}', '{}', ?, ?)", + params![ + "fedcba9876543210fedcba9876543210", + "0123456789abcdef0123456789abcdef", + "x.md", + "1970-01-01T00:00:00Z", + "1970-01-01T00:00:00Z", + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'hash', '[]', ?)", + params![chunk_id, "fedcba9876543210fedcba9876543210", "1970-01-01T00:00:00Z"], + ) + .unwrap(); + } + + fn open_store(tmp: &TempDir) -> SqliteStore { + let cfg = config_for(tmp); + let store = SqliteStore::open(&cfg).unwrap(); + store.run_migrations().unwrap(); + store + } + + #[test] + fn pending_then_committed_round_trip() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let chunk = "11112222333344445555666677778888"; + seed_chunk(&store, chunk); + + let row = EmbeddingRecordRow { + embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(), + chunk_id: chunk.to_string(), + model_id: "test-model".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "chunk_embeddings_test_model_4".to_string(), + created_at: OffsetDateTime::now_utc(), + }; + store + .put_embedding_records_pending(std::slice::from_ref(&row)) + 
.unwrap(); + + // Inspect: the row exists at status='pending'. + { + let conn = store.read_conn(); + let (status, committed): (String, i64) = conn + .query_row( + "SELECT status, vector_committed FROM embedding_records WHERE embedding_id = ?", + params![row.embedding_id], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(status, "pending"); + assert_eq!(committed, 0); + } + + store + .mark_embedding_records_committed(std::slice::from_ref(&row.embedding_id)) + .unwrap(); + { + let conn = store.read_conn(); + let (status, committed): (String, i64) = conn + .query_row( + "SELECT status, vector_committed FROM embedding_records WHERE embedding_id = ?", + params![row.embedding_id], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(status, "committed"); + assert_eq!(committed, 1); + } + } + + #[test] + fn empty_batches_are_noops() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + store.put_embedding_records_pending(&[]).unwrap(); + store.mark_embedding_records_committed(&[]).unwrap(); + } + + #[test] + fn replay_phase_one_resets_vector_committed() { + // INSERT OR REPLACE: a phase-1 retry on a row that briefly + // reached `committed` (in some adversarial out-of-order replay) + // resets it to `pending`. Confirms the documented semantics. 
+ let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let chunk = "11112222333344445555666677778888"; + seed_chunk(&store, chunk); + + let row = EmbeddingRecordRow { + embedding_id: "aaaa1111bbbb2222cccc3333dddd4444".to_string(), + chunk_id: chunk.to_string(), + model_id: "test-model".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "chunk_embeddings_test_model_4".to_string(), + created_at: OffsetDateTime::now_utc(), + }; + store + .put_embedding_records_pending(std::slice::from_ref(&row)) + .unwrap(); + store + .mark_embedding_records_committed(std::slice::from_ref(&row.embedding_id)) + .unwrap(); + store + .put_embedding_records_pending(std::slice::from_ref(&row)) + .unwrap(); + + let conn = store.read_conn(); + let status: String = conn + .query_row( + "SELECT status FROM embedding_records WHERE embedding_id = ?", + params![row.embedding_id], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(status, "pending"); + } + + #[test] + fn mark_committed_skips_non_pending() { + // The phase-3 UPDATE explicitly filters `status='pending'`, so + // calling it on an embedding_id that was never staged (or that + // already became a tombstone) is a no-op rather than an error. + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + store + .mark_embedding_records_committed(&["does-not-exist".to_string()]) + .unwrap(); + } +} diff --git a/crates/kb-store-sqlite/src/filters.rs b/crates/kb-store-sqlite/src/filters.rs new file mode 100644 index 0000000..be56201 --- /dev/null +++ b/crates/kb-store-sqlite/src/filters.rs @@ -0,0 +1,452 @@ +//! Chunk-level filter helpers shared between retrievers. +//! +//! `kb-store-vector::search` post-filters its Lance candidate set +//! against the SQLite-side metadata (committed-status / lang / tags / +//! trust / path_glob). Rather than open a private SQL surface in +//! `kb-store-vector`, the JOIN logic lives here so: +//! +//! 
- The schema (and CHECK / FK invariants) stays owned by the crate +//! that ships the migrations. +//! - `kb-store-vector` doesn't need its own `rusqlite` / `globset` +//! direct deps — both are forbidden by the P3-3 spec's allowed-dep +//! list. +//! - Future retrievers (e.g. a hybrid blender) can reuse the same +//! helper without re-deriving the SQL. +//! +//! `kb-search::lexical` already has a similar `tags / lang / trust / +//! path_glob` filter pass for FTS5 results; we deliberately do *not* +//! refactor that one in this PR — its SQL is interleaved with the +//! `bm25 + snippet()` SELECT, so sharing would force an awkward +//! trait split. P3-3 spec line 27 only mandates the move for +//! `kb-store-vector`'s usage. + +use std::collections::{HashMap, HashSet}; + +use anyhow::{Context, Result}; +use rusqlite::{params_from_iter, ToSql}; + +use crate::store::SqliteStore; + +impl SqliteStore { + /// Filter `chunk_ids` down to those whose owning document passes + /// `filters` AND whose embedding row is at `status='committed'`. + /// + /// The result preserves the input order so the caller can feed it + /// back to a Lance distance-asc result list and `take(k)` directly. + /// + /// `filters` semantics mirror `kb_core::SearchFilters`: + /// + /// - `tags_any`: doc must own at least one of the listed tags + /// (empty vec ⇒ no tag constraint). + /// - `lang`: exact match against `documents.lang`. + /// - `trust_min`: doc trust ≥ the supplied level (Generated < + /// Secondary < Primary, mirroring `list_documents` and + /// `kb-search::lexical`). + /// - `path_glob`: shell-style glob (`*` does **not** cross `/`) + /// against `documents.workspace_path`. Compiled in Rust via + /// `globset` rather than translated to SQLite GLOB so the + /// semantics match `kb-search::lexical` exactly. + /// + /// The `embedding_records.status='committed'` predicate is always + /// applied: tombstoned and pending rows must never surface to + /// search callers (spec §5.6). 
+ pub fn filter_chunks( + &self, + chunk_ids: &[kb_core::ChunkId], + filters: &kb_core::SearchFilters, + ) -> Result> { + if chunk_ids.is_empty() { + return Ok(Vec::new()); + } + + // Deduplicate the IN-list so a pathological caller passing + // `[c1, c1, c1]` doesn't blow the SQL placeholder count. + let unique_ids: Vec = { + let mut seen = HashSet::new(); + chunk_ids + .iter() + .filter_map(|c| { + if seen.insert(c.0.as_str()) { + Some(c.0.clone()) + } else { + None + } + }) + .collect() + }; + + let placeholders = std::iter::repeat_n("?", unique_ids.len()) + .collect::>() + .join(","); + let mut sql = format!( + "SELECT er.chunk_id, d.workspace_path + FROM embedding_records er + JOIN chunks c ON c.chunk_id = er.chunk_id + JOIN documents d ON d.doc_id = c.doc_id + WHERE er.status = 'committed' + AND er.chunk_id IN ({placeholders})" + ); + + let mut bind: Vec> = unique_ids + .iter() + .map(|s| { + let b: Box = Box::new(s.clone()); + b + }) + .collect(); + + if let Some(lang) = &filters.lang { + sql.push_str(" AND d.lang = ?"); + bind.push(Box::new(lang.0.clone())); + } + if let Some(min) = &filters.trust_min { + // Mirror `list_documents` / `kb-search::lexical`: rank + // Generated=1 < Secondary=2 < Primary=3. 
+ sql.push_str( + " AND CASE d.trust_level + WHEN 'primary' THEN 3 + WHEN 'secondary' THEN 2 + WHEN 'generated' THEN 1 + ELSE 0 END >= ?", + ); + let rank: i64 = match min { + kb_core::TrustLevel::Primary => 3, + kb_core::TrustLevel::Secondary => 2, + kb_core::TrustLevel::Generated => 1, + }; + bind.push(Box::new(rank)); + } + if !filters.tags_any.is_empty() { + let tag_ph = std::iter::repeat_n("?", filters.tags_any.len()) + .collect::>() + .join(","); + sql.push_str(&format!( + " AND EXISTS (SELECT 1 FROM document_tags t \ + WHERE t.doc_id = d.doc_id AND t.tag IN ({tag_ph}))" + )); + for tag in &filters.tags_any { + bind.push(Box::new(tag.clone())); + } + } + + // Optional path_glob: applied in Rust on the rows we get back, + // not in SQL — matching `kb-search::lexical`'s post-filter so + // the glob semantics are byte-identical between retrievers. + let path_matcher = match filters.path_glob.as_deref() { + Some(pat) => Some( + globset::GlobBuilder::new(pat) + .literal_separator(true) + .build() + .with_context(|| { + format!("kb-store-sqlite::filter_chunks: invalid path_glob {pat:?}") + })? 
+ .compile_matcher(), + ), + None => None, + }; + + let conn = self.read_conn(); + let mut stmt = conn + .prepare(&sql) + .context("kb-store-sqlite::filter_chunks: prepare SQL")?; + let rows = stmt + .query_map( + params_from_iter(bind.iter().map(|b| b.as_ref())), + |row| { + let chunk_id: String = row.get(0)?; + let workspace_path: String = row.get(1)?; + Ok((chunk_id, workspace_path)) + }, + ) + .context("kb-store-sqlite::filter_chunks: execute SQL")?; + + let mut allowed: HashMap = HashMap::new(); + for r in rows { + let (chunk_id, workspace_path) = + r.context("kb-store-sqlite::filter_chunks: read row")?; + allowed.insert(chunk_id, workspace_path); + } + + let mut out = Vec::with_capacity(chunk_ids.len()); + for cand in chunk_ids { + let workspace_path = match allowed.get(&cand.0) { + Some(p) => p, + None => continue, + }; + if let Some(m) = &path_matcher { + if !m.is_match(workspace_path) { + continue; + } + } + out.push(cand.clone()); + } + Ok(out) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_config::Config; + use kb_core::{ChunkId, Lang, SearchFilters, TrustLevel}; + use rusqlite::params; + use tempfile::TempDir; + use time::OffsetDateTime; + + use crate::EmbeddingRecordRow; + + fn open_store(tmp: &TempDir) -> SqliteStore { + let mut c = Config::defaults(); + c.storage.data_dir = tmp.path().to_string_lossy().into_owned(); + let store = SqliteStore::open(&c).unwrap(); + store.run_migrations().unwrap(); + store + } + + /// Seed (asset, document, document_tags, chunk) rows + a + /// committed embedding_records row for a single chunk_id. Mirrors + /// the shape `kb-store-vector` builds in production. 
+ fn seed_committed( + store: &SqliteStore, + chunk_id: &str, + doc_id: &str, + workspace_path: &str, + lang: &str, + tags: &[&str], + trust: &str, + ) { + let asset_id = format!("a{}", &doc_id[..31]); + { + let conn = store.lock_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, '{}', 0, 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', ?, '1970-01-01T00:00:00Z')", + params![ + asset_id, + format!("file://{workspace_path}"), + workspace_path, + workspace_path, + ], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, ?, 'markdown', ?, 'v1', 1, 1, + '{}', '{}', '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, workspace_path, lang, trust], + ) + .unwrap(); + for t in tags { + conn.execute( + "INSERT INTO document_tags (doc_id, tag) VALUES (?, ?)", + params![doc_id, t], + ) + .unwrap(); + } + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]', + '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id], + ) + .unwrap(); + } + + let embed_row = EmbeddingRecordRow { + embedding_id: format!("e{}", &chunk_id[..31]), + chunk_id: chunk_id.to_string(), + model_id: "m".to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: "t".to_string(), + created_at: OffsetDateTime::UNIX_EPOCH, + }; + store + .put_embedding_records_pending(std::slice::from_ref(&embed_row)) + .unwrap(); + store + .mark_embedding_records_committed(std::slice::from_ref( + &embed_row.embedding_id, + )) + .unwrap(); + } 
+ + fn cid(s: &str) -> ChunkId { + ChunkId(s.to_string()) + } + + #[test] + fn filter_chunks_drops_uncommitted_rows() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + let c2 = "22222222222222222222222222222222"; + let d1 = "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1"; + let d2 = "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2"; + seed_committed(&store, c1, d1, "a.md", "en", &[], "primary"); + + // c2: chunk + doc but no committed embedding row. + let asset_id = format!("a{}", &d2[..31]); + let conn = store.lock_conn(); + conn.execute( + "INSERT INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, 'file://b.md', 'b.md', '{}', 0, + 'deadbeefdeadbeefdeadbeefdeadbeef', + 'reference', 'b.md', '1970-01-01T00:00:00Z')", + params![asset_id], + ) + .unwrap(); + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, 'b.md', NULL, 'en', 'markdown', 'primary', 'v1', + 1, 1, '{}', '{}', + '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![d2, asset_id], + ) + .unwrap(); + conn.execute( + "INSERT INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]', + '1970-01-01T00:00:00Z')", + params![c2, d2], + ) + .unwrap(); + drop(conn); + + let out = store + .filter_chunks(&[cid(c1), cid(c2)], &SearchFilters::default()) + .unwrap(); + assert_eq!(out, vec![cid(c1)]); + } + + #[test] + fn filter_chunks_tags_any_lang_trust_path_glob() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + // c1: tags=[ko-style], lang=en, primary, notes/a.md + // c2: tags=[other], lang=en, 
primary, notes/b.md + // c3: tags=[ko-style], lang=ko, secondary, notes/c.md + // c4: tags=[ko-style], lang=en, generated, src/d.md + let chunks = [ + ("11111111111111111111111111111111", "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "notes/a.md", "en", "primary", &["ko-style"][..]), + ("22222222222222222222222222222222", "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "notes/b.md", "en", "primary", &["other"][..]), + ("33333333333333333333333333333333", "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "notes/c.md", "ko", "secondary", &["ko-style"][..]), + ("44444444444444444444444444444444", "d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4d4", "src/d.md", "en", "generated", &["ko-style"][..]), + ]; + for (c, d, p, l, t, tags) in &chunks { + seed_committed(&store, c, d, p, l, tags, t); + } + + // tags_any=[ko-style] → c1, c3, c4 (drop c2). + let f = SearchFilters { + tags_any: vec!["ko-style".to_string()], + ..Default::default() + }; + let out = store + .filter_chunks( + &chunks.iter().map(|c| cid(c.0)).collect::>(), + &f, + ) + .unwrap(); + let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); + got.sort(); + assert_eq!(got, vec![chunks[0].0, chunks[2].0, chunks[3].0]); + + // + lang=en → drops c3. + let f = SearchFilters { + tags_any: vec!["ko-style".to_string()], + lang: Some(Lang("en".to_string())), + ..Default::default() + }; + let out = store + .filter_chunks( + &chunks.iter().map(|c| cid(c.0)).collect::>(), + &f, + ) + .unwrap(); + let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); + got.sort(); + assert_eq!(got, vec![chunks[0].0, chunks[3].0]); + + // + trust_min=Secondary → drops c4 (generated < secondary). 
+ let f = SearchFilters { + tags_any: vec!["ko-style".to_string()], + lang: Some(Lang("en".to_string())), + trust_min: Some(TrustLevel::Secondary), + ..Default::default() + }; + let out = store + .filter_chunks( + &chunks.iter().map(|c| cid(c.0)).collect::>(), + &f, + ) + .unwrap(); + let got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); + assert_eq!(got, vec![chunks[0].0]); + + // path_glob = "notes/*.md" with no other constraint → c1, c2, c3. + let f = SearchFilters { + path_glob: Some("notes/*.md".to_string()), + ..Default::default() + }; + let out = store + .filter_chunks( + &chunks.iter().map(|c| cid(c.0)).collect::>(), + &f, + ) + .unwrap(); + let mut got: Vec<&str> = out.iter().map(|c| c.0.as_str()).collect(); + got.sort(); + assert_eq!(got, vec![chunks[0].0, chunks[1].0, chunks[2].0]); + } + + #[test] + fn filter_chunks_preserves_input_order_and_dedupes() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let c1 = "11111111111111111111111111111111"; + let c2 = "22222222222222222222222222222222"; + let c3 = "33333333333333333333333333333333"; + seed_committed(&store, c1, "d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1d1", "a.md", "en", &[], "primary"); + seed_committed(&store, c2, "d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2d2", "b.md", "en", &[], "primary"); + seed_committed(&store, c3, "d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3d3", "c.md", "en", &[], "primary"); + + // Ask in the order c3, c1, c2; result must preserve that order. + let out = store + .filter_chunks(&[cid(c3), cid(c1), cid(c2)], &SearchFilters::default()) + .unwrap(); + assert_eq!(out, vec![cid(c3), cid(c1), cid(c2)]); + + // Duplicates in the input survive in the output (dedup is for + // the SQL IN-list only — caller may want repeats for ranking). 
+ let out = store + .filter_chunks(&[cid(c1), cid(c1), cid(c2)], &SearchFilters::default()) + .unwrap(); + assert_eq!(out, vec![cid(c1), cid(c1), cid(c2)]); + } + + #[test] + fn filter_chunks_empty_input_short_circuits() { + let tmp = TempDir::new().unwrap(); + let store = open_store(&tmp); + let out = store.filter_chunks(&[], &SearchFilters::default()).unwrap(); + assert!(out.is_empty()); + } +} diff --git a/crates/kb-store-sqlite/src/lib.rs b/crates/kb-store-sqlite/src/lib.rs index 1f50945..c006e60 100644 --- a/crates/kb-store-sqlite/src/lib.rs +++ b/crates/kb-store-sqlite/src/lib.rs @@ -8,18 +8,25 @@ //! //! Allowed deps per task spec: `kb-core`, `kb-config`, `rusqlite`, //! `refinery`, `serde_json`, `time`, `blake3`, `tracing`, `anyhow`, -//! `thiserror`. NOT allowed: `kb-parse-*`, `kb-normalize`, `kb-chunk`, -//! `kb-store-vector`, `kb-source-fs`, etc. (`kb-parse-md`, `kb-normalize`, -//! `kb-chunk` may appear as **dev-deps** — see `Cargo.toml` — to drive -//! the contract round-trip test off a real Markdown fixture.) +//! `thiserror`. `globset` was added in P3-3 to back the +//! `filter_chunks` helper (used by `kb-store-vector`'s post-filter +//! pass — moving the SQL JOIN into this crate kept `kb-store-vector` +//! from needing its own `rusqlite` / `globset` direct deps). NOT +//! allowed: `kb-parse-*`, `kb-normalize`, `kb-chunk`, `kb-store-vector`, +//! `kb-source-fs`, etc. (`kb-parse-md`, `kb-normalize`, `kb-chunk` may +//! appear as **dev-deps** — see `Cargo.toml` — to drive the contract +//! round-trip test off a real Markdown fixture.) 
mod documents; +mod embeddings; mod error; +mod filters; mod fts; mod jobs; mod schema; mod store; +pub use embeddings::EmbeddingRecordRow; pub use error::StoreError; pub use fts::rebuild_chunks_fts; pub use store::SqliteStore; diff --git a/crates/kb-store-vector/Cargo.toml b/crates/kb-store-vector/Cargo.toml new file mode 100644 index 0000000..3bccdbc --- /dev/null +++ b/crates/kb-store-vector/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "kb-store-vector" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "LanceDB-backed VectorStore for kb (§5.6 embedding_records, §6.3 lancedb tables, §7.2 VectorStore)" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +# kb-store-sqlite is allowed for the embedding_records writers only +# (P3-3 spec: "Allowed dep `kb-store-sqlite` for writing/reading rows in +# embedding_records"). The Two-phase upsert flow uses +# `put_embedding_records_pending` + `mark_embedding_records_committed`. +kb-store-sqlite = { path = "../kb-store-sqlite" } + +# LanceDB embedded vector store. `default-features=false` opts out of +# the cloud object-store integrations (aws / gcs / azure / dynamodb / +# oss); kb is always-local for v1, so dragging in those SDKs would just +# inflate the build. +lancedb = { workspace = true } +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +# Embedded async runtime. The VectorStore trait is sync (§7.2) but +# LanceDB's Rust API is async-only; we own a current-thread +# tokio::Runtime and `block_on` per trait method. current-thread saves +# the two worker threads a multi-thread runtime would spawn — kb-app +# already serializes vector ops behind its own job scheduler so the +# extra parallelism wouldn't be exploited. 
+tokio = { workspace = true } +# `try_collect` for streaming Lance query results into a Vec. +futures = { workspace = true } + +serde = { workspace = true } +serde_json = { workspace = true } +tracing = { workspace = true } +thiserror = { workspace = true } +anyhow = { workspace = true } +blake3 = { workspace = true } +time = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +serde_json = { workspace = true } +# Integration tests seed `documents` / `chunks` fixtures by raw SQL +# (no kb-parse-md / kb-normalize / kb-chunk dep) so they can construct +# adversarial filter / dim-mismatch states. rusqlite is a `[dev-]` +# dep only — the runtime crate uses kb-store-sqlite's typed surface +# (`filter_chunks`, `put_embedding_records_pending`, …) and does not +# touch rusqlite directly (P3-3 spec: kb-store-vector must not list +# rusqlite/globset as direct deps). +rusqlite = { workspace = true } diff --git a/crates/kb-store-vector/src/arrow_batch.rs b/crates/kb-store-vector/src/arrow_batch.rs new file mode 100644 index 0000000..f17587b --- /dev/null +++ b/crates/kb-store-vector/src/arrow_batch.rs @@ -0,0 +1,232 @@ +//! Arrow schema + RecordBatch builder for the per-model Lance table. +//! +//! Per design §6.3 the per-row layout is: +//! +//! ```text +//! chunk_id : Utf8 (primary) +//! doc_id : Utf8 +//! embedding : FixedSizeList +//! model_id : Utf8 +//! embedding_version : Utf8 +//! text : Utf8 +//! heading_path : Utf8 (JSON-encoded Vec) +//! created_at : Timestamp(Microsecond, UTC) +//! ``` +//! +//! `heading_path` is encoded as a JSON string rather than a Lance +//! `List` to keep the `only_if` SQL filter surface clean — Lance +//! exposes scalar columns to its query DSL trivially, but list columns +//! need `array_contains`-style helpers that aren't required by the +//! current `SearchFilters` shape. 
+
+use std::sync::Arc;
+
+use anyhow::{Context, Result};
+use arrow_array::{
+    ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, StringArray,
+    TimestampMicrosecondArray,
+};
+use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
+use kb_core::VectorRecord;
+use time::OffsetDateTime;
+
+/// Arrow schema for a Lance table whose vector column is FixedSizeList
+/// of `dim` Float32. All non-vector columns are non-nullable; the
+/// vector column itself is non-nullable but the inner Float32 slot is
+/// nullable per Arrow convention (Lance ignores the inner-nullable
+/// flag when the outer field is non-null).
+pub(crate) fn schema_for(dim: usize) -> SchemaRef {
+    Arc::new(Schema::new(vec![
+        Field::new("chunk_id", DataType::Utf8, false),
+        Field::new("doc_id", DataType::Utf8, false),
+        Field::new(
+            "embedding",
+            DataType::FixedSizeList(
+                Arc::new(Field::new("item", DataType::Float32, true)),
+                dim as i32,
+            ),
+            false,
+        ),
+        Field::new("model_id", DataType::Utf8, false),
+        Field::new("embedding_version", DataType::Utf8, false),
+        Field::new("text", DataType::Utf8, false),
+        Field::new("heading_path", DataType::Utf8, false),
+        Field::new(
+            "created_at",
+            DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
+            false,
+        ),
+    ]))
+}
+
+/// Build a `RecordBatch` from `recs`. All records must share `dim`;
+/// callers are expected to pre-bucket per-table batches before reaching
+/// here. The batch carries `recs.len()` rows; `now` is folded into
+/// `created_at` for every row to match design §6.3.
+///
+/// Returns an error if any record's vector length differs from `dim`,
+/// if a `heading_path` fails to serialize, or if Arrow rejects the
+/// assembled columns.
+pub(crate) fn build_batch(
+    recs: &[VectorRecord],
+    dim: usize,
+    now: OffsetDateTime,
+) -> Result<RecordBatch> {
+    let schema = schema_for(dim);
+
+    // Column order below must match the field order in `schema_for`.
+    let chunk_ids = StringArray::from(
+        recs.iter().map(|r| r.chunk_id.0.as_str()).collect::<Vec<&str>>(),
+    );
+    let doc_ids = StringArray::from(
+        recs.iter().map(|r| r.doc_id.0.as_str()).collect::<Vec<&str>>(),
+    );
+    let model_ids = StringArray::from(
+        recs.iter().map(|r| r.model_id.0.as_str()).collect::<Vec<&str>>(),
+    );
+    let model_versions = StringArray::from(
+        recs.iter()
+            .map(|r| r.model_version.0.as_str())
+            .collect::<Vec<&str>>(),
+    );
+    let texts =
+        StringArray::from(recs.iter().map(|r| r.text.as_str()).collect::<Vec<&str>>());
+
+    // heading_path: serde_json::Value::Array of strings, then to_string.
+    let heading_paths: Vec<String> = recs
+        .iter()
+        .map(|r| serde_json::to_string(&r.heading_path))
+        .collect::<Result<Vec<_>, _>>()
+        .context("serialize heading_path JSON")?;
+    let heading_path_arr = StringArray::from(
+        heading_paths.iter().map(String::as_str).collect::<Vec<&str>>(),
+    );
+
+    // Embedding: FixedSizeList. Build from the flat
+    // contiguous f32 buffer.
+    let mut flat: Vec<f32> = Vec::with_capacity(recs.len() * dim);
+    for r in recs {
+        // Reject the whole batch on the first wrong-length vector so a
+        // misaligned flat buffer is never handed to Arrow.
+        if r.vector.len() != dim {
+            anyhow::bail!(
+                "vector length {} does not match table dim {} for chunk {}",
+                r.vector.len(),
+                dim,
+                r.chunk_id.0
+            );
+        }
+        flat.extend_from_slice(&r.vector);
+    }
+    let values = Float32Array::from(flat);
+    let embedding_field =
+        Arc::new(Field::new("item", DataType::Float32, true));
+    let embedding = FixedSizeListArray::try_new(
+        embedding_field,
+        dim as i32,
+        Arc::new(values),
+        None,
+    )
+    .context("build FixedSizeList embedding column")?;
+
+    // created_at: microseconds since Unix epoch, UTC.
+    let micros: Vec<i64> = std::iter::repeat_n(
+        (now.unix_timestamp_nanos() / 1_000) as i64,
+        recs.len(),
+    )
+    .collect();
+    let created_at = TimestampMicrosecondArray::from(micros).with_timezone("UTC");
+
+    let arrays: Vec<ArrayRef> = vec![
+        Arc::new(chunk_ids) as ArrayRef,
+        Arc::new(doc_ids),
+        Arc::new(embedding),
+        Arc::new(model_ids),
+        Arc::new(model_versions),
+        Arc::new(texts),
+        Arc::new(heading_path_arr),
+        Arc::new(created_at),
+    ];
+
+    RecordBatch::try_new(schema, arrays).context("assemble RecordBatch")
+}
+
+/// blake3-hex of the canonical JSON of the schema. Used as
+/// `params_hash` for `id_for_index` so the `IndexId` stays stable
+/// across invocations with the same `dim`.
+pub(crate) fn schema_params_hash(dim: usize) -> String {
+    // Keep the hash input shape self-describing so a future schema
+    // tweak (extra column, type change, …) bumps the hash and produces
+    // a different `IndexId` automatically.
+    let descriptor = serde_json::json!({
+        "version": 1,
+        "dim": dim,
+        "columns": [
+            {"name": "chunk_id", "type": "Utf8"},
+            {"name": "doc_id", "type": "Utf8"},
+            {"name": "embedding", "type": "FixedSizeList", "size": dim},
+            {"name": "model_id", "type": "Utf8"},
+            {"name": "embedding_version", "type": "Utf8"},
+            {"name": "text", "type": "Utf8"},
+            {"name": "heading_path", "type": "Utf8"},
+            {"name": "created_at", "type": "Timestamp"},
+        ],
+    });
+    let bytes = descriptor_bytes(&descriptor);
+    blake3::hash(&bytes).to_hex().to_string()
+}
+
+/// Serialize the schema descriptor to bytes for hashing. Plain
+/// `serde_json::to_vec` rather than a canonical-JSON crate is fine
+/// here because the descriptor is built from a fixed `serde_json::json!`
+/// literal in `schema_params_hash` — `serde_json` walks the object's
+/// key order deterministically (insertion order, since `Value::Object`
+/// uses `Map<String, Value>`), so the byte output is stable across runs without a
+/// canonicalizer. 
The empty-vec fallback on the (unreachable, given +/// our literal input) error path keeps the function infallible. +fn descriptor_bytes(v: &serde_json::Value) -> Vec { + serde_json::to_vec(v).unwrap_or_default() +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ChunkId, DocumentId, EmbeddingId, EmbeddingModelId, EmbeddingVersion}; + use time::OffsetDateTime; + + fn make_rec(chunk_idx: u8, dim: usize) -> VectorRecord { + VectorRecord { + chunk_id: ChunkId(format!("{:032x}", chunk_idx)), + embedding_id: EmbeddingId(format!("{:032x}", 0xeeeeu16 + chunk_idx as u16)), + vector: vec![0.1_f32; dim], + doc_id: DocumentId("aaaa".repeat(8)), + text: format!("text-{chunk_idx}"), + heading_path: vec!["A".to_string(), "B".to_string()], + model_id: EmbeddingModelId("test".to_string()), + model_version: EmbeddingVersion("v1".to_string()), + dimensions: dim, + } + } + + #[test] + fn build_batch_round_trip_basic() { + let recs = vec![make_rec(1, 4), make_rec(2, 4)]; + let batch = build_batch(&recs, 4, OffsetDateTime::UNIX_EPOCH).unwrap(); + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 8); + let schema = batch.schema(); + assert_eq!(schema.field(0).name(), "chunk_id"); + assert_eq!(schema.field(2).name(), "embedding"); + } + + #[test] + fn build_batch_dim_mismatch_errors() { + let mut rec = make_rec(1, 4); + rec.vector = vec![0.0_f32; 3]; + let err = build_batch(&[rec], 4, OffsetDateTime::UNIX_EPOCH).unwrap_err(); + let msg = format!("{err}"); + assert!(msg.contains("does not match table dim"), "msg={msg}"); + } + + #[test] + fn schema_params_hash_is_stable_for_dim() { + let h1 = schema_params_hash(384); + let h2 = schema_params_hash(384); + assert_eq!(h1, h2); + let h3 = schema_params_hash(512); + assert_ne!(h1, h3); + } +} diff --git a/crates/kb-store-vector/src/lib.rs b/crates/kb-store-vector/src/lib.rs new file mode 100644 index 0000000..94c5f70 --- /dev/null +++ b/crates/kb-store-vector/src/lib.rs @@ -0,0 +1,31 @@ +//! 
`kb-store-vector` — LanceDB-backed [`kb_core::VectorStore`] for kb. +//! +//! Stores per-model Lance tables under `config.storage.vector_dir/` +//! (`chunk_embeddings_<model_id>_<dim>.lance/`). `upsert` runs the +//! SQLite-first / Lance-second two-phase write described in design +//! §5.6: phase 1 stages `embedding_records` rows at `status='pending'`, +//! phase 2 issues a Lance `MergeInsert` keyed on `chunk_id`, phase 3 +//! flips the rows to `status='committed'`. `search` joins against +//! `embedding_records WHERE status='committed'` so partial-write Lance +//! rows never surface to callers; if the process crashes between phase +//! 2 and phase 3 (or phase 2 itself fails), the next `upsert` call +//! retries the still-pending rows idempotently because Lance MergeInsert +//! dedupes on `chunk_id`. +//! +//! Sync / async bridge: `VectorStore` is a sync trait (§7.2) and +//! LanceDB's Rust API is async-only. We own a private current-thread +//! `tokio::runtime::Runtime` and `block_on` per trait method. The +//! tradeoff is documented inline; multi-thread runtime would let two +//! upserts run concurrently but kb-app's job scheduler already +//! serializes vector ops, and current-thread avoids the worker-thread +//! pool (one thread per CPU core by default) a multi-thread runtime spawns. +//! +//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` +//! §5.6 (embedding_records DDL), §6.3 (lancedb table naming), +//! §7.2 (VectorStore), §9 (versioning). + +mod arrow_batch; +mod paths; +mod store; + +pub use store::LanceVectorStore; diff --git a/crates/kb-store-vector/src/paths.rs b/crates/kb-store-vector/src/paths.rs new file mode 100644 index 0000000..87d2979 --- /dev/null +++ b/crates/kb-store-vector/src/paths.rs @@ -0,0 +1,119 @@ +//! Path expansion + table-name sanitization. +//! +//! Mirrors `kb-store-sqlite::store::expand_data_dir` and +//! `kb-embed-local::expand_path` so the three crates resolve +//! `${XDG_DATA_HOME:-…}` / leading `~` / `{data_dir}` identically. A +//! 
shared helper would live in `kb-config`, but the task spec forbids +//! adding new types to `kb-config`, so we keep a private clone. + +use std::path::PathBuf; + +/// Expand `{data_dir}` → `data_dir`, `${XDG_DATA_HOME:-…}` → env or +/// default, leading `~` → `$HOME`. Pass an empty `data_dir` when +/// resolving `data_dir` itself (the `{data_dir}` substitution is a +/// no-op in that case). +pub(crate) fn expand_path(raw: &str, data_dir: &str) -> PathBuf { + let mut s = raw.to_string(); + + if !data_dir.is_empty() { + s = s.replace("{data_dir}", data_dir); + } + + // ${XDG_DATA_HOME:-~/.local/share}: env override, else default after `:-`. + if let Some(start) = s.find("${XDG_DATA_HOME") { + if let Some(rel_end) = s[start..].find('}') { + let end = start + rel_end + 1; + let inner = &s[start + 2..end - 1]; + let replacement = match std::env::var("XDG_DATA_HOME") { + Ok(v) if !v.is_empty() => v, + _ => { + if let Some((_, default)) = inner.split_once(":-") { + default.to_string() + } else { + String::new() + } + } + }; + s.replace_range(start..end, &replacement); + } + } + + if let Some(rest) = s.strip_prefix('~') { + if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) { + return home.join(rest.trim_start_matches('/')); + } + } + + PathBuf::from(s) +} + +/// Build the per-model Lance table name. Per design §6.3: +/// `chunk_embeddings__.lance`. Model IDs may contain +/// characters that are illegal in directory names on some filesystems +/// (Windows reserved chars, `/`, …) — squash anything outside +/// `[A-Za-z0-9-]` to `_` so the name is portable. +/// +/// LanceDB's `connect(uri).open_table(name)` resolves `name` against +/// the connection root; the trailing `.lance` is part of the directory +/// LanceDB itself appends when it materializes the table, so we pass +/// the bare logical name (`chunk_embeddings__`) and let +/// Lance manage the suffix. Spec text uses the suffixed form for the +/// on-disk path; both are present. 
+pub(crate) fn lance_table_name(model_id: &str, dim: usize) -> String { + let sanitized = sanitize_model_id(model_id); + format!("chunk_embeddings_{sanitized}_{dim}") +} + +/// Replace anything outside `[A-Za-z0-9-]` with `_`. Idempotent. +pub(crate) fn sanitize_model_id(model_id: &str) -> String { + model_id + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' { + c + } else { + '_' + } + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sanitize_replaces_path_separators() { + assert_eq!(sanitize_model_id("BAAI/bge-small-en"), "BAAI_bge-small-en"); + } + + #[test] + fn sanitize_keeps_dash_and_alpha_num() { + assert_eq!(sanitize_model_id("e5-small-v2"), "e5-small-v2"); + } + + #[test] + fn sanitize_squashes_dot_and_colon() { + assert_eq!(sanitize_model_id("model.v1:fast"), "model_v1_fast"); + } + + #[test] + fn lance_table_name_format() { + assert_eq!( + lance_table_name("BAAI/bge-small-en", 384), + "chunk_embeddings_BAAI_bge-small-en_384" + ); + } + + #[test] + fn expand_path_substitutes_data_dir() { + let p = expand_path("{data_dir}/lancedb", "/tmp/kbtest"); + assert_eq!(p, PathBuf::from("/tmp/kbtest/lancedb")); + } + + #[test] + fn expand_path_passthrough_absolute() { + let p = expand_path("/abs/dir", "/ignored"); + assert_eq!(p, PathBuf::from("/abs/dir")); + } +} diff --git a/crates/kb-store-vector/src/store.rs b/crates/kb-store-vector/src/store.rs new file mode 100644 index 0000000..124c11c --- /dev/null +++ b/crates/kb-store-vector/src/store.rs @@ -0,0 +1,551 @@ +//! `LanceVectorStore` — `kb_core::VectorStore` impl over LanceDB. +//! +//! See module-level docs in `lib.rs` for the high-level shape (two-phase +//! upsert, sync/async bridge, table layout). 
+ +use std::collections::HashSet; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use arrow_array::{Array, Float32Array, RecordBatch, StringArray}; +use arrow_schema::SchemaRef; +use futures::TryStreamExt; +use kb_core::{ + ChunkId, DocumentId, EmbeddingModelId, IndexId, SearchFilters, + VectorHit, VectorRecord, VectorStore, +}; +use kb_store_sqlite::{EmbeddingRecordRow, SqliteStore}; +use lancedb::Connection; +use lancedb::query::{ExecutableQuery, QueryBase}; +use serde_json::json; +use time::OffsetDateTime; +use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; + +use crate::arrow_batch::{build_batch, schema_for, schema_params_hash}; +use crate::paths::{expand_path, lance_table_name}; + +/// Overfetch multiplier: when post-filtering Lance results against +/// SQLite-side filters we ask for `2 * k` candidates so a moderately +/// selective filter still returns `k` hits. P3-3 spec line 138 caps +/// the doubling at this multiplier; deeper retries are out of scope. +const OVERFETCH_MULTIPLIER: usize = 2; + +/// `IndexId` collection label per design §4.2. +const INDEX_COLLECTION: &str = "chunk_embeddings"; + +/// `IndexId` kind label — flat cosine for v1 (§7.2 + spec line 85). +const INDEX_KIND: &str = "flat"; + +/// `IndexVersion` token. The schema doesn't expose IndexVersion as a +/// dimension we vary per call, but `id_for_index` requires one; pin to +/// `v1` so re-runs produce stable IDs. +const INDEX_VERSION: &str = "v1"; + +/// Lance VectorStore. +/// +/// Holds a single `lancedb::Connection` opened against +/// `config.storage.vector_dir/`. The connection is cheap to clone via +/// `Arc` internally and is reused across `ensure_table` / `upsert` / +/// `search`. The `tokio::Runtime` is current-thread; multi-thread +/// would buy concurrency we don't currently exploit (kb-app job +/// scheduler serializes vector ops) at the cost of two worker +/// threads. 
+/// +/// # Async context +/// +/// `LanceVectorStore` owns a private `tokio::runtime::Runtime` and +/// drives every `VectorStore` trait method through `runtime.block_on`. +/// **Do NOT construct or call any of these methods from inside another +/// tokio runtime context** — `block_on` panics with `"Cannot start a +/// runtime from within a runtime"` in that case. `kb-app`'s job +/// scheduler is synchronous so this is safe today; if a future caller +/// wants to embed `LanceVectorStore` inside an async server they must +/// wrap calls in `tokio::task::spawn_blocking` (or move to an +/// async-native `VectorStore` impl). +pub struct LanceVectorStore { + runtime: Runtime, + connection: Connection, + sqlite: Arc, + /// Resolved absolute path to the Lance root. Kept for diagnostics + /// only — the `Connection` already knows it. + #[allow(dead_code)] + vector_dir: PathBuf, +} + +impl LanceVectorStore { + /// Open (or create) the Lance directory under + /// `config.storage.vector_dir`, build a current-thread tokio + /// runtime, and return a ready-to-use store. Migrations on the + /// SQLite side must already have been applied (`run_migrations`) + /// — this constructor does not touch the SQLite schema. + /// + /// **Caveat:** internally calls `runtime.block_on` to open the + /// Lance connection. Calling this from inside another tokio + /// runtime context will panic with `"Cannot start a runtime from + /// within a runtime"`. See the struct-level `# Async context` + /// section. + pub fn new(config: &kb_config::Config, sqlite: Arc) -> Result { + let data_dir = expand_path(&config.storage.data_dir, ""); + let vector_dir = + expand_path(&config.storage.vector_dir, &data_dir.to_string_lossy()); + std::fs::create_dir_all(&vector_dir) + .with_context(|| format!("create vector_dir {}", vector_dir.display()))?; + + // current-thread runtime: see module docs. Multi-thread would + // spawn two worker threads we don't use. 
+ let runtime = RuntimeBuilder::new_current_thread() + .enable_all() + .build() + .context("build tokio runtime for kb-store-vector")?; + + let uri = vector_dir.to_string_lossy().into_owned(); + let connection = runtime + .block_on(async { + lancedb::connect(&uri) + .execute() + .await + .context("lancedb::connect") + })?; + + tracing::debug!( + target: "kb-store-vector", + vector_dir = %vector_dir.display(), + "opened LanceVectorStore" + ); + + Ok(Self { + runtime, + connection, + sqlite, + vector_dir, + }) + } + + /// Open or create the Lance table with the current schema. Returns + /// a handle the caller can use for queries. + async fn ensure_table_async( + connection: &Connection, + table_name: &str, + dim: usize, + ) -> Result { + match connection.open_table(table_name).execute().await { + Ok(t) => Ok(t), + Err(lancedb::Error::TableNotFound { .. }) => { + let schema = schema_for(dim); + let table = connection + .create_empty_table(table_name, schema) + .execute() + .await + .context("create_empty_table")?; + tracing::info!( + target: "kb-store-vector", + table = table_name, + dim, + "created Lance table" + ); + Ok(table) + } + Err(e) => Err(anyhow::Error::from(e)).context("open_table"), + } + } + + /// Validate that the on-disk Lance table's schema matches what + /// `schema_for(dim)` produces. Used by `upsert` to fail fast on a + /// dim mismatch BEFORE any phase-1 SQLite write lands. 
+ fn check_dim(table_schema: &SchemaRef, dim: usize) -> Result<()> { + let field = table_schema + .field_with_name("embedding") + .context("table missing 'embedding' column")?; + match field.data_type() { + arrow_schema::DataType::FixedSizeList(_, table_dim) => { + if (*table_dim as usize) != dim { + anyhow::bail!( + "dimension mismatch: table has dim {}, records have dim {}", + table_dim, + dim + ); + } + Ok(()) + } + other => anyhow::bail!( + "embedding column has unexpected Arrow type {:?}", + other + ), + } + } +} + +impl VectorStore for LanceVectorStore { + fn ensure_table( + &self, + model: &EmbeddingModelId, + dim: usize, + ) -> Result<IndexId> { + let table_name = lance_table_name(&model.0, dim); + // The trait method only needs the IndexId — we don't return the + // Lance handle. Open (or create) the table to enforce idempotence + // (a second call with the same params must succeed and yield + // the same IndexId). + self.runtime.block_on(async { + Self::ensure_table_async(&self.connection, &table_name, dim).await + })?; + + let params_hash = schema_params_hash(dim); + let id = kb_core::id_for_index( + INDEX_COLLECTION, + model, + dim, + &kb_core::IndexVersion(INDEX_VERSION.to_string()), + INDEX_KIND, + &params_hash, + ); + Ok(id) + } + + fn upsert(&self, recs: &[VectorRecord]) -> Result<()> { + if recs.is_empty() { + return Ok(()); + } + + // All records in a single upsert call must share (model_id, + // model_version, dimensions). Callers (kb-app indexer) already + // batch by model; we enforce here so a misuse fails loudly. 
+ let model_id = recs[0].model_id.clone(); + let model_version = recs[0].model_version.clone(); + let dim = recs[0].dimensions; + for r in recs { + if r.model_id != model_id + || r.model_version != model_version + || r.dimensions != dim + { + anyhow::bail!( + "kb-store-vector::upsert called with mixed (model_id, model_version, dim) — caller must bucket per table" + ); + } + } + + let table_name = lance_table_name(&model_id.0, dim); + + // Open (or create) the Lance table FIRST and check its on-disk + // dim against what the records claim. A mismatch must error + // before any phase-1 SQLite write — spec line 94: "Dimension + // mismatch returns Error from upsert and writes nothing." + let table = self.runtime.block_on(async { + Self::ensure_table_async(&self.connection, &table_name, dim).await + })?; + let table_schema = self + .runtime + .block_on(async { table.schema().await.context("read table schema") })?; + Self::check_dim(&table_schema, dim)?; + + // Build the Arrow batch BEFORE phase 1. `build_batch` is where a + // record whose vector length disagrees with `dim` is rejected, and + // that failure must not leave 'pending' rows behind in SQLite. + let now = OffsetDateTime::now_utc(); + let batch = build_batch(recs, dim, now)?; + + // Phase 1: stage embedding_records rows at status='pending'. + let pending_rows: Vec<EmbeddingRecordRow> = recs + .iter() + .map(|r| EmbeddingRecordRow { + embedding_id: r.embedding_id.0.clone(), + chunk_id: r.chunk_id.0.clone(), + model_id: r.model_id.0.clone(), + model_version: r.model_version.0.clone(), + dimensions: r.dimensions, + lance_table: table_name.clone(), + created_at: now, + }) + .collect(); + self.sqlite + .put_embedding_records_pending(&pending_rows) + .context("phase 1: stage pending embedding_records")?; + + // Phase 2: Lance MergeInsert keyed on chunk_id. + merge_insert_batch(&self.runtime, &table, batch) + .context("phase 2: Lance MergeInsert")?; + + // Phase 3: flip rows to status='committed'. 
If we crashed + // between phase 2 and phase 3, the rows stay 'pending' and a + // future upsert call retries them (Lance MergeInsert dedupes + // on chunk_id, so the retry is a no-op on the Lance side). 
+ let embedding_ids: Vec<String> = + recs.iter().map(|r| r.embedding_id.0.clone()).collect(); + self.sqlite + .mark_embedding_records_committed(&embedding_ids) + .context("phase 3: mark embedding_records committed")?; + + tracing::info!( + target: "kb-store-vector", + table = %table_name, + rows = recs.len(), + "upsert committed" + ); + Ok(()) + } + + fn search( + &self, + query_vec: &[f32], + k: usize, + filters: &SearchFilters, + ) -> Result<Vec<VectorHit>> { + if k == 0 { + return Ok(Vec::new()); + } + + // We need to know which table to query. SearchFilters doesn't + // carry a model_id (the trait doesn't expose one to the + // caller), so we scan known tables on disk and pick the one + // matching `query_vec.len()`. In v1 there's typically one + // model in play; if there are several we pick the first match. + let dim = query_vec.len(); + let table_name = match self + .runtime + .block_on(async { find_matching_table(&self.connection, dim).await })? + { + Some(name) => name, + None => { + tracing::debug!( + target: "kb-store-vector", + dim, + "search: no Lance table matches query dim — returning empty" + ); + return Ok(Vec::new()); + } + }; + + // Pre-fetch 2*k Lance rows; we'll filter against SQLite + // afterwards. If filters are empty we still over-fetch to + // exclude tombstoned / pending rows. + let overfetch = k.saturating_mul(OVERFETCH_MULTIPLIER).max(k); + let raw_hits = self.runtime.block_on(async { + let table = match self.connection.open_table(&table_name).execute().await + { + Ok(t) => t, + Err(lancedb::Error::TableNotFound { .. }) => return Ok(Vec::new()), + Err(e) => return Err(anyhow::Error::from(e)), + }; + + let stream = table + .vector_search(query_vec) + .context("vector_search")? 
+ .distance_type(lancedb::DistanceType::Cosine) + .limit(overfetch) + .execute() + .await + .context("execute vector query")?; + let batches: Vec<RecordBatch> = + stream.try_collect().await.context("collect batches")?; + Result::<Vec<RecordBatch>>::Ok(batches) + })?; + + let candidates = decode_lance_hits(&raw_hits)?; + + // Filter against embedding_records (status='committed') and + // documents (tags / lang / path / trust). For the empty filter + // case the join still excludes tombstoned / pending rows. + // The `filter_chunks` helper lives in kb-store-sqlite (the + // crate that owns the schema), so this crate doesn't need its + // own rusqlite / globset direct deps. + let candidate_ids: Vec<ChunkId> = { + // Deduplicate — Lance result batches can in principle + // repeat a chunk_id across batches; the JOIN is most + // efficient if we ask once per id. + let mut seen = HashSet::new(); + candidates + .iter() + .filter(|c| seen.insert(c.chunk_id.0.clone())) + .map(|c| c.chunk_id.clone()) + .collect() + }; + let allowed_set: HashSet<String> = self + .sqlite + .filter_chunks(&candidate_ids, filters) + .context("post-filter chunks via kb-store-sqlite")? + .into_iter() + .map(|c| c.0) + .collect(); + + let mut hits: Vec<VectorHit> = candidates + .into_iter() + .filter(|c| allowed_set.contains(&c.chunk_id.0)) + .take(k) + .map(LanceCandidate::into_hit) + .collect(); + // Re-rank by score desc to give callers a consistent ordering + // regardless of post-filter shuffling. + hits.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + Ok(hits) + } +} + +/// One Lance row decoded from a query batch, paired with the converted +/// score and pre-built JSON payload. We keep `chunk_id` separately so +/// the SQLite filter pass can JOIN against it without re-parsing the +/// payload. 
+struct LanceCandidate { + chunk_id: ChunkId, + doc_id: DocumentId, + text: String, + heading_path: Vec, + score: f32, +} + +impl LanceCandidate { + fn into_hit(self) -> VectorHit { + let payload = json!({ + "doc_id": self.doc_id.0, + "text": self.text, + "heading_path": self.heading_path, + }); + VectorHit { + chunk_id: self.chunk_id, + score: self.score, + payload, + } + } +} + +/// Decode a list of Lance result batches into typed candidates. +/// Lance's vector query attaches a `_distance: Float32` column; we +/// convert to similarity via `1 - distance` then shift to `[0, 1]` +/// via `(sim + 1) / 2` per spec line 96. NaN distances get score 0 +/// (with a warn log). +fn decode_lance_hits(batches: &[RecordBatch]) -> Result> { + let mut out = Vec::new(); + for batch in batches { + let chunk_ids = batch + .column_by_name("chunk_id") + .context("missing chunk_id col")? + .as_any() + .downcast_ref::() + .context("chunk_id wrong type")?; + let doc_ids = batch + .column_by_name("doc_id") + .context("missing doc_id col")? + .as_any() + .downcast_ref::() + .context("doc_id wrong type")?; + let texts = batch + .column_by_name("text") + .context("missing text col")? + .as_any() + .downcast_ref::() + .context("text wrong type")?; + let heading_path_str = batch + .column_by_name("heading_path") + .context("missing heading_path col")? + .as_any() + .downcast_ref::() + .context("heading_path wrong type")?; + let distances = batch + .column_by_name("_distance") + .context("missing _distance col")? 
+ .as_any() + .downcast_ref::() + .context("_distance wrong type")?; + + for i in 0..batch.num_rows() { + let dist = distances.value(i); + let score = score_from_distance(dist); + let heading_path: Vec = serde_json::from_str( + heading_path_str.value(i), + ) + .unwrap_or_default(); + out.push(LanceCandidate { + chunk_id: ChunkId(chunk_ids.value(i).to_string()), + doc_id: DocumentId(doc_ids.value(i).to_string()), + text: texts.value(i).to_string(), + heading_path, + score, + }); + } + } + Ok(out) +} + +/// Convert a cosine distance (LanceDB returns `1 - cosine_similarity` +/// in `[0, 2]` for L2-normalized vectors) to a `[0, 1]` score via +/// `score = ((1 - distance) + 1) / 2`. Per spec line 96 the shift +/// (rather than clamp) preserves ordering between unrelated and +/// opposite vectors. NaN — which Lance can produce when one side is +/// the all-zero vector — collapses to 0 with a warn. +fn score_from_distance(distance: f32) -> f32 { + if distance.is_nan() { + tracing::warn!( + target: "kb-store-vector", + "NaN cosine distance from Lance — coercing to score 0" + ); + return 0.0; + } + let sim = 1.0 - distance; + (sim + 1.0) / 2.0 +} + +/// Find a Lance table whose embedding column is FixedSizeList. 
+async fn find_matching_table( + connection: &Connection, + dim: usize, +) -> Result> { + let names = connection + .table_names() + .execute() + .await + .context("table_names")?; + for name in names { + if !name.starts_with("chunk_embeddings_") { + continue; + } + match connection.open_table(&name).execute().await { + Ok(t) => { + let schema = t.schema().await.context("schema for table")?; + if let Ok(field) = schema.field_with_name("embedding") { + if let arrow_schema::DataType::FixedSizeList(_, table_dim) = + field.data_type() + { + if (*table_dim as usize) == dim { + return Ok(Some(name)); + } + } + } + } + Err(e) => { + tracing::warn!( + target: "kb-store-vector", + table = %name, + error = %e, + "search: skipped unopenable table" + ); + } + } + } + Ok(None) +} + +/// Run the Lance MergeInsert under our embedded runtime. Pulled out +/// of `upsert` so the trait method stays compact. +fn merge_insert_batch( + runtime: &Runtime, + table: &lancedb::Table, + batch: RecordBatch, +) -> Result<()> { + let schema = batch.schema(); + runtime.block_on(async move { + let reader = arrow_array::RecordBatchIterator::new( + vec![Ok(batch)].into_iter(), + schema, + ); + let mut builder = table.merge_insert(&["chunk_id"]); + builder + .when_matched_update_all(None) + .when_not_matched_insert_all(); + builder + .execute(Box::new(reader)) + .await + .context("MergeInsert execute")?; + Result::<()>::Ok(()) + }) +} + diff --git a/crates/kb-store-vector/tests/common/mod.rs b/crates/kb-store-vector/tests/common/mod.rs new file mode 100644 index 0000000..00031b1 --- /dev/null +++ b/crates/kb-store-vector/tests/common/mod.rs @@ -0,0 +1,185 @@ +//! Shared scaffolding for kb-store-vector integration tests. +//! +//! # Test policy +//! +//! Integration tests in this crate are marked `#[ignore]` and require +//! AVX-capable hardware. They are excluded from the default `cargo +//! test -p kb-store-vector` lane and only run when explicitly opted +//! in: +//! +//! ```text +//! 
cargo test -p kb-store-vector -- --ignored +//! ``` +//! +//! The reason: LanceDB's f32 SIMD path uses unconditional AVX +//! intrinsics (`__m256` in `lance-linalg::simd::f32`). On x86_64 +//! CPUs without AVX support — notably QEMU's default `qemu64` model +//! in CI sandboxes and some bare-metal dev boxes — those instructions +//! trigger `SIGILL: illegal instruction` at the first `vector_search` +//! call. Rather than silently turn that into a "passing" test (which +//! it isn't), we gate the integration suite behind `#[ignore]` and +//! call [`require_avx_or_panic`] inside each test body so that an +//! `--ignored` invocation on a non-AVX host fails loudly rather than +//! crashing later inside a Lance kernel. +//! +//! This mirrors P3-2's `#[ignore]` policy on tests that require a +//! model download — both are CI-lane decisions, not silent skips. +//! +//! Each test owns a `TempDir` (vector_dir + sqlite db live underneath +//! it), a fully-migrated `SqliteStore`, and a `LanceVectorStore` +//! pointed at both. We seed `documents` / `chunks` rows directly via +//! SQL (rather than going through `DocumentStore::put_document`) so +//! the tests stay independent of kb-parse-md / kb-normalize / kb-chunk +//! and so we can construct adversarial fixtures (filtered tags, +//! mismatched langs) without reproducing a Markdown round-trip. + +#![allow(dead_code)] + +use std::path::PathBuf; +use std::sync::Arc; + +/// Panic if the host CPU lacks AVX. Called from every `#[ignore]`-d +/// integration test body so that `cargo test -- --ignored` on a +/// non-AVX host fails loudly with a clear message instead of crashing +/// later inside a Lance SIMD kernel with `SIGILL`. +/// +/// On non-x86_64 hosts this is a no-op (Lance's AVX requirement is +/// x86-only — ARM/Apple Silicon paths use different intrinsics that +/// the workspace doesn't currently target). 
+pub fn require_avx_or_panic() { + #[cfg(target_arch = "x86_64")] + { + if !std::is_x86_feature_detected!("avx") { + panic!( + "kb-store-vector integration test requires AVX-capable hardware; \ + host CPU lacks AVX. Run on an AVX-capable machine. \ + See crates/kb-store-vector/tests/common/mod.rs." + ); + } + } +} + +use kb_config::Config; +use kb_core::{ + ChunkId, DocumentId, EmbeddingId, EmbeddingModelId, EmbeddingVersion, VectorRecord, +}; +use kb_store_sqlite::SqliteStore; +use kb_store_vector::LanceVectorStore; +use rusqlite::params; +use tempfile::TempDir; + +pub struct TestEnv { + pub temp: TempDir, + pub config: Config, + pub sqlite: Arc, + pub vector: LanceVectorStore, +} + +impl TestEnv { + pub fn new() -> Self { + let temp = tempfile::tempdir().expect("tempdir"); + let mut config = Config::defaults(); + config.storage.data_dir = temp.path().to_string_lossy().into_owned(); + let sqlite = SqliteStore::open(&config).unwrap(); + sqlite.run_migrations().unwrap(); + let sqlite = Arc::new(sqlite); + let vector = LanceVectorStore::new(&config, sqlite.clone()).unwrap(); + Self { + temp, + config, + sqlite, + vector, + } + } + + pub fn data_dir(&self) -> PathBuf { + self.temp.path().to_path_buf() + } + + /// Insert minimum (asset, document, chunk) rows so phase-1 + /// embedding_records inserts don't trip the FK to chunks / + /// documents. + pub fn seed_chunk( + &self, + chunk_id: &str, + doc_id: &str, + workspace_path: &str, + lang: &str, + tags: &[&str], + trust_level: &str, + ) { + // Asset id derived from doc_id deterministically — every + // chunk gets its own asset to keep things simple. 
+ let asset_id = format!("a{}", &doc_id[..31]); + let conn = self.sqlite.read_conn(); + conn.execute( + "INSERT OR IGNORE INTO assets ( + asset_id, source_uri, workspace_path, media_type, byte_len, + checksum, storage_kind, storage_path, discovered_at + ) VALUES (?, ?, ?, ?, 0, ?, 'reference', ?, '1970-01-01T00:00:00Z')", + params![ + asset_id, + format!("file://{workspace_path}"), + workspace_path, + "{}", + "deadbeefdeadbeefdeadbeefdeadbeef", + workspace_path, + ], + ) + .unwrap(); + conn.execute( + "INSERT OR IGNORE INTO documents ( + doc_id, asset_id, workspace_path, title, lang, source_type, + trust_level, parser_version, doc_version, schema_version, + metadata_json, provenance_json, created_at, updated_at + ) VALUES (?, ?, ?, NULL, ?, 'markdown', ?, 'v1', 1, 1, '{}', '{}', + '1970-01-01T00:00:00Z', '1970-01-01T00:00:00Z')", + params![doc_id, asset_id, workspace_path, lang, trust_level], + ) + .unwrap(); + for t in tags { + conn.execute( + "INSERT OR IGNORE INTO document_tags (doc_id, tag) VALUES (?, ?)", + params![doc_id, t], + ) + .unwrap(); + } + conn.execute( + "INSERT OR IGNORE INTO chunks ( + chunk_id, doc_id, text, heading_path_json, section_label, + source_spans_json, token_estimate, chunker_version, + policy_hash, block_ids_json, created_at + ) VALUES (?, ?, 'hi', '[]', NULL, '[]', 1, 'v1', 'h', '[]', '1970-01-01T00:00:00Z')", + params![chunk_id, doc_id], + ) + .unwrap(); + } +} + +/// Build a deterministic test VectorRecord from a few simple inputs. +/// `vector` is taken verbatim, `dimensions` is set from `vector.len()`. 
+pub fn make_record( + chunk_idx: u8, + doc_idx: u8, + vector: Vec, + text: &str, + heading: &[&str], + model: &str, +) -> VectorRecord { + let dim = vector.len(); + let chunk_id = ChunkId(format!("{:032x}", 0x1100u32 + chunk_idx as u32)); + let doc_id = DocumentId(format!("{:032x}", 0xd0c0u32 + doc_idx as u32)); + let embedding_id = + EmbeddingId(format!("{:032x}", 0xeeee0000u32 + chunk_idx as u32)); + VectorRecord { + chunk_id, + embedding_id, + vector, + doc_id, + text: text.to_string(), + heading_path: heading.iter().map(|s| s.to_string()).collect(), + model_id: EmbeddingModelId(model.to_string()), + model_version: EmbeddingVersion("v1".to_string()), + dimensions: dim, + } +} diff --git a/crates/kb-store-vector/tests/fixtures/vector/run-1.json b/crates/kb-store-vector/tests/fixtures/vector/run-1.json new file mode 100644 index 0000000..3862fe6 --- /dev/null +++ b/crates/kb-store-vector/tests/fixtures/vector/run-1.json @@ -0,0 +1,34 @@ +[ + { + "chunk_id": "00000000000000000000000000001100", + "payload": { + "doc_id": "0000000000000000000000000000d0c0", + "heading_path": [ + "A" + ], + "text": "alpha" + }, + "score_in_unit_interval": true + }, + { + "chunk_id": "00000000000000000000000000001101", + "payload": { + "doc_id": "0000000000000000000000000000d0c1", + "heading_path": [ + "A", + "B" + ], + "text": "beta" + }, + "score_in_unit_interval": true + }, + { + "chunk_id": "00000000000000000000000000001103", + "payload": { + "doc_id": "0000000000000000000000000000d0c3", + "heading_path": [], + "text": "delta" + }, + "score_in_unit_interval": true + } +] \ No newline at end of file diff --git a/crates/kb-store-vector/tests/snapshot.rs b/crates/kb-store-vector/tests/snapshot.rs new file mode 100644 index 0000000..5ac7e0d --- /dev/null +++ b/crates/kb-store-vector/tests/snapshot.rs @@ -0,0 +1,119 @@ +//! Snapshot test: a fixed corpus + fixed query produces a stable +//! `Vec` JSON. Pinning the snapshot here catches accidental +//! 
drift in score scaling, payload shape, or top-k ordering. +//! +//! This test is `#[ignore]` and requires AVX-capable hardware. Run +//! with `cargo test -p kb-store-vector -- --ignored snapshot`. +//! +//! The fixture lives at `tests/fixtures/vector/run-1.json`. A fixture +//! that still carries a top-level `_comment` field is treated as an +//! unregenerated placeholder: the test refuses to pass and panics with +//! a clear "regenerate me" message — see the `_comment` check inline +//! in the test body. + +use std::path::PathBuf; + +use kb_core::{SearchFilters, VectorStore}; +use serde_json::json; + +mod common; +use common::{TestEnv, make_record, require_avx_or_panic}; + +const MODEL: &str = "snapshot-model"; + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn vector_hits_snapshot_run_1() { + require_avx_or_panic(); + let env = TestEnv::new(); + // Fixed deterministic corpus: 4 unit-norm vectors, each with a + // known doc / chunk / heading. The query points squarely at + // chunk 0 so the expected ordering is 0, then the others by + // distance from dir(0). 
+ let corpus = vec![ + (0u8, vec![1.0_f32, 0.0, 0.0, 0.0], "alpha", &["A"][..]), + (1u8, vec![0.95_f32, 0.31, 0.0, 0.0], "beta", &["A", "B"][..]), + (2u8, vec![0.0_f32, 1.0, 0.0, 0.0], "gamma", &["B"][..]), + (3u8, vec![0.0_f32, 0.0, 1.0, 0.0], "delta", &[][..]), + ]; + + let mut recs = Vec::new(); + for (i, vec, text, headings) in &corpus { + let rec = make_record(*i, *i, vec.clone(), text, headings, MODEL); + env.seed_chunk( + &rec.chunk_id.0, + &rec.doc_id.0, + &format!("notes/{i}.md"), + "en", + &[], + "primary", + ); + recs.push(rec); + } + env.vector.upsert(&recs).unwrap(); + + let q = vec![1.0_f32, 0.0, 0.0, 0.0]; + let hits = env.vector.search(&q, 3, &SearchFilters::default()).unwrap(); + + // The snapshot pins: + // - top-3 chunk_id ordering (by score desc) + // - payload shape: { doc_id, text, heading_path } + // - that scores live in [0, 1] and are sorted descending + let actual = json!( + hits.iter().map(|h| json!({ + "chunk_id": h.chunk_id.0, + "score_in_unit_interval": (0.0..=1.0).contains(&h.score), + "payload": h.payload, + })).collect::>() + ); + + let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("vector") + .join("run-1.json"); + + if std::env::var_os("KB_UPDATE_SNAPSHOTS").is_some() { + std::fs::create_dir_all(fixture.parent().unwrap()).unwrap(); + std::fs::write(&fixture, serde_json::to_string_pretty(&actual).unwrap()) + .unwrap(); + return; + } + + let expected: serde_json::Value = + serde_json::from_str(&std::fs::read_to_string(&fixture).unwrap_or_else( + |_| panic!( + "missing snapshot fixture at {}; run with KB_UPDATE_SNAPSHOTS=1 to create", + fixture.display() + ), + )) + .unwrap(); + + // Refuse to silently "pass" when the fixture is the committed + // placeholder. The placeholder JSON carries a `_comment` field + // with regeneration instructions; production fixtures (a captured + // hits array) do not. 
+ if expected.get("_comment").is_some() { + panic!( + "snapshot fixture is a placeholder — regenerate on AVX hardware then commit. \ + Path: {}. To regenerate: \ + `KB_UPDATE_SNAPSHOTS=1 cargo test -p kb-store-vector -- --ignored snapshot`.", + fixture.display() + ); + } + + assert_eq!( + actual, expected, + "snapshot drift; rerun with KB_UPDATE_SNAPSHOTS=1 to regenerate" + ); + + // Independent guard: scores must be non-increasing. + for w in hits.windows(2) { + assert!( + w[0].score >= w[1].score, + "scores not in descending order: {} then {}", + w[0].score, + w[1].score + ); + } +} diff --git a/crates/kb-store-vector/tests/upsert_search.rs b/crates/kb-store-vector/tests/upsert_search.rs new file mode 100644 index 0000000..4e16f4c --- /dev/null +++ b/crates/kb-store-vector/tests/upsert_search.rs @@ -0,0 +1,374 @@ +//! Integration tests for `LanceVectorStore` covering ensure_table, +//! upsert, search, dimension mismatch, filters, model isolation, and +//! determinism. +//! +//! Every test in this file is `#[ignore]` and requires an AVX-capable +//! x86_64 host. Run with: +//! +//! ```text +//! cargo test -p kb-store-vector -- --ignored +//! ``` +//! +//! See `tests/common/mod.rs` for the full rationale. + +use kb_core::{EmbeddingModelId, SearchFilters, VectorStore}; +use kb_store_sqlite::EmbeddingRecordRow; +use rusqlite::params; +use time::OffsetDateTime; + +mod common; +use common::{TestEnv, make_record, require_avx_or_panic}; + +const MODEL: &str = "test-model"; + +/// Helper: produce a unit-norm 4-D vector pointing in one of four +/// directions. The sign pattern keeps cosine similarities cleanly +/// distinct so search ordering tests don't depend on float jitter. 
+fn dir(idx: u8) -> Vec { + match idx { + 0 => vec![1.0, 0.0, 0.0, 0.0], + 1 => vec![0.0, 1.0, 0.0, 0.0], + 2 => vec![0.0, 0.0, 1.0, 0.0], + _ => vec![0.0, 0.0, 0.0, 1.0], + } +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn ensure_table_idempotent_returns_same_index_id() { + require_avx_or_panic(); + let env = TestEnv::new(); + let model = EmbeddingModelId(MODEL.to_string()); + let id1 = env.vector.ensure_table(&model, 4).unwrap(); + let id2 = env.vector.ensure_table(&model, 4).unwrap(); + assert_eq!(id1, id2); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn search_before_upsert_returns_empty() { + require_avx_or_panic(); + let env = TestEnv::new(); + let hits = env + .vector + .search(&dir(0), 5, &SearchFilters::default()) + .unwrap(); + assert!(hits.is_empty()); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn upsert_ten_then_search_returns_five() { + require_avx_or_panic(); + let env = TestEnv::new(); + let mut recs = Vec::new(); + for i in 0..10u8 { + // 4-D vectors clustered near dir(0) for the first half, dir(1) + // for the rest, with small per-row jitter so they stay + // distinct in the index. + let mut v = if i < 5 { dir(0) } else { dir(1) }; + v[3] = (i as f32) * 0.001; + let rec = make_record(i, i, v, &format!("text-{i}"), &["A"], MODEL); + env.seed_chunk( + &rec.chunk_id.0, + &rec.doc_id.0, + &format!("notes/{i}.md"), + "en", + &[], + "primary", + ); + recs.push(rec); + } + env.vector.upsert(&recs).unwrap(); + + // 1:1 alignment check: every record has a committed embedding row. 
+ { + let conn = env.sqlite.read_conn(); + let count: i64 = conn + .query_row( + "SELECT COUNT(*) FROM embedding_records WHERE status = 'committed'", + [], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(count, 10); + } + + let hits = env + .vector + .search(&dir(0), 5, &SearchFilters::default()) + .unwrap(); + assert_eq!(hits.len(), 5, "expected 5 hits, got {}", hits.len()); + + // Top hits should be from the first half (clustered around dir(0)). + // make_record lays chunk_idx into the low bits of `0x1100 + i`, so + // `chunk_idx = u32::from_str_radix(last4, 16) - 0x1100`. The first + // half (chunk_idx < 5) lives in 0x1100..=0x1104. + for h in &hits { + let suffix_hex = &h.chunk_id.0[h.chunk_id.0.len() - 4..]; + let idx = u32::from_str_radix(suffix_hex, 16).unwrap(); + let chunk_idx = idx - 0x1100; + assert!( + chunk_idx < 5, + "top-5 hit unexpectedly came from second cluster: idx={chunk_idx}" + ); + } +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn dimension_mismatch_errors_and_writes_nothing() { + require_avx_or_panic(); + let env = TestEnv::new(); + let model = EmbeddingModelId(MODEL.to_string()); + + // First populate a 4-D table with one row so it exists on disk. + let r0 = make_record(0, 0, dir(0), "first", &[], MODEL); + env.seed_chunk(&r0.chunk_id.0, &r0.doc_id.0, "notes/0.md", "en", &[], "primary"); + env.vector.upsert(&[r0]).unwrap(); + assert_eq!(env.vector.ensure_table(&model, 4).unwrap(), env.vector.ensure_table(&model, 4).unwrap()); + + // Now manually open the same table_name path and try to upsert + // an 8-D vector through `upsert` — the table name function bakes + // dim into the name, so the only way to drive the real + // record-vs-table mismatch is to corrupt `dimensions` so the + // table_name is the existing 4-D table, but the embedded vector + // is 8-D. Spec line 94: must error, write nothing extra. 
+ let mut bad = make_record(1, 1, vec![0.1_f32; 8], "second", &[], MODEL); + // Pretend this is a 4-D vector for table-name purposes; the + // build_batch then enforces that vector.len() == dim and bails. + bad.dimensions = 4; + env.seed_chunk(&bad.chunk_id.0, &bad.doc_id.0, "notes/1.md", "en", &[], "primary"); + + let bad_chunk = bad.chunk_id.0.clone(); + let err = env.vector.upsert(&[bad]).unwrap_err(); + let msg = format!("{err:#}"); + assert!( + msg.to_lowercase().contains("dim") + || msg.contains("does not match table dim"), + "unexpected error message: {msg}" + ); + + // The phase-1 row may have landed before phase 2 detected the + // mismatch — but the on-disk Lance table must NOT contain the + // bad record. So we assert that no `committed` row corresponds + // to chunk_id of the bad record. + let conn = env.sqlite.read_conn(); + let committed: i64 = conn + .query_row( + "SELECT COUNT(*) FROM embedding_records WHERE chunk_id = ? AND status = 'committed'", + rusqlite::params![bad_chunk], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(committed, 0, "bad record reached committed state despite dim mismatch"); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn filter_tags_any_drops_non_matching_docs() { + require_avx_or_panic(); + let env = TestEnv::new(); + + // Two docs: one with tag "ko-style", one without. 
+ let r_a = make_record(0xaa, 0xaa, dir(0), "alpha", &[], MODEL); + let r_b = make_record(0xbb, 0xbb, dir(0), "beta", &[], MODEL); + env.seed_chunk( + &r_a.chunk_id.0, + &r_a.doc_id.0, + "notes/a.md", + "en", + &["ko-style"], + "primary", + ); + env.seed_chunk( + &r_b.chunk_id.0, + &r_b.doc_id.0, + "notes/b.md", + "en", + &["other"], + "primary", + ); + let expected_doc_id = r_a.doc_id.0.clone(); + env.vector.upsert(&[r_a, r_b]).unwrap(); + + let filters = SearchFilters { + tags_any: vec!["ko-style".to_string()], + ..Default::default() + }; + let hits = env.vector.search(&dir(0), 10, &filters).unwrap(); + assert_eq!(hits.len(), 1, "expected only the tagged doc to match"); + let payload = &hits[0].payload; + assert_eq!(payload["doc_id"], expected_doc_id); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn model_isolation_two_models_two_directories() { + require_avx_or_panic(); + let env = TestEnv::new(); + let r1 = make_record(0xaa, 0xaa, dir(0), "alpha", &[], "model-A"); + env.seed_chunk( + &r1.chunk_id.0, + &r1.doc_id.0, + "notes/a.md", + "en", + &[], + "primary", + ); + let chunk_id = r1.chunk_id.0.clone(); + env.vector.upsert(&[r1]).unwrap(); + + // Same chunk_id, different model — should land in a separate table. + let mut r2 = make_record(0xaa, 0xaa, dir(0), "alpha", &[], "model-B"); + r2.embedding_id = kb_core::EmbeddingId( + "ee01ee01ee01ee01ee01ee01ee01ee01".to_string(), + ); + env.vector.upsert(&[r2]).unwrap(); + + // Two on-disk Lance directories, distinguished by table name. 
+ let lancedb_root = env.data_dir().join("lancedb"); + let entries: Vec<_> = std::fs::read_dir(&lancedb_root) + .unwrap() + .filter_map(Result::ok) + .map(|e| e.file_name().to_string_lossy().into_owned()) + .collect(); + let a_count = entries + .iter() + .filter(|e| e.contains("model-A")) + .count(); + let b_count = entries + .iter() + .filter(|e| e.contains("model-B")) + .count(); + assert!(a_count >= 1, "model-A table missing: {entries:?}"); + assert!(b_count >= 1, "model-B table missing: {entries:?}"); + + // Two embedding_records rows for the same chunk_id, one per model. + let conn = env.sqlite.read_conn(); + let count: i64 = conn + .query_row( + "SELECT COUNT(*) FROM embedding_records WHERE chunk_id = ?", + params![chunk_id], + |r| r.get(0), + ) + .unwrap(); + assert_eq!(count, 2); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn determinism_same_query_same_top_k() { + require_avx_or_panic(); + let env = TestEnv::new(); + let recs: Vec<_> = (0..6u8) + .map(|i| { + let mut v = dir(i % 4); + v[3] = (i as f32) * 0.001; + let rec = make_record(i, i, v, &format!("t-{i}"), &[], MODEL); + env.seed_chunk( + &rec.chunk_id.0, + &rec.doc_id.0, + &format!("notes/{i}.md"), + "en", + &[], + "primary", + ); + rec + }) + .collect(); + env.vector.upsert(&recs).unwrap(); + // Issue the identical query twice and compare the ordered chunk ids. NOTE(review): nothing here asserts the hit lists are non-empty, so two empty results would also pass. + let q = dir(0); + let h1 = env.vector.search(&q, 4, &SearchFilters::default()).unwrap(); + let h2 = env.vector.search(&q, 4, &SearchFilters::default()).unwrap(); + let ids1: Vec<_> = h1.iter().map(|h| h.chunk_id.0.clone()).collect(); + let ids2: Vec<_> = h2.iter().map(|h| h.chunk_id.0.clone()).collect(); + assert_eq!(ids1, ids2); +} + +#[test] +#[ignore = "requires AVX-capable hardware (LanceDB)"] +fn upsert_retry_promotes_pending_to_committed() { + // Crash-recovery contract: a phase-1 row that was already + // committed by a prior batch is left alone by phase-3, but a + // pending row gets retried and reaches committed once Lance + // accepts it. 
+ // + // Construction of the "crash" state: + // + // 1. Stage a row directly via the SQLite phase-1 helper + // (`put_embedding_records_pending`). NO Lance write happens + // here — this is exactly the on-disk state after a crash + // between phase 1 and phase 2. Confirm the row is at + // `status='pending'` before doing anything else. + // + // 2. Run `LanceVectorStore::upsert` with a `VectorRecord` whose + // `embedding_id` matches the pending row. Phase 1's + // `INSERT OR REPLACE` is idempotent here (same row payload), + // phase 2 actually writes to Lance for the first time, and + // phase 3 flips the row to 'committed'. + // + // 3. Verify status='committed' and vector_committed=1. + // + // This actually exercises the "rows stuck at pending get promoted + // on next upsert" semantics — the previous version pre-seeded via + // raw SQL but then the same upsert call overwrote the seed via + // INSERT OR REPLACE before phase 2 ran, so the recovery path + // never executed. + require_avx_or_panic(); + let env = TestEnv::new(); + let rec = make_record(0xaa, 0xaa, dir(0), "alpha", &[], MODEL); + let chunk_id = rec.chunk_id.0.clone(); + let doc_id = rec.doc_id.0.clone(); + let embedding_id = rec.embedding_id.0.clone(); + env.seed_chunk(&chunk_id, &doc_id, "notes/a.md", "en", &[], "primary"); + + // Phase 1 only — go through the same kb-store-sqlite helper that + // `LanceVectorStore::upsert` uses internally. No Lance write + // happens, so this models "crashed between phase 1 and phase 2". + let pending_row = EmbeddingRecordRow { + embedding_id: embedding_id.clone(), + chunk_id: chunk_id.clone(), + model_id: MODEL.to_string(), + model_version: "v1".to_string(), + dimensions: 4, + lance_table: format!("chunk_embeddings_{MODEL}_4"), + created_at: OffsetDateTime::UNIX_EPOCH, + }; + env.sqlite + .put_embedding_records_pending(std::slice::from_ref(&pending_row)) + .unwrap(); + + // Sanity: the row is staged but NOT yet committed. Only the SQLite + // side is asserted below; the Lance side is not inspected here. 
+ { + let conn = env.sqlite.read_conn(); + let (status, committed): (String, i64) = conn + .query_row( + "SELECT status, vector_committed FROM embedding_records WHERE embedding_id = ?", + params![embedding_id], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(status, "pending", "row should be at status=pending after phase-1-only"); + assert_eq!(committed, 0); + } + + // Now run upsert with the matching record. Phase 1's INSERT OR + // REPLACE is a no-op equivalent (same row payload), phase 2 lands + // the Lance row for the first time, phase 3 promotes + // status='committed'. + env.vector.upsert(&[rec]).unwrap(); + + let conn = env.sqlite.read_conn(); + let (status, committed): (String, i64) = conn + .query_row( + "SELECT status, vector_committed FROM embedding_records WHERE embedding_id = ?", + params![embedding_id], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .unwrap(); + assert_eq!(status, "committed"); + assert_eq!(committed, 1); +} diff --git a/migrations/V003__embedding_status.sql b/migrations/V003__embedding_status.sql new file mode 100644 index 0000000..b4ae602 --- /dev/null +++ b/migrations/V003__embedding_status.sql @@ -0,0 +1,46 @@ +-- V003__embedding_status.sql — additive embedding lifecycle markers (§5.6). +-- +-- P3-3 introduces a two-phase write to `embedding_records` paired with +-- a Lance MergeInsert. Phase 1 inserts the row at `status='pending'`; +-- phase 2 issues the Lance write; phase 3 flips the row to +-- `status='committed'`. `search` joins back through this table with +-- `WHERE status='committed'` so partial-write Lance rows never surface +-- to callers, and a crashed phase 2 retry simply re-runs against the +-- still-pending row (Lance MergeInsert dedupes on `chunk_id`). +-- +-- The third state, `tombstone`, is reserved for the deletion pipeline: +-- when a chunk row goes away, the matching Lance row should also be +-- garbage-collected, but the GC scheduler is out of P3-3 scope. 
The +-- BEFORE DELETE trigger below stages the marker so a future GC has a +-- well-defined claim; see the comment block on the trigger for why +-- it currently coexists with V001's `ON DELETE CASCADE` FK rather than +-- replacing it. + +ALTER TABLE embedding_records ADD COLUMN status TEXT NOT NULL DEFAULT 'pending' + CHECK (status IN ('pending','committed','tombstone')); +-- NOTE(review): `vector_committed` starts at 0; presumably it is flipped to 1 once the Lance write has landed — confirm against the store code. +ALTER TABLE embedding_records ADD COLUMN vector_committed INTEGER NOT NULL DEFAULT 0; + +CREATE INDEX idx_embed_status ON embedding_records(status); + +-- Tombstone trigger. +-- +-- Intent: when a `chunks` row is about to be deleted, mark its +-- dependent `embedding_records` rows as `status='tombstone'` so a later +-- GC pass can drop the matching Lance rows in lockstep. +-- +-- Caveat (carried into a future migration): V001 declared the FK as +-- `chunk_id REFERENCES chunks(chunk_id) ON DELETE CASCADE`. SQLite's +-- documented order is "BEFORE-DELETE trigger fires first, then CASCADE +-- runs", so this UPDATE will land a `tombstone` value that is +-- immediately followed by the CASCADE removing the row. The trigger is +-- therefore best-effort under the current FK; the only path that +-- actually preserves the tombstone is to drop the CASCADE (table +-- recreation, since SQLite has no DROP CONSTRAINT) — that is queued +-- for a P+ migration once the GC scheduler exists and we have actual +-- production rows to migrate. Keeping the trigger here documents the +-- design intent and gives the deletion-pipeline observer a stable hook +-- to wire into. +CREATE TRIGGER chunks_bd_tombstone_embeddings BEFORE DELETE ON chunks BEGIN + UPDATE embedding_records SET status='tombstone' WHERE chunk_id = old.chunk_id; +END;