diff --git a/.gitignore b/.gitignore index 7a95436..32ef371 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ .superpowers/ +/target/ +**/*.rs.bk +Cargo.lock.bak diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..9d47bea --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,937 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version 
= "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "cc" +version = "1.2.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "kb-app" +version = "0.1.0" +dependencies = 
[ + "anyhow", + "dirs", + "kb-config", + "kb-core", + "serde", + "serde_json", + "thiserror 2.0.18", + "toml", + "tracing", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "kb-cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "kb-app", + "kb-config", + "kb-core", + "serde_json", +] + +[[package]] +name = "kb-config" +version = "0.1.0" +dependencies = [ + "anyhow", + "dirs", + "kb-core", + "serde", + "serde_json", + "thiserror 2.0.18", + "toml", +] + +[[package]] +name = "kb-core" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "serde", + "serde_json", + "serde_json_canonicalizer", + "thiserror 2.0.18", + "time", + "unicode-normalization", +] + +[[package]] +name = "kb-parse-types" +version = "0.1.0" +dependencies = [ + "kb-core", + "serde", + "thiserror 2.0.18", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libredox" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +dependencies = [ + "libc", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom", + "libredox", + "thiserror 1.0.69", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "ryu-js" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_json_canonicalizer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe52319a927259afbfa5180c5157cd8167edfd3e8c254f9558c7fef44c5649f2" +dependencies = [ + "ryu-js", + "serde", + "serde_json", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "symlink" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = 
"1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca" + +[[package]] +name = "time-macros" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name 
= "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" +dependencies = [ + "crossbeam-channel", + "symlink", + 
"thiserror 2.0.18", + "time", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] 
+name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ce933c2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +[workspace] +resolver = "3" +members = [ + "crates/kb-core", + "crates/kb-parse-types", + "crates/kb-config", + "crates/kb-app", + "crates/kb-cli", +] + +[workspace.package] +edition = "2024" +rust-version = "1.85" +license = "MIT OR Apache-2.0" +repository = 
"https://github.com/altair823/kb" +version = "0.1.0" + +[workspace.dependencies] +anyhow = "1" +thiserror = "2" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] } +uuid = { version = "1", features = ["v7", "serde"] } +blake3 = "1" +tracing = "0.1" diff --git a/crates/kb-core/Cargo.toml b/crates/kb-core/Cargo.toml new file mode 100644 index 0000000..795aad3 --- /dev/null +++ b/crates/kb-core/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "kb-core" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "kb domain types, traits, and ID recipe (no other kb-* deps)" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +blake3 = { workspace = true } +serde_json_canonicalizer = "0.3" +unicode-normalization = "0.1" diff --git a/crates/kb-core/src/answer.rs b/crates/kb-core/src/answer.rs new file mode 100644 index 0000000..bbf6007 --- /dev/null +++ b/crates/kb-core/src/answer.rs @@ -0,0 +1,66 @@ +//! Answer + RAG types (§3.8). 
+
+use serde::{Deserialize, Serialize};
+use time::OffsetDateTime;
+
+use crate::citation::Citation;
+use crate::search::SearchMode;
+use crate::versions::PromptTemplateVersion;
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Answer {
+    pub answer: String,
+    pub citations: Vec<AnswerCitation>, // NOTE(review): element type reconstructed — confirm
+    pub grounded: bool,
+    pub refusal_reason: Option<RefusalReason>, // NOTE(review): type reconstructed — confirm
+    pub model: ModelRef,
+    pub embedding: Option<ModelRef>, // NOTE(review): type reconstructed — confirm
+    pub prompt_template_version: PromptTemplateVersion,
+    pub retrieval: AnswerRetrievalSummary,
+    pub usage: TokenUsage,
+    #[serde(with = "time::serde::rfc3339")] // RFC 3339 string on the wire
+    pub created_at: OffsetDateTime,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct AnswerCitation {
+    pub marker: Option, // FIXME(review): generic argument lost; not inferable from this file
+    pub citation: Citation,
+}
+
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")] // e.g. `LlmSelfJudge` serializes as "llm_self_judge"
+pub enum RefusalReason {
+    ScoreGate,
+    LlmSelfJudge,
+    NoIndex,
+    NoChunks,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct ModelRef {
+    pub id: String,
+    pub provider: String,
+    pub dimensions: Option, // FIXME(review): generic argument lost; not inferable from this file
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct AnswerRetrievalSummary {
+    pub trace_id: TraceId,
+    pub mode: SearchMode,
+    pub k: usize,
+    pub score_gate: f32,
+    pub top_score: f32,
+    pub chunks_returned: u32,
+    pub chunks_used: u32,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct TokenUsage {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub latency_ms: u32,
+}
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct TraceId(pub String);
diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs
new file mode 100644
index 0000000..f25e7ba
--- /dev/null
+++ b/crates/kb-core/src/asset.rs
@@ -0,0 +1,42 @@
+//! Raw asset, source URI, workspace path (§3.3).
+
+use std::path::PathBuf;
+
+use serde::{Deserialize, Serialize};
+use time::OffsetDateTime;
+
+use crate::ids::AssetId;
+use crate::media::{Checksum, MediaType};
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", tag = "kind", content = "value")] // adjacently tagged
+pub enum SourceUri {
+    File(PathBuf), // payload is carried under "value", variant name under "kind"
+    /// `kb://` virtual reference.
+    Kb(String),
+}
+
+/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
+/// produced via `crate::normalize::to_posix`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct WorkspacePath(pub String); // NOTE(review): `pub` field, so not enforced
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", tag = "kind")] // internally tagged
+pub enum AssetStorage {
+    Copied { path: PathBuf }, // NOTE(review): presumably a copy kept at `path` — confirm
+    Reference { path: PathBuf, sha: Checksum }, // NOTE(review): presumably left in place — confirm
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct RawAsset {
+    pub asset_id: AssetId,
+    pub source_uri: SourceUri,
+    pub workspace_path: WorkspacePath,
+    pub media_type: MediaType,
+    pub byte_len: u64,
+    pub checksum: Checksum,
+    #[serde(with = "time::serde::rfc3339")] // RFC 3339 string on the wire
+    pub discovered_at: OffsetDateTime,
+    pub stored: AssetStorage,
+}
diff --git a/crates/kb-core/src/chunk.rs b/crates/kb-core/src/chunk.rs
new file mode 100644
index 0000000..1c3b0aa
--- /dev/null
+++ b/crates/kb-core/src/chunk.rs
@@ -0,0 +1,19 @@
+//! Chunk (§3.5).
+
+use serde::{Deserialize, Serialize};
+
+use crate::document::SourceSpan;
+use crate::ids::{BlockId, ChunkId, DocumentId};
+use crate::versions::ChunkerVersion;
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Chunk {
+    pub chunk_id: ChunkId,
+    pub doc_id: DocumentId,
+    pub block_ids: Vec<BlockId>,
+    pub text: String,
+    pub heading_path: Vec<String>, // NOTE(review): element type reconstructed — confirm
+    pub source_spans: Vec<SourceSpan>,
+    pub token_estimate: usize,
+    pub chunker_version: ChunkerVersion,
+}
diff --git a/crates/kb-core/src/citation.rs b/crates/kb-core/src/citation.rs
new file mode 100644
index 0000000..13299bd
--- /dev/null
+++ b/crates/kb-core/src/citation.rs
@@ -0,0 +1,344 @@
+//! Citation (§3.5) — discriminated 5-variant. Each variant has a canonical
+//! W3C Media Fragments URI per design §0 Q3.
+
+use anyhow::{Result, bail};
+use serde::{Deserialize, Serialize};
+
+use crate::asset::WorkspacePath;
+
+/// A reference into a workspace file at line, page, region, caption, or
+/// time-range granularity.
+///
+/// Serde representation: internally tagged, with the lowercase variant name
+/// under `"kind"`.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", tag = "kind")]
+pub enum Citation {
+    /// Line or line range; URI form `#L{start}` when `start == end`, else `#L{start}-L{end}`.
+    Line {
+        path: WorkspacePath,
+        start: u32,
+        end: u32,
+        section: Option<String>, // NOTE(review): type reconstructed — confirm
+    },
+    /// Single page; URI form `#p={page}`.
+    Page {
+        path: WorkspacePath,
+        page: u32,
+        section: Option<String>, // NOTE(review): type reconstructed — confirm
+    },
+    /// Rectangle; URI form `#xywh={x},{y},{w},{h}`.
+    Region {
+        path: WorkspacePath,
+        x: u32,
+        y: u32,
+        w: u32,
+        h: u32,
+    },
+    /// Caption; URI form `#caption` (`model` is not encoded).
+    Caption {
+        path: WorkspacePath,
+        model: String,
+    },
+    /// Time range in milliseconds; URI form `#t={start},{end}` plus optional `&speaker={name}`.
+    Time {
+        path: WorkspacePath,
+        start_ms: u64,
+        end_ms: u64,
+        speaker: Option<String>,
+    },
+}
+
+impl Citation {
+    /// Returns the workspace path, which every variant carries.
+    pub fn path(&self) -> &WorkspacePath {
+        match self {
+            Citation::Line { path, .. }
+            | Citation::Page { path, .. }
+            | Citation::Region { path, .. }
+            | Citation::Caption { path, .. }
+            | Citation::Time { path, .. } => path,
+        }
+    }
+
+    /// Emit a W3C Media Fragments URI per design §0 Q3.
+    ///
+    /// `section` and `caption.model` are NOT part of the URI fragment; they
+    /// live in the structured wire object. A `Time` citation's `speaker`,
+    /// when present, is appended verbatim as `&speaker={name}`.
+    ///
+    /// NOTE(review): the speaker is not escaped, so a name containing `#`
+    /// will not survive [`Citation::parse`], which splits on the last `#`.
+    pub fn to_uri(&self) -> String {
+        match self {
+            Citation::Line { path, start, end, .. } => {
+                if start == end {
+                    format!("{}#L{}", path.0, start)
+                } else {
+                    format!("{}#L{}-L{}", path.0, start, end)
+                }
+            }
+            Citation::Page { path, page, .. } => format!("{}#p={}", path.0, page),
+            Citation::Region {
+                path, x, y, w, h, ..
+            } => format!("{}#xywh={},{},{},{}", path.0, x, y, w, h),
+            Citation::Caption { path, .. } => format!("{}#caption", path.0),
+            Citation::Time {
+                path,
+                start_ms,
+                end_ms,
+                speaker,
+            } => {
+                let s = format_hms_ms(*start_ms);
+                let e = format_hms_ms(*end_ms);
+                match speaker {
+                    Some(sp) => format!("{}#t={},{}&speaker={}", path.0, s, e, sp),
+                    None => format!("{}#t={},{}", path.0, s, e),
+                }
+            }
+        }
+    }
+
+    /// Strict inverse of `to_uri`. The `section` / `caption.model` fields
+    /// are not part of the URI grammar, so a parsed Citation will have
+    /// `section = None` and `model = ""` for the relevant variants.
+    /// Round-trip property holds for citations whose non-URI fields are at
+    /// their default values (see test).
+    ///
+    /// # Errors
+    ///
+    /// Fails when `s` has no `#`, when the fragment matches none of the five
+    /// forms, or when the matched form is malformed (non-numeric field, wrong
+    /// `xywh` arity, unknown time parameter, or a bad timestamp).
+    pub fn parse(s: &str) -> Result<Self> {
+        // Split on the LAST '#' so a '#' inside the path stays in the path.
+        let (path_str, frag) = match s.rsplit_once('#') {
+            Some(t) => t,
+            None => bail!("citation has no '#' fragment: {s:?}"),
+        };
+        let path = WorkspacePath(path_str.to_owned());
+
+        if let Some(rest) = frag.strip_prefix("L") {
+            // line range: `L{n}` or `L{a}-L{b}`
+            if let Some((a, b)) = rest.split_once("-L") {
+                let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
+                let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
+                return Ok(Citation::Line {
+                    path,
+                    start,
+                    end,
+                    section: None,
+                });
+            }
+            let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
+            return Ok(Citation::Line {
+                path,
+                start: n,
+                end: n,
+                section: None,
+            });
+        }
+        if let Some(rest) = frag.strip_prefix("p=") {
+            let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
+            return Ok(Citation::Page {
+                path,
+                page,
+                section: None,
+            });
+        }
+        if let Some(rest) = frag.strip_prefix("xywh=") {
+            let parts: Vec<&str> = rest.split(',').collect();
+            // Destructure instead of indexing so the arity check and the field
+            // access cannot drift apart.
+            let &[x, y, w, h] = parts.as_slice() else {
+                bail!("xywh= expects 4 comma-separated values: {rest:?}");
+            };
+            let x: u32 = x.parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
+            let y: u32 = y.parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
+            let w: u32 = w.parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
+            let h: u32 = h.parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
+            return Ok(Citation::Region { path, x, y, w, h });
+        }
+        if frag == "caption" {
+            return Ok(Citation::Caption {
+                path,
+                model: String::new(),
+            });
+        }
+        if let Some(rest) = frag.strip_prefix("t=") {
+            // `t={start},{end}` optionally followed by `&speaker={name}`
+            let (range, speaker) = match rest.split_once('&') {
+                Some((r, kv)) => match kv.strip_prefix("speaker=") {
+                    Some(sp) => (r, Some(sp.to_owned())),
+                    None => bail!("unknown time-fragment param: {kv:?}"),
+                },
+                None => (rest, None),
+            };
+            let (s_str, e_str) = match range.split_once(',') {
+                Some(t) => t,
+                None => bail!("time fragment expects ',': {range:?}"),
+            };
+            let start_ms = parse_hms_ms(s_str)?;
+            let end_ms = parse_hms_ms(e_str)?;
+            return Ok(Citation::Time {
+                path,
+                start_ms,
+                end_ms,
+                speaker,
+            });
+        }
+        bail!("unrecognised citation fragment: {frag:?}")
+    }
+}
+
+/// Format milliseconds as `hh:mm:ss.mmm` (W3C Media Fragments NPT-with-ms).
+///
+/// Hours are zero-padded to two digits but not capped, so values of 100 hours
+/// or more widen the first field.
+fn format_hms_ms(ms: u64) -> String {
+    let hours = ms / 3_600_000;
+    let minutes = (ms % 3_600_000) / 60_000;
+    let seconds = (ms % 60_000) / 1000;
+    let millis = ms % 1000;
+    format!("{hours:02}:{minutes:02}:{seconds:02}.{millis:03}")
+}
+
+fn parse_hms_ms(s: &str) -> Result<u64> {
+    // Accept `hh:mm:ss.mmm` (the form we emit). Reject malformed input.
+ let parts: Vec<&str> = s.split(':').collect(); + if parts.len() != 3 { + bail!("time component expects hh:mm:ss.mmm, got {s:?}"); + } + let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?; + let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?; + let (sec, ms) = match parts[2].split_once('.') { + Some((s_part, ms_part)) => { + let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?; + // Pad/truncate to exactly 3 digits. + let mut ms_str = ms_part.to_owned(); + while ms_str.len() < 3 { + ms_str.push('0'); + } + ms_str.truncate(3); + let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?; + (sec, ms) + } + None => { + let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?; + (sec, 0) + } + }; + Ok(h * 3_600_000 + m * 60_000 + sec * 1000 + ms) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn p(s: &str) -> WorkspacePath { + WorkspacePath(s.to_owned()) + } + + #[test] + fn line_range_uri_and_roundtrip() { + let c = Citation::Line { + path: p("notes/rust/kb.md"), + start: 12, + end: 34, + section: None, + }; + assert_eq!(c.to_uri(), "notes/rust/kb.md#L12-L34"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn line_single_uri_and_roundtrip() { + let c = Citation::Line { + path: p("a/b.md"), + start: 7, + end: 7, + section: None, + }; + assert_eq!(c.to_uri(), "a/b.md#L7"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn page_uri_and_roundtrip() { + let c = Citation::Page { + path: p("papers/book.pdf"), + page: 23, + section: None, + }; + assert_eq!(c.to_uri(), "papers/book.pdf#p=23"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn region_uri_and_roundtrip() { + let c = Citation::Region { + path: p("photos/x.png"), + x: 120, + y: 40, + w: 520, + h: 180, + }; + assert_eq!(c.to_uri(), 
"photos/x.png#xywh=120,40,520,180"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn caption_uri_and_roundtrip() { + let c = Citation::Caption { + path: p("photos/x.png"), + // `model` is not in the URI grammar; round-trip fills it with "". + model: String::new(), + }; + assert_eq!(c.to_uri(), "photos/x.png#caption"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn time_uri_and_roundtrip_with_speaker() { + let c = Citation::Time { + path: p("recordings/r.m4a"), + start_ms: 822_000, + end_ms: 850_000, + speaker: Some("S1".to_string()), + }; + assert_eq!( + c.to_uri(), + "recordings/r.m4a#t=00:13:42.000,00:14:10.000&speaker=S1" + ); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn time_uri_and_roundtrip_without_speaker() { + let c = Citation::Time { + path: p("recordings/r.m4a"), + start_ms: 1_500, + end_ms: 2_750, + speaker: None, + }; + assert_eq!(c.to_uri(), "recordings/r.m4a#t=00:00:01.500,00:00:02.750"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn parse_rejects_no_fragment() { + assert!(Citation::parse("just/path.md").is_err()); + } + + #[test] + fn parse_rejects_unknown_fragment() { + assert!(Citation::parse("a.md#mystery=1").is_err()); + } +} diff --git a/crates/kb-core/src/document.rs b/crates/kb-core/src/document.rs new file mode 100644 index 0000000..e0bb295 --- /dev/null +++ b/crates/kb-core/src/document.rs @@ -0,0 +1,177 @@ +//! CanonicalDocument, Block, SourceSpan, Inline, plus the forward-declared +//! OCR / caption / transcript stubs (§3.4 + §3.7a). 
+ +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::ids::{AssetId, BlockId, DocumentId}; +use crate::media::Lang; +use crate::metadata::{Metadata, Provenance}; +use crate::versions::ParserVersion; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CanonicalDocument { + pub doc_id: DocumentId, + pub source_asset_id: AssetId, + pub workspace_path: WorkspacePath, + pub title: String, + pub lang: Lang, + pub blocks: Vec, + pub metadata: Metadata, + pub provenance: Provenance, + pub parser_version: ParserVersion, + pub schema_version: u32, + pub doc_version: u32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum Block { + Heading(HeadingBlock), + Paragraph(TextBlock), + List(ListBlock), + Code(CodeBlock), + Table(TableBlock), + Quote(TextBlock), + ImageRef(ImageRefBlock), + AudioRef(AudioRefBlock), +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CommonBlock { + pub block_id: BlockId, + pub heading_path: Vec, + pub source_span: SourceSpan, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct HeadingBlock { + pub common: CommonBlock, + pub level: u8, + pub text: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TextBlock { + pub common: CommonBlock, + pub text: String, + pub inlines: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ListBlock { + pub common: CommonBlock, + pub ordered: bool, + pub items: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CodeBlock { + pub common: CommonBlock, + pub lang: Option, + pub code: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TableBlock { + pub common: CommonBlock, + pub headers: Vec, + pub rows: Vec>, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ImageRefBlock { + pub common: 
CommonBlock, + pub asset_id: Option, + pub src: String, + pub alt: String, + pub ocr: Option, + pub caption: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AudioRefBlock { + pub common: CommonBlock, + pub asset_id: AssetId, + pub duration_ms: u64, + pub transcript: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum Inline { + Text(String), + Code(String), + Link { text: String, href: String }, + Strong(Vec), + Emph(Vec), +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum SourceSpan { + Line { + start: u32, + end: u32, + }, + Byte { + start: u64, + end: u64, + }, + Page { + page: u32, + char_start: Option, + char_end: Option, + }, + Region { + x: u32, + y: u32, + w: u32, + h: u32, + }, + Time { + start_ms: u64, + end_ms: u64, + }, +} + +// ── Forward-declared stubs (§3.7a). Bodies are final per design. 
──────── + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct OcrText { + pub joined: String, + pub regions: Vec, + pub engine: String, + pub engine_version: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct OcrRegion { + pub bbox: (u32, u32, u32, u32), + pub text: String, + pub confidence: f32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ModelCaption { + pub text: String, + pub model: String, + pub model_version: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Transcript { + pub segments: Vec, + pub engine: String, + pub engine_version: String, + pub language: Lang, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TranscriptSegment { + pub start_ms: u64, + pub end_ms: u64, + pub text: String, + pub speaker: Option, + pub confidence: Option, +} diff --git a/crates/kb-core/src/errors.rs b/crates/kb-core/src/errors.rs new file mode 100644 index 0000000..cb2da46 --- /dev/null +++ b/crates/kb-core/src/errors.rs @@ -0,0 +1,15 @@ +//! `CoreError` (§10). + +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum CoreError { + #[error("invalid id: {0}")] + InvalidId(String), + #[error("invalid citation: {0}")] + InvalidCitation(String), + #[error("invalid source span: {0}")] + InvalidSpan(String), + #[error("malformed input: {0}")] + Malformed(String), +} diff --git a/crates/kb-core/src/ids.rs b/crates/kb-core/src/ids.rs new file mode 100644 index 0000000..75b6624 --- /dev/null +++ b/crates/kb-core/src/ids.rs @@ -0,0 +1,303 @@ +//! Newtype IDs (§3.1) + ID generation recipe (§4.2). +//! +//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the +//! inner hex string; `FromStr` rejects strings that are not exactly 32 +//! lowercase hex characters. 
+ +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::document::SourceSpan; +use crate::errors::CoreError; +use crate::versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, + ParserVersion, +}; + +macro_rules! newtype_id { + ($name:ident) => { + #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] + pub struct $name(pub String); + + impl fmt::Display for $name { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } + } + + impl FromStr for $name { + type Err = CoreError; + fn from_str(s: &str) -> Result { + validate_hex32(s).map(|()| Self(s.to_owned())) + } + } + }; +} + +newtype_id!(AssetId); +newtype_id!(DocumentId); +newtype_id!(BlockId); +newtype_id!(ChunkId); +newtype_id!(EmbeddingId); +newtype_id!(IndexId); + +fn validate_hex32(s: &str) -> Result<(), CoreError> { + if s.len() != 32 { + return Err(CoreError::InvalidId(format!( + "expected 32 hex chars, got {}", + s.len() + ))); + } + if !s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f')) { + return Err(CoreError::InvalidId(format!( + "non-lowercase-hex character in {s:?}" + ))); + } + Ok(()) +} + +/// Canonical-JSON + blake3 + hex prefix 32. Per design §4.2. +pub fn id_from(tuple: T) -> String { + let bytes = serde_json_canonicalizer::to_vec(&tuple) + .expect("canonical JSON serialization must not fail for kb-core inputs"); + // The crate exposes `to_vec` for `T: Serialize` returning `Vec`. 
+ let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..32].to_string() +} + +#[derive(Serialize)] +struct AssetTuple<'a> { + kind: &'static str, + asset_blake3: &'a str, +} + +#[derive(Serialize)] +struct DocTuple<'a> { + kind: &'static str, + workspace_path: &'a str, + asset_id: &'a str, + parser_version: &'a str, +} + +#[derive(Serialize)] +struct BlockTuple<'a> { + kind: &'static str, + doc_id: &'a str, + block_kind: &'a str, + heading_path: &'a [String], + ordinal: u32, + source_span: &'a SourceSpan, +} + +#[derive(Serialize)] +struct ChunkTuple<'a> { + kind: &'static str, + doc_id: &'a str, + chunker_version: &'a str, + block_ids: Vec<&'a str>, + policy_hash: &'a str, +} + +#[derive(Serialize)] +struct EmbeddingTuple<'a> { + kind: &'static str, + chunk_id: &'a str, + model_id: &'a str, + model_version: &'a str, + dimensions: usize, +} + +#[derive(Serialize)] +struct IndexTuple<'a> { + kind: &'static str, + collection: &'a str, + embedding_model: &'a str, + dimensions: usize, + index_version: &'a str, + index_kind: &'a str, + index_params_hash: &'a str, +} + +pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId { + AssetId(id_from(AssetTuple { + kind: "asset", + asset_blake3: asset_blake3_full_hex, + })) +} + +pub fn id_for_doc( + workspace_path: &WorkspacePath, + asset: &AssetId, + parser_version: &ParserVersion, +) -> DocumentId { + DocumentId(id_from(DocTuple { + kind: "doc", + workspace_path: &workspace_path.0, + asset_id: &asset.0, + parser_version: &parser_version.0, + })) +} + +pub fn id_for_block( + doc: &DocumentId, + block_kind: &str, + heading_path: &[String], + ordinal: u32, + span: &SourceSpan, +) -> BlockId { + BlockId(id_from(BlockTuple { + kind: "block", + doc_id: &doc.0, + block_kind, + heading_path, + ordinal, + source_span: span, + })) +} + +pub fn id_for_chunk( + doc: &DocumentId, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + policy_hash: &str, +) -> ChunkId { + ChunkId(id_from(ChunkTuple { + kind: "chunk", + 
doc_id: &doc.0, + chunker_version: &chunker_version.0, + block_ids: block_ids.iter().map(|b| b.0.as_str()).collect(), + policy_hash, + })) +} + +pub fn id_for_embedding( + chunk: &ChunkId, + model: &EmbeddingModelId, + version: &EmbeddingVersion, + dims: usize, +) -> EmbeddingId { + EmbeddingId(id_from(EmbeddingTuple { + kind: "embedding", + chunk_id: &chunk.0, + model_id: &model.0, + model_version: &version.0, + dimensions: dims, + })) +} + +pub fn id_for_index( + collection: &str, + model: &EmbeddingModelId, + dims: usize, + version: &IndexVersion, + kind: &str, + params_hash: &str, +) -> IndexId { + IndexId(id_from(IndexTuple { + kind: "index", + collection, + embedding_model: &model.0, + dimensions: dims, + index_version: &version.0, + index_kind: kind, + index_params_hash: params_hash, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn newtype_display_roundtrip() { + let s = "0123456789abcdef0123456789abcdef"; + let id: AssetId = s.parse().unwrap(); + assert_eq!(id.to_string(), s); + } + + #[test] + fn newtype_rejects_short() { + let r: Result = "abc".parse(); + assert!(r.is_err()); + } + + #[test] + fn newtype_rejects_non_hex() { + let r: Result = "ZZZ456789abcdef0123456789abcdef0".parse(); + assert!(r.is_err()); + } + + #[test] + fn newtype_rejects_uppercase() { + let r: Result = "0123456789ABCDEF0123456789ABCDEF".parse(); + assert!(r.is_err()); + } + + /// Determinism: 1000 runs of `id_from` over the same input yield the same + /// hex. + #[test] + fn id_from_deterministic_1000() { + #[derive(Serialize)] + struct T<'a> { + a: u32, + b: &'a str, + } + let input = T { a: 7, b: "hello" }; + let first = id_from(&input); + for _ in 0..1000 { + assert_eq!(id_from(&input), first); + } + assert_eq!(first.len(), 32); + } + + /// Key order in the source struct does not affect hash (canonical JSON + /// sorts keys alphabetically). 
+ #[test] + fn id_from_key_order_invariant() { + #[derive(Serialize)] + struct A { + a: u32, + b: u32, + } + #[derive(Serialize)] + struct B { + b: u32, + a: u32, + } + assert_eq!(id_from(A { a: 1, b: 2 }), id_from(B { b: 2, a: 1 })); + } + + /// The expected hex below is hand-computed via design §4.2: + /// tuple = { "kind": "asset", "asset_blake3": "deadbeef" } + /// canonical JSON (key-sorted, no whitespace, both keys are pure ASCII): + /// {"asset_blake3":"deadbeef","kind":"asset"} + /// blake3 of those bytes → hex → first 32 chars. + /// Pinned via an independent tool (b3sum, computed once outside the code + /// under test) so a regression in our JCS or hash pipeline is caught. + #[test] + fn id_for_asset_pinned() { + // printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum + // → cec9353553efb238a7919d38d3e148f1... + let id = id_for_asset("deadbeef"); + assert_eq!(id.0, "cec9353553efb238a7919d38d3e148f1"); + } + + /// Independent pin for id_for_doc. + /// canonical JSON: + /// {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d", + /// "kind":"doc", + /// "parser_version":"pulldown-cmark-0.x", + /// "workspace_path":"notes/test.md"} + /// (concatenated, no whitespace). + #[test] + fn id_for_doc_pinned() { + let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string()); + let path = WorkspacePath("notes/test.md".to_string()); + let pv = ParserVersion("pulldown-cmark-0.x".to_string()); + let id = id_for_doc(&path, &asset, &pv); + assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db"); + } +} diff --git a/crates/kb-core/src/ingest.rs b/crates/kb-core/src/ingest.rs new file mode 100644 index 0000000..7636a95 --- /dev/null +++ b/crates/kb-core/src/ingest.rs @@ -0,0 +1,45 @@ +//! IngestReport + IngestItem (mirrored from wire §2.4). 
+ +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::ids::{AssetId, DocumentId}; +use crate::traits::SourceScope; +use crate::versions::{ChunkerVersion, ParserVersion}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IngestReport { + pub scope: SourceScope, + pub scanned: u32, + pub new: u32, + pub updated: u32, + pub skipped: u32, + pub errors: u32, + pub duration_ms: u32, + /// `None` ↔ wire `items: null` (`--summary-only`). + pub items: Option>, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IngestItem { + pub kind: IngestItemKind, + pub doc_id: Option, + pub doc_path: WorkspacePath, + pub asset_id: Option, + pub byte_len: Option, + pub block_count: Option, + pub chunk_count: Option, + pub parser_version: Option, + pub chunker_version: Option, + pub warnings: Vec, + pub error: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum IngestItemKind { + New, + Updated, + Skipped, + Error, +} diff --git a/crates/kb-core/src/jobs.rs b/crates/kb-core/src/jobs.rs new file mode 100644 index 0000000..8b6231e --- /dev/null +++ b/crates/kb-core/src/jobs.rs @@ -0,0 +1,52 @@ +//! Job repo support types (§3.7a forward-decl, §7.2 JobRepo). 
+ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::OffsetDateTime; + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum JobKind { + Ingest, + Chunk, + Embed, + Ocr, + Transcribe, + Reindex, + Doctor, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum JobStatus { + Pending, + Running, + Succeeded, + Failed, + Canceled, +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct JobId(pub String); + +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct JobFilter { + pub status: Option, + pub kind: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct JobRow { + pub job_id: JobId, + pub kind: JobKind, + pub status: JobStatus, + pub payload: Value, + pub progress: Option, + pub error: Option, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + #[serde(default, with = "time::serde::rfc3339::option")] + pub finished_at: Option, +} diff --git a/crates/kb-core/src/lib.rs b/crates/kb-core/src/lib.rs new file mode 100644 index 0000000..0ed39c3 --- /dev/null +++ b/crates/kb-core/src/lib.rs @@ -0,0 +1,70 @@ +//! `kb-core` — frozen domain types, traits, and ID recipe. +//! +//! Per design §3, §4, §7. This crate has zero dependencies on any other +//! `kb-*` crate, so every other crate in the workspace can depend on it +//! freely. +//! +//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` for +//! the canonical type bodies — this crate is the byte-for-byte mirror. 
+ +pub mod ids; +pub mod versions; +pub mod media; +pub mod asset; +pub mod document; +pub mod chunk; +pub mod citation; +pub mod metadata; +pub mod search; +pub mod answer; +pub mod ingest; +pub mod jobs; +pub mod vector; +pub mod errors; +pub mod traits; +pub mod normalize; + +// Re-export the most commonly used items at the crate root, mirroring the +// public surface listed in the task spec. + +pub use ids::{ + AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId, + id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding, + id_for_index, id_from, +}; +pub use versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, + ParserVersion, PromptTemplateVersion, SchemaVersion, +}; +pub use media::{AudioType, Checksum, ImageType, Lang, MediaType}; +pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath}; +pub use document::{ + AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock, + HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion, + OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment, +}; +pub use chunk::Chunk; +pub use citation::Citation; +pub use metadata::{ + Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType, + TrustLevel, +}; +pub use search::{ + DocFilter, DocSummary, RetrievalDetail, SearchFilters, SearchHit, + SearchMode, SearchQuery, +}; +pub use answer::{ + Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason, + TokenUsage, TraceId, +}; +pub use ingest::{IngestItem, IngestReport}; +pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; +pub use vector::{VectorHit, VectorRecord}; +pub use errors::CoreError; +pub use traits::{ + ChunkPolicy, Chunker, DocumentStore, Embedder, EmbeddingInput, + EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason, + GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector, + SourceScope, TokenChunk, VectorStore, +}; +pub use normalize::{nfc, to_posix}; 
diff --git a/crates/kb-core/src/media.rs b/crates/kb-core/src/media.rs new file mode 100644 index 0000000..263e5cf --- /dev/null +++ b/crates/kb-core/src/media.rs @@ -0,0 +1,44 @@ +//! Media / file-type primitives (§3.3 + §3.7a). + +use serde::{Deserialize, Serialize}; + +/// Full blake3 hex (64 chars) per §3.7a. Stored as `String` for serde +/// simplicity. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct Checksum(pub String); + +/// BCP-47 / ISO-639 language tag (e.g. "ko", "en"). §3.7a. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct Lang(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ImageType { + Png, + Jpeg, + Webp, + Gif, + Tiff, + Other(String), +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum AudioType { + M4a, + Mp3, + Wav, + Flac, + Ogg, + Other(String), +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum MediaType { + Markdown, + Pdf, + Image(ImageType), + Audio(AudioType), + Other(String), +} diff --git a/crates/kb-core/src/metadata.rs b/crates/kb-core/src/metadata.rs new file mode 100644 index 0000000..229ee0d --- /dev/null +++ b/crates/kb-core/src/metadata.rs @@ -0,0 +1,68 @@ +//! Metadata + Provenance (§3.6). + +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use time::OffsetDateTime; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Metadata { + pub aliases: Vec, + pub tags: Vec, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + pub source_type: SourceType, + pub trust_level: TrustLevel, + pub user_id_alias: Option, + /// Frontmatter keys we don't recognise are preserved here per §0 Q9. 
+ pub user: Map, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum SourceType { + Markdown, + Note, + Paper, + Reference, + Inbox, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TrustLevel { + Primary, + Secondary, + Generated, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Provenance { + pub events: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ProvenanceEvent { + #[serde(with = "time::serde::rfc3339")] + pub at: OffsetDateTime, + pub agent: String, + pub kind: ProvenanceKind, + pub note: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ProvenanceKind { + Discovered, + Parsed, + Normalized, + Chunked, + OcrApplied, + CaptionApplied, + Transcribed, + Embedded, + Indexed, + Warning, + Error, +} diff --git a/crates/kb-core/src/normalize.rs b/crates/kb-core/src/normalize.rs new file mode 100644 index 0000000..ec39c45 --- /dev/null +++ b/crates/kb-core/src/normalize.rs @@ -0,0 +1,86 @@ +//! Path / string normalization helpers (§4.1, §6.6). + +use std::path::{Component, Path}; + +use unicode_normalization::UnicodeNormalization; + +use crate::asset::WorkspacePath; + +/// NFC-normalize a UTF-8 string (§4.1). 
+pub fn nfc(input: &str) -> String { + input.nfc().collect() +} + +/// Collapse a path to a POSIX-relative `WorkspacePath` per §6.6: +/// - convert all separators to `/` +/// - strip a leading `./` +/// - collapse repeated slashes +/// - NFC-normalize +pub fn to_posix(path: &Path) -> WorkspacePath { + let mut out = String::new(); + let mut first = true; + for comp in path.components() { + match comp { + Component::CurDir => continue, + Component::Normal(s) => { + if !first { + out.push('/'); + } + out.push_str(&s.to_string_lossy()); + first = false; + } + Component::ParentDir => { + if !first { + out.push('/'); + } + out.push_str(".."); + first = false; + } + Component::RootDir => { + if first { + out.push('/'); + } + first = false; + } + Component::Prefix(_) => { + // Windows drive prefixes — `to_string_lossy` keeps form. + out.push_str(&comp.as_os_str().to_string_lossy()); + first = false; + } + } + } + if out.is_empty() { + out.push_str("."); + } + WorkspacePath(nfc(&out)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn collapses_curdir_and_redundant_slashes() { + let p = Path::new("./a//b.md"); + // `Path::components` already collapses `//` on POSIX; the test + // doc-fixed example asserts the final string is `a/b.md`. + assert_eq!(to_posix(p).0, "a/b.md"); + } + + #[test] + fn nfc_normalizes_korean() { + // U+1100 ㄱ + U+1161 ㅏ (NFD) vs U+AC00 가 (NFC). After NFC they + // collapse to the same string; `to_posix` runs NFC after path + // collapse, so the WorkspacePath comes out NFC regardless of input. 
+ let nfd = "\u{1100}\u{1161}.md"; + let nfc_str = "\u{AC00}.md"; + assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0); + assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md"); + } + + #[test] + fn nfc_function_idempotent() { + let s = "\u{AC00}"; + assert_eq!(nfc(s), s); + } +} diff --git a/crates/kb-core/src/search.rs b/crates/kb-core/src/search.rs new file mode 100644 index 0000000..9621d61 --- /dev/null +++ b/crates/kb-core/src/search.rs @@ -0,0 +1,90 @@ +//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5). + +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use crate::asset::WorkspacePath; +use crate::citation::Citation; +use crate::ids::{ChunkId, DocumentId}; +use crate::media::Lang; +use crate::metadata::{SourceType, TrustLevel}; +use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion}; + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum SearchMode { + Lexical, + Vector, + Hybrid, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SearchQuery { + pub text: String, + pub mode: SearchMode, + pub k: usize, + pub filters: SearchFilters, +} + +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct SearchFilters { + pub tags_any: Vec, + pub lang: Option, + pub path_glob: Option, + pub trust_min: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SearchHit { + pub rank: u32, + pub chunk_id: ChunkId, + pub doc_id: DocumentId, + pub doc_path: WorkspacePath, + pub heading_path: Vec, + pub section_label: Option, + pub snippet: String, + pub citation: Citation, + pub retrieval: RetrievalDetail, + pub index_version: IndexVersion, + pub embedding_model: Option, + pub chunker_version: ChunkerVersion, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RetrievalDetail { + pub method: SearchMode, + pub 
fusion_score: f32, + pub lexical_score: Option, + pub vector_score: Option, + pub lexical_rank: Option, + pub vector_rank: Option, +} + +/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents). +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct DocFilter { + pub tags_any: Vec, + pub lang: Option, + pub path_glob: Option, + pub trust_min: Option, +} + +/// Internal mirror of wire `doc_summary.v1` (§2.5). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DocSummary { + pub doc_id: DocumentId, + pub doc_path: WorkspacePath, + pub title: String, + pub lang: Lang, + pub tags: Vec, + pub trust_level: TrustLevel, + pub source_type: SourceType, + pub byte_len: u64, + pub chunk_count: u32, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + pub parser_version: ParserVersion, + pub chunker_version: ChunkerVersion, +} diff --git a/crates/kb-core/src/traits.rs b/crates/kb-core/src/traits.rs new file mode 100644 index 0000000..dcf2024 --- /dev/null +++ b/crates/kb-core/src/traits.rs @@ -0,0 +1,175 @@ +//! Component traits (§7) and their input helper types (§7.1). 
+
+use std::path::{Path, PathBuf};
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+
+use crate::answer::{ModelRef, TokenUsage};
+use crate::asset::RawAsset;
+use crate::chunk::Chunk;
+use crate::document::{Block, CanonicalDocument};
+use crate::ids::{ChunkId, DocumentId};
+use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
+use crate::media::MediaType;
+use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
+use crate::vector::{VectorHit, VectorRecord};
+use crate::versions::{
+    ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
+};
+
+// ── Helper input types (§7.1) ─────────────────────────────────────────────
+
+/// Input to `SourceConnector::scan`: a root path plus include and exclude
+/// lists.
+///
+/// NOTE(review): the entries of `include` / `exclude` are presumably glob
+/// patterns relative to `root` — confirm against the connector.
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct SourceScope {
+    pub root: PathBuf,
+    pub include: Vec,
+    pub exclude: Vec,
+}
+
+/// Forward-declared (§3.7a) — the concrete shape is decided by extractors.
+/// P0 keeps only the optional config-file slot.
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct ExtractConfig {
+    pub config_path: Option,
+}
+
+/// Per-call context passed to `Extractor::extract` alongside the raw asset
+/// bytes.
+pub struct ExtractContext<'a> { + pub asset: &'a RawAsset, + pub workspace_root: &'a Path, + pub config: &'a ExtractConfig, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ChunkPolicy { + pub target_tokens: usize, + pub overlap_tokens: usize, + pub respect_markdown_headings: bool, + pub chunker_version: ChunkerVersion, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EmbeddingKind { + Document, + Query, +} + +pub struct EmbeddingInput<'a> { + pub text: &'a str, + pub kind: EmbeddingKind, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct GenerateRequest { + pub system: String, + pub user: String, + pub stop: Vec, + pub max_tokens: usize, + pub temperature: f32, + pub seed: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "kind")] +pub enum TokenChunk { + Token(String), + Done { + finish_reason: FinishReason, + usage: TokenUsage, + }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FinishReason { + Stop, + Length, + Aborted, + Error(String), +} + +// ── Traits (§7.2) ───────────────────────────────────────────────────────── + +pub trait SourceConnector { + fn scan(&self, scope: &SourceScope) -> anyhow::Result>; +} + +pub trait Extractor: Send + Sync { + fn supports(&self, media_type: &MediaType) -> bool; + fn parser_version(&self) -> ParserVersion; + fn extract( + &self, + ctx: &ExtractContext<'_>, + bytes: &[u8], + ) -> anyhow::Result; +} + +pub trait Chunker: Send + Sync { + fn chunker_version(&self) -> ChunkerVersion; + fn policy_hash(&self, policy: &ChunkPolicy) -> String; + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result>; +} + +pub trait Embedder: Send + Sync { + fn model_id(&self) -> EmbeddingModelId; + fn model_version(&self) -> EmbeddingVersion; + fn 
dimensions(&self) -> usize; + fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result>>; +} + +pub trait Retriever: Send + Sync { + fn search(&self, query: &SearchQuery) -> anyhow::Result>; + fn index_version(&self) -> IndexVersion; +} + +pub trait LanguageModel: Send + Sync { + fn model_ref(&self) -> ModelRef; + fn context_tokens(&self) -> usize; + fn generate_stream( + &self, + req: GenerateRequest, + ) -> anyhow::Result> + Send>>; +} + +pub trait DocumentStore { + fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>; + fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>; + fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>; + fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>; + fn get_document(&self, id: &DocumentId) -> anyhow::Result>; + fn get_chunk(&self, id: &ChunkId) -> anyhow::Result>; + fn list_documents(&self, filter: &DocFilter) -> anyhow::Result>; +} + +pub trait VectorStore { + fn ensure_table( + &self, + model: &EmbeddingModelId, + dim: usize, + ) -> anyhow::Result; + fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>; + fn search( + &self, + query_vec: &[f32], + k: usize, + filters: &SearchFilters, + ) -> anyhow::Result>; +} + +pub trait JobRepo { + fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result; + fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>; + fn finish( + &self, + id: &JobId, + status: JobStatus, + error: Option<&str>, + ) -> anyhow::Result<()>; + fn list(&self, filter: &JobFilter) -> anyhow::Result>; +} diff --git a/crates/kb-core/src/vector.rs b/crates/kb-core/src/vector.rs new file mode 100644 index 0000000..e17ab7d --- /dev/null +++ b/crates/kb-core/src/vector.rs @@ -0,0 +1,27 @@ +//! Vector store records (§7.2 VectorStore). 
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+
+use crate::ids::{ChunkId, DocumentId, EmbeddingId};
+use crate::versions::{EmbeddingModelId, EmbeddingVersion};
+
+/// One stored embedding: the vector itself plus the chunk/document ids, the
+/// chunk text and heading path, and the model identity that produced it.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct VectorRecord {
+    pub chunk_id: ChunkId,
+    pub embedding_id: EmbeddingId,
+    pub vector: Vec,
+    pub doc_id: DocumentId,
+    pub text: String,
+    pub heading_path: Vec,
+    pub model_id: EmbeddingModelId,
+    pub model_version: EmbeddingVersion,
+    /// Presumably equal to `vector.len()`; nothing here enforces it — confirm.
+    pub dimensions: usize,
+}
+
+/// One vector-search result: the matched chunk, its score, and an
+/// untyped JSON payload.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct VectorHit {
+    pub chunk_id: ChunkId,
+    pub score: f32,
+    pub payload: Value,
+}
diff --git a/crates/kb-core/src/versions.rs b/crates/kb-core/src/versions.rs
new file mode 100644
index 0000000..beda08e
--- /dev/null
+++ b/crates/kb-core/src/versions.rs
@@ -0,0 +1,27 @@
+//! Version / label newtypes (§3.2).
+
+use serde::{Deserialize, Serialize};
+
+// Each type below is a newtype over a public `String`, so different kinds of
+// version label are distinct types.
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct ParserVersion(pub String);
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct ChunkerVersion(pub String);
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct EmbeddingModelId(pub String);
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct EmbeddingVersion(pub String);
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct IndexVersion(pub String);
+
+#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct PromptTemplateVersion(pub String);
+
+/// Wire schema version label (`"answer.v1"`, `"search_hit.v1"`, …).
+/// Carried as a `&'static str` because every wire type pins its label at
+/// compile time.
+///
+/// NOTE(review): because the field is `&'static str`, the derived
+/// `Deserialize` impl should only accept input that lives for `'static`;
+/// deserializing from an owned or short-lived buffer would then not compile.
+/// Confirm that deserialization of this type is actually needed.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+pub struct SchemaVersion(pub &'static str);