diff --git a/.gitignore b/.gitignore index 7a95436..32ef371 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ .superpowers/ +/target/ +**/*.rs.bk +Cargo.lock.bak diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..58f8384 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,934 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version 
= "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "cc" +version = "1.2.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "dirs" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "kb-app" +version = "0.1.0" +dependencies = 
[ + "anyhow", + "dirs", + "kb-config", + "kb-core", + "serde", + "serde_json", + "toml", + "tracing", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "kb-cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "kb-app", + "kb-config", + "kb-core", + "serde_json", +] + +[[package]] +name = "kb-config" +version = "0.1.0" +dependencies = [ + "anyhow", + "dirs", + "kb-core", + "serde", + "serde_json", + "toml", +] + +[[package]] +name = "kb-core" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "serde", + "serde_json", + "serde_json_canonicalizer", + "thiserror 2.0.18", + "time", + "unicode-normalization", +] + +[[package]] +name = "kb-parse-types" +version = "0.1.0" +dependencies = [ + "kb-core", + "serde", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libredox" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +dependencies = [ + "libc", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + 
"proc-macro2", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom", + "libredox", + "thiserror 1.0.69", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "ryu-js" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", 
+] + +[[package]] +name = "serde_json_canonicalizer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe52319a927259afbfa5180c5157cd8167edfd3e8c254f9558c7fef44c5649f2" +dependencies = [ + "ryu-js", + "serde", + "serde_json", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "symlink" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca" + +[[package]] +name = "time-macros" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" +dependencies = [ + "crossbeam-channel", + "symlink", + "thiserror 2.0.18", + "time", + 
"tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "utf8parse" +version = 
"0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ce933c2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +[workspace] +resolver = "3" +members = [ + "crates/kb-core", + "crates/kb-parse-types", + "crates/kb-config", + "crates/kb-app", + "crates/kb-cli", +] + +[workspace.package] +edition = "2024" +rust-version = "1.85" +license = "MIT OR Apache-2.0" +repository = 
"https://github.com/altair823/kb" +version = "0.1.0" + +[workspace.dependencies] +anyhow = "1" +thiserror = "2" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] } +uuid = { version = "1", features = ["v7", "serde"] } +blake3 = "1" +tracing = "0.1" diff --git a/crates/kb-app/Cargo.toml b/crates/kb-app/Cargo.toml new file mode 100644 index 0000000..b224590 --- /dev/null +++ b/crates/kb-app/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "kb-app" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Facade — orchestrates components for kb-cli/tui/desktop" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +anyhow = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] } +tracing-appender = "0.2" +toml = "0.8" +dirs = "5" diff --git a/crates/kb-app/src/doctor_signal.rs b/crates/kb-app/src/doctor_signal.rs new file mode 100644 index 0000000..d83c8e2 --- /dev/null +++ b/crates/kb-app/src/doctor_signal.rs @@ -0,0 +1,39 @@ +//! Signal types used by `kb-cli`'s `exit_code` mapping (§10). +//! +//! These are *not* errors per se: a doctor failure is normal output, just +//! signalled out-of-band so the CLI can exit with the right status. 
use std::fmt;

/// Out-of-band signal that the doctor run reported at least one failing check.
///
/// This is not an error in the usual sense: a doctor failure is normal
/// output. The type exists only so the CLI's exit-code mapping can pick the
/// right process status. It is a zero-sized marker, so it is cheap to copy
/// and compare after downcasting from a boxed error.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct DoctorUnhealthy;

impl fmt::Display for DoctorUnhealthy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("doctor unhealthy")
    }
}

impl std::error::Error for DoctorUnhealthy {}

/// Out-of-band signal rendered as `"refusal"`.
///
/// NOTE(review): presumably raised when an answer is refused so the CLI can
/// exit with a dedicated status — confirm against the `kb-cli` exit-code table.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct RefusalSignal;

impl fmt::Display for RefusalSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("refusal")
    }
}

impl std::error::Error for RefusalSignal {}

/// Out-of-band signal rendered as `"no hit"`.
///
/// NOTE(review): presumably raised when a query produced no results so the
/// CLI can exit with a dedicated status — confirm against the `kb-cli`
/// exit-code table.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct NoHitSignal;

impl fmt::Display for NoHitSignal {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("no hit")
    }
}

impl std::error::Error for NoHitSignal {}
+ +use std::path::PathBuf; + +use anyhow::bail; +use serde::{Deserialize, Serialize}; + +use kb_core::{ + Answer, CanonicalDocument, Chunk, ChunkId, DocFilter, DocSummary, DocumentId, + IngestReport, SearchHit, SearchMode, SearchQuery, SourceScope, +}; + +pub mod doctor_signal; +pub mod logging; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AskOpts { + pub k: usize, + pub explain: bool, + pub mode: SearchMode, + pub temperature: Option, + pub seed: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DoctorReport { + /// Wire schema version label (`"doctor.v1"`). + pub schema_version: String, + pub ok: bool, + pub checks: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DoctorCheck { + pub name: String, + pub ok: bool, + pub detail: String, + pub hint: Option, +} + +/// Create XDG dirs and write a starter `config.toml`. Idempotent unless +/// `force=true` (which overwrites an existing config). +pub fn init_workspace(force: bool) -> anyhow::Result<()> { + let cfg_path = kb_config::Config::xdg_config_path(); + let data_dir = kb_config::Config::xdg_data_dir(); + let cache_dir = kb_config::Config::xdg_cache_dir(); + let state_dir = kb_config::Config::xdg_state_dir(); + + for d in [ + cfg_path.parent().map(PathBuf::from).unwrap_or_default(), + data_dir.clone(), + cache_dir, + state_dir.clone(), + state_dir.join("logs"), + ] { + if !d.as_os_str().is_empty() { + std::fs::create_dir_all(&d)?; + } + } + + let workspace_root = expand_tilde(&kb_config::Config::defaults().workspace.root); + std::fs::create_dir_all(&workspace_root)?; + + if !cfg_path.exists() || force { + let cfg = kb_config::Config::defaults(); + let toml_text = toml::to_string_pretty(&cfg)?; + std::fs::write(&cfg_path, toml_text)?; + } + + Ok(()) +} + +fn expand_tilde(s: &str) -> PathBuf { + if let Some(rest) = s.strip_prefix("~/") { + if let Some(home) = dirs::home_dir() { + return home.join(rest); + } + } + if s == 
"~" { + if let Some(home) = dirs::home_dir() { + return home; + } + } + PathBuf::from(s) +} + +pub fn ingest(_scope: SourceScope, _summary_only: bool) -> anyhow::Result { + bail!("not yet wired (P1-2)") +} + +pub fn list_docs(_filter: DocFilter) -> anyhow::Result> { + bail!("not yet wired (P1-5)") +} + +pub fn inspect_doc(_id: &DocumentId) -> anyhow::Result { + bail!("not yet wired (P1-5)") +} + +pub fn inspect_chunk(_id: &ChunkId) -> anyhow::Result { + bail!("not yet wired (P1-5)") +} + +pub fn search(_query: SearchQuery) -> anyhow::Result> { + bail!("not yet wired (P3-1/P4-1)") +} + +pub fn ask(_query: &str, _opts: AskOpts) -> anyhow::Result { + bail!("not yet wired (P5-1)") +} + +/// Run the doctor checks. P0 emits `config_loaded` + `data_dir_writable` +/// (downstream checks land in later phases). +pub fn doctor() -> anyhow::Result { + tracing::debug!("doctor() invoked"); + let mut checks = Vec::new(); + + // config_loaded — defaults always load; from-file is best-effort. + let cfg_path = kb_config::Config::xdg_config_path(); + let (config_ok, config_detail) = if cfg_path.exists() { + match kb_config::Config::from_file(&cfg_path) { + Ok(_) => (true, cfg_path.display().to_string()), + Err(e) => (false, format!("{} ({e})", cfg_path.display())), + } + } else { + // Defaults are always loadable; report the path that would be read. + (true, format!("{} (defaults)", cfg_path.display())) + }; + checks.push(DoctorCheck { + name: "config_loaded".to_string(), + ok: config_ok, + detail: config_detail, + hint: if config_ok { + None + } else { + Some("run `kb init` to seed config".to_string()) + }, + }); + + // data_dir_writable — try to create the dir and write a probe file. 
+ let data_dir = kb_config::Config::xdg_data_dir(); + let writable = (|| -> anyhow::Result<()> { + std::fs::create_dir_all(&data_dir)?; + let probe = data_dir.join(".kb-doctor-probe"); + std::fs::write(&probe, b"ok")?; + std::fs::remove_file(&probe).ok(); + Ok(()) + })(); + let (data_ok, data_detail, data_hint) = match writable { + Ok(()) => (true, data_dir.display().to_string(), None), + Err(e) => ( + false, + format!("{} ({e})", data_dir.display()), + Some("ensure XDG_DATA_HOME is writable".to_string()), + ), + }; + checks.push(DoctorCheck { + name: "data_dir_writable".to_string(), + ok: data_ok, + detail: data_detail, + hint: data_hint, + }); + + let ok = checks.iter().all(|c| c.ok); + Ok(DoctorReport { + schema_version: "doctor.v1".to_string(), + ok, + checks, + }) +} diff --git a/crates/kb-app/src/logging.rs b/crates/kb-app/src/logging.rs new file mode 100644 index 0000000..30865fc --- /dev/null +++ b/crates/kb-app/src/logging.rs @@ -0,0 +1,43 @@ +//! Tracing initialization helper for `kb-cli`. +//! +//! Daily-rolling file appender at `~/.local/state/kb/logs/` per task spec. +//! Returns a `WorkerGuard` that the caller must keep alive until program +//! exit (so buffered log lines flush). + +use anyhow::Result; +use tracing_appender::non_blocking::WorkerGuard; +use tracing_subscriber::{EnvFilter, fmt, prelude::*}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum LogLevel { + Default, + Verbose, + Debug, +} + +/// Initialize tracing. Returns a guard to keep alive until exit. Idempotent +/// — a second call is a no-op (the second `try_init` is dropped silently +/// but the guard is still returned so the caller can keep it alive). 
+pub fn init(level: LogLevel) -> Result { + let log_dir = kb_config::Config::xdg_state_dir().join("logs"); + std::fs::create_dir_all(&log_dir)?; + + let file_appender = tracing_appender::rolling::daily(&log_dir, "kb.log"); + let (nb, guard) = tracing_appender::non_blocking(file_appender); + + let env_filter = match level { + LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")), + LogLevel::Verbose => EnvFilter::new("info"), + LogLevel::Debug => EnvFilter::new("debug"), + }; + + let registry = tracing_subscriber::registry() + .with(env_filter) + .with(fmt::layer().with_writer(nb).with_ansi(false)); + + // `try_init` rather than `init` so a second call (e.g. in tests) is a + // no-op. + let _ = registry.try_init(); + + Ok(guard) +} diff --git a/crates/kb-cli/Cargo.toml b/crates/kb-cli/Cargo.toml new file mode 100644 index 0000000..ed8a81a --- /dev/null +++ b/crates/kb-cli/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "kb-cli" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "kb command-line interface" + +[[bin]] +name = "kb" +path = "src/main.rs" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +kb-app = { path = "../kb-app" } +anyhow = { workspace = true } +serde_json = { workspace = true } +clap = { version = "4", features = ["derive"] } diff --git a/crates/kb-cli/src/main.rs b/crates/kb-cli/src/main.rs new file mode 100644 index 0000000..b31bae5 --- /dev/null +++ b/crates/kb-cli/src/main.rs @@ -0,0 +1,355 @@ +//! `kb` — command-line interface. Each subcommand maps 1:1 to a `kb-app` +//! function. Exit codes per design §10. 
+ +use std::path::PathBuf; +use std::process::ExitCode; + +use clap::{Parser, Subcommand}; + +use kb_app::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal}; + +mod wire; + +#[derive(Parser, Debug)] +#[command(name = "kb", version, about = "personal local knowledge base")] +struct Cli { + /// Path to a non-default `config.toml`. + #[arg(long, global = true)] + config: Option, + + /// Show anyhow chain on errors. + #[arg(long, global = true)] + verbose: bool, + + /// Show tracing target/level on errors. + #[arg(long, global = true)] + debug: bool, + + /// Emit machine-readable wire JSON (`*.v1`). + #[arg(long, global = true)] + json: bool, + + #[command(subcommand)] + command: Cmd, +} + +#[derive(Subcommand, Debug)] +enum Cmd { + /// Initialise XDG dirs + workspace + `config.toml`. + Init { + /// Overwrite an existing `config.toml`. + #[arg(long)] + force: bool, + }, + + /// Scan the workspace and ingest new/updated documents. + Ingest { + /// Workspace root override. + #[arg(long)] + root: Option, + + /// Suppress the per-file `items` list. + #[arg(long)] + summary_only: bool, + }, + + /// Listing subcommands. + List { + #[command(subcommand)] + what: ListWhat, + }, + + /// Inspect documents or chunks by ID. + Inspect { + #[command(subcommand)] + what: InspectWhat, + }, + + /// Lexical / vector / hybrid search over chunks. + Search { + query: String, + + #[arg(long, default_value_t = 10)] + k: usize, + + #[arg(long, value_enum, default_value_t = ModeFlag::Hybrid)] + mode: ModeFlag, + + #[arg(long)] + explain: bool, + }, + + /// Retrieval-augmented question answering. + Ask { + query: String, + + #[arg(long, default_value_t = 8)] + k: usize, + + #[arg(long, value_enum, default_value_t = ModeFlag::Hybrid)] + mode: ModeFlag, + + #[arg(long)] + explain: bool, + + #[arg(long)] + temperature: Option, + + #[arg(long)] + seed: Option, + }, + + /// Health check. + Doctor, + + /// Eval suite (placeholder; lands in P9). 
+ Eval { + #[command(subcommand)] + what: EvalWhat, + }, +} + +#[derive(Subcommand, Debug)] +enum ListWhat { + /// List documents currently indexed. + Docs, +} + +#[derive(Subcommand, Debug)] +enum InspectWhat { + /// Inspect a single document by ID. + Doc { id: String }, + /// Inspect a single chunk by ID. + Chunk { id: String }, +} + +#[derive(Subcommand, Debug)] +enum EvalWhat { + /// Run an eval suite (placeholder for P9). + Run { + #[arg(long)] + suite: Option, + }, +} + +#[derive(Clone, Copy, Debug, clap::ValueEnum)] +enum ModeFlag { + Lexical, + Vector, + Hybrid, +} + +impl From for kb_core::SearchMode { + fn from(m: ModeFlag) -> Self { + match m { + ModeFlag::Lexical => kb_core::SearchMode::Lexical, + ModeFlag::Vector => kb_core::SearchMode::Vector, + ModeFlag::Hybrid => kb_core::SearchMode::Hybrid, + } + } +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + let level = if cli.debug { + kb_app::logging::LogLevel::Debug + } else if cli.verbose { + kb_app::logging::LogLevel::Verbose + } else { + kb_app::logging::LogLevel::Default + }; + // Fail-soft: if logging init errors (e.g. XDG state dir is read-only), + // proceed without a guard rather than crashing — `kb` is still usable. + let _log_guard = kb_app::logging::init(level).ok(); + match run(&cli) { + Ok(()) => ExitCode::from(0), + Err(e) => { + let code = exit_code(&e); + // Refusals at exit code 1 print to stdout (already done by the + // caller); errors go to stderr. 
+ if code != 1 { + eprintln!("error: {e}"); + if cli.verbose { + for cause in e.chain().skip(1) { + eprintln!(" caused by: {cause}"); + } + } + } + ExitCode::from(code) + } + } +} + +fn exit_code(err: &anyhow::Error) -> u8 { + if err.downcast_ref::().is_some() { + return 1; + } + if err.downcast_ref::().is_some() { + return 1; + } + if err.downcast_ref::().is_some() { + return 3; + } + 2 +} + +fn run(cli: &Cli) -> anyhow::Result<()> { + match &cli.command { + Cmd::Init { force } => { + kb_app::init_workspace(*force)?; + if !cli.json { + println!( + "created {}", + kb_config::Config::xdg_config_path().display() + ); + println!("created {}", kb_config::Config::xdg_data_dir().display()); + println!("created {}", kb_config::Config::xdg_state_dir().display()); + println!("hint edit the config above, then `kb ingest`"); + } + Ok(()) + } + + Cmd::Ingest { + root, + summary_only, + } => { + let cfg = kb_config::Config::load(cli.config.as_deref())?; + let scope = kb_core::SourceScope { + root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)), + include: cfg.workspace.include.clone(), + exclude: cfg.workspace.exclude.clone(), + }; + let report = kb_app::ingest(scope, *summary_only)?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?); + } else { + println!( + "scanned {} new {} updated {} skipped {} errors {} ({} ms)", + report.scanned, + report.new, + report.updated, + report.skipped, + report.errors, + report.duration_ms + ); + } + Ok(()) + } + + Cmd::List { what } => match what { + ListWhat::Docs => { + let docs = kb_app::list_docs(kb_core::DocFilter::default())?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?); + } else { + for d in &docs { + println!("{}\t{}", d.doc_id, d.doc_path.0); + } + } + Ok(()) + } + }, + + Cmd::Inspect { what } => match what { + InspectWhat::Doc { id } => { + let doc_id: kb_core::DocumentId = id.parse()?; + let doc = kb_app::inspect_doc(&doc_id)?; + 
// Inspect doc emits a `CanonicalDocument` — there's no §2 + // wire schema for it (P1-5 will decide whether this also + // becomes a tagged wrapper or stays as the raw domain + // object). Until then keep raw JSON, matching pre-P0-1 + // behaviour. + println!("{}", serde_json::to_string(&doc)?); + Ok(()) + } + InspectWhat::Chunk { id } => { + let chunk_id: kb_core::ChunkId = id.parse()?; + let chunk = kb_app::inspect_chunk(&chunk_id)?; + println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?); + Ok(()) + } + }, + + Cmd::Search { + query, + k, + mode, + explain: _, + } => { + let q = kb_core::SearchQuery { + text: query.clone(), + mode: (*mode).into(), + k: *k, + filters: kb_core::SearchFilters::default(), + }; + let hits = kb_app::search(q)?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_search_hits(&hits))?); + } else { + for h in &hits { + println!("{:>2}. {:.2} {}", h.rank, h.retrieval.fusion_score, h.doc_path.0); + } + } + Ok(()) + } + + Cmd::Ask { + query, + k, + mode, + explain, + temperature, + seed, + } => { + let opts = kb_app::AskOpts { + k: *k, + explain: *explain, + mode: (*mode).into(), + temperature: *temperature, + seed: *seed, + }; + let ans = kb_app::ask(query, opts)?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_answer(&ans))?); + } else { + println!("{}", ans.answer); + } + // Refusal → exit 1. 
+ if !ans.grounded { + return Err(RefusalSignal.into()); + } + Ok(()) + } + + Cmd::Doctor => { + let report = kb_app::doctor()?; + if cli.json { + println!("{}", serde_json::to_string(&wire::wire_doctor(&report))?); + } else { + for c in &report.checks { + let mark = if c.ok { "✓" } else { "✗" }; + println!("{mark} {:<20} {}", c.name, c.detail); + if let (false, Some(hint)) = (c.ok, c.hint.as_ref()) { + println!(" hint: {hint}"); + } + } + if !report.ok { + println!(); + let failed = report.checks.iter().filter(|c| !c.ok).count(); + println!("{failed} check(s) failed."); + } + } + if !report.ok { + return Err(DoctorUnhealthy.into()); + } + Ok(()) + } + + Cmd::Eval { what } => match what { + EvalWhat::Run { suite: _ } => { + anyhow::bail!("not yet wired (P9-3)") + } + }, + } +} + diff --git a/crates/kb-cli/src/wire.rs b/crates/kb-cli/src/wire.rs new file mode 100644 index 0000000..7dc236d --- /dev/null +++ b/crates/kb-cli/src/wire.rs @@ -0,0 +1,175 @@ +//! CLI-side wire-schema wrappers. +//! +//! Convention (per design §2): every JSON object emitted on stdout in +//! `--json` mode MUST carry a top-level `schema_version` of the form +//! `".v1"`. The kb-core types are pure domain types and do NOT +//! carry `schema_version` themselves; the CLI wraps them on emit. The one +//! exception is `DoctorReport`, where `schema_version` is part of the wire +//! type because the doctor wire object IS its own structured surface. +//! +//! Future tasks (P1-5, P3, P4, P5) replacing stub `bail!` paths must call +//! these helpers from the relevant CLI subcommand handler before +//! `serde_json::to_string`. +//! +//! Each helper is total (returns `serde_json::Value`, never an error) — the +//! input is a fully-typed `serde::Serialize` value, so the only way to fail +//! is OOM, which would have killed the process anyway. 
+ +use serde_json::Value; + +use kb_app::DoctorReport; +use kb_core::{Answer, Chunk, DocSummary, IngestReport, SearchHit}; + +/// Insert `schema_version` into an object-shaped `Value`. Helper for the +/// "serialize, then tag" pattern used by all the per-type wrappers below. +fn tag_object(mut v: Value, schema_version: &str) -> Value { + if let Value::Object(ref mut map) = v { + map.insert( + "schema_version".to_string(), + Value::String(schema_version.to_string()), + ); + } + v +} + +/// Wrap an [`IngestReport`] as `ingest_report.v1`. +pub fn wire_ingest(r: &IngestReport) -> Value { + let v = serde_json::to_value(r).expect("IngestReport serializes"); + tag_object(v, "ingest_report.v1") +} + +/// Wrap a single [`DocSummary`] as `doc_summary.v1`. +pub fn wire_doc_summary(d: &DocSummary) -> Value { + let v = serde_json::to_value(d).expect("DocSummary serializes"); + tag_object(v, "doc_summary.v1") +} + +/// Wrap a list of [`DocSummary`] values as a JSON array of `doc_summary.v1` +/// objects (one tag per element, per design §2.5 — there is no list-envelope +/// schema; the list shape is `[{schema_version: "doc_summary.v1", ...}, ...]`). +pub fn wire_doc_summaries(d: &[DocSummary]) -> Value { + Value::Array(d.iter().map(wire_doc_summary).collect()) +} + +/// Wrap a [`Chunk`] as `chunk_inspection.v1` (§2.6). NOTE: the wire schema +/// requires `doc_path`, which the kb-core `Chunk` does not currently carry — +/// when P1-5 wires the Ok-path, the implementation should either enrich +/// `Chunk` or pass `doc_path` alongside. For now this helper emits whatever +/// fields `Chunk` serializes with, plus the `schema_version` tag. +pub fn wire_chunk_inspection(c: &Chunk) -> Value { + let v = serde_json::to_value(c).expect("Chunk serializes"); + tag_object(v, "chunk_inspection.v1") +} + +/// Wrap a single [`SearchHit`] as `search_hit.v1`. 
+pub fn wire_search_hit(h: &SearchHit) -> Value { + let mut v = serde_json::to_value(h).expect("SearchHit serializes"); + // Promote `retrieval.fusion_score` to a top-level `score` per §2.2. + if let Value::Object(ref mut map) = v { + if let Some(Value::Object(retrieval)) = map.get("retrieval") { + if let Some(score) = retrieval.get("fusion_score").cloned() { + map.insert("score".to_string(), score); + } + } + } + tag_object(v, "search_hit.v1") +} + +/// Wrap a list of [`SearchHit`] values as a JSON array of `search_hit.v1` +/// objects (one tag per element, per design §2.2). +pub fn wire_search_hits(hits: &[SearchHit]) -> Value { + Value::Array(hits.iter().map(wire_search_hit).collect()) +} + +/// Wrap an [`Answer`] as `answer.v1`. +pub fn wire_answer(a: &Answer) -> Value { + let v = serde_json::to_value(a).expect("Answer serializes"); + tag_object(v, "answer.v1") +} + +/// Idempotent pass-through for [`DoctorReport`] — the type already carries +/// `schema_version: "doctor.v1"` (struct-field convention, the one +/// exception called out in the module doc above). This helper exists so +/// every `--json` branch in `kb-cli` goes through `wire::*`, keeping the +/// emit pattern uniform. +pub fn wire_doctor(d: &DoctorReport) -> Value { + // Round-trip through `to_value` to confirm the field is serialized; + // then re-tag (no-op when the field is already present, defensive + // when a future refactor drops the struct-field). 
+ let v = serde_json::to_value(d).expect("DoctorReport serializes"); + if let Value::Object(ref map) = v { + if matches!( + map.get("schema_version"), + Some(Value::String(s)) if s == "doctor.v1" + ) { + return v; + } + } + tag_object(v, "doctor.v1") +} + +#[cfg(test)] +mod tests { + use super::*; + + fn schema_of(v: &Value) -> Option<&str> { + v.as_object()?.get("schema_version")?.as_str() + } + + #[test] + fn doctor_round_trip_preserves_schema_version() { + let d = DoctorReport { + schema_version: "doctor.v1".to_string(), + ok: true, + checks: Vec::new(), + }; + let v = wire_doctor(&d); + assert_eq!(schema_of(&v), Some("doctor.v1")); + // Sanity: ok/checks are preserved. + assert_eq!(v.get("ok").and_then(Value::as_bool), Some(true)); + assert!(v.get("checks").and_then(Value::as_array).is_some()); + } + + #[test] + fn ingest_wrapper_tags_schema_version() { + use kb_core::SourceScope; + let r = IngestReport { + scope: SourceScope { + root: std::path::PathBuf::from("/tmp"), + include: vec![], + exclude: vec![], + }, + scanned: 0, + new: 0, + updated: 0, + skipped: 0, + errors: 0, + duration_ms: 0, + items: None, + }; + let v = wire_ingest(&r); + assert_eq!(schema_of(&v), Some("ingest_report.v1")); + assert!(v.get("items").is_some()); + } + + #[test] + fn doc_summaries_wraps_each_element() { + let v = wire_doc_summaries(&[]); + assert!(v.is_array()); + assert_eq!(v.as_array().unwrap().len(), 0); + } + + #[test] + fn search_hits_wraps_each_element() { + let v = wire_search_hits(&[]); + assert!(v.is_array()); + assert_eq!(v.as_array().unwrap().len(), 0); + } + + #[test] + fn tag_object_inserts_into_object() { + let v = Value::Object(serde_json::Map::new()); + let tagged = tag_object(v, "x.v1"); + assert_eq!(schema_of(&tagged), Some("x.v1")); + } +} diff --git a/crates/kb-config/Cargo.toml b/crates/kb-config/Cargo.toml new file mode 100644 index 0000000..f1430f8 --- /dev/null +++ b/crates/kb-config/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "kb-config" +version = { 
workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Config schema + XDG path resolution" + +[dependencies] +# kb-core::CoreError reserved for P1-* config errors +kb-core = { path = "../kb-core" } +anyhow = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +toml = "0.8" +dirs = "5" diff --git a/crates/kb-config/src/lib.rs b/crates/kb-config/src/lib.rs new file mode 100644 index 0000000..b2b1352 --- /dev/null +++ b/crates/kb-config/src/lib.rs @@ -0,0 +1,489 @@ +//! `kb-config` — `Config` schema and XDG path resolution (§6). +//! +//! Layer order (`Config::load`): defaults → file → env (`KB_
_`). +//! CLI overrides land later, applied by `kb-cli` after `Config::load`. + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Config { + pub schema_version: u32, + pub workspace: WorkspaceCfg, + pub storage: StorageCfg, + pub indexing: IndexingCfg, + pub chunking: ChunkingCfg, + pub models: ModelsCfg, + pub search: SearchCfg, + pub rag: RagCfg, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct WorkspaceCfg { + pub root: String, + pub include: Vec, + pub exclude: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct StorageCfg { + pub data_dir: String, + pub sqlite: String, + pub vector_dir: String, + pub asset_dir: String, + pub artifact_dir: String, + pub model_dir: String, + pub runs_dir: String, + pub copy_threshold_mb: u64, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IndexingCfg { + pub max_parallel_extractors: u32, + pub max_parallel_embeddings: u32, + pub watch_filesystem: bool, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ChunkingCfg { + pub target_tokens: usize, + pub overlap_tokens: usize, + pub respect_markdown_headings: bool, + pub chunker_version: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ModelsCfg { + pub embedding: EmbeddingModelCfg, + pub llm: LlmCfg, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct EmbeddingModelCfg { + pub provider: String, + pub model: String, + pub version: String, + pub dimensions: usize, + pub batch_size: usize, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct LlmCfg { + pub provider: String, + pub model: String, + pub context_tokens: usize, + pub endpoint: String, + pub temperature: f32, + pub seed: u64, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct 
SearchCfg { + pub default_k: usize, + pub hybrid_fusion: String, + pub rrf_k: u32, + pub snippet_chars: usize, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RagCfg { + pub prompt_template_version: String, + pub score_gate: f32, + pub explain_default: bool, + pub max_context_tokens: usize, +} + +impl Config { + /// Defaults per design §6.4. + pub fn defaults() -> Self { + Self { + schema_version: 1, + workspace: WorkspaceCfg { + root: "~/KnowledgeBase".to_string(), + include: vec!["**/*.md".to_string()], + exclude: vec![ + ".git/**".to_string(), + "node_modules/**".to_string(), + ".obsidian/**".to_string(), + ], + }, + storage: StorageCfg { + data_dir: "${XDG_DATA_HOME:-~/.local/share}/kb".to_string(), + sqlite: "{data_dir}/kb.sqlite".to_string(), + vector_dir: "{data_dir}/lancedb".to_string(), + asset_dir: "{data_dir}/assets".to_string(), + artifact_dir: "{data_dir}/artifacts".to_string(), + model_dir: "{data_dir}/models".to_string(), + runs_dir: "{data_dir}/runs".to_string(), + copy_threshold_mb: 100, + }, + indexing: IndexingCfg { + max_parallel_extractors: 2, + max_parallel_embeddings: 1, + watch_filesystem: false, + }, + chunking: ChunkingCfg { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: true, + chunker_version: "md-heading-v1".to_string(), + }, + models: ModelsCfg { + embedding: EmbeddingModelCfg { + provider: "fastembed".to_string(), + model: "multilingual-e5-small".to_string(), + version: "v1".to_string(), + dimensions: 384, + batch_size: 64, + }, + llm: LlmCfg { + provider: "ollama".to_string(), + model: "qwen2.5:14b-instruct".to_string(), + context_tokens: 32768, + endpoint: "http://127.0.0.1:11434".to_string(), + temperature: 0.0, + seed: 0, + }, + }, + search: SearchCfg { + default_k: 10, + hybrid_fusion: "rrf".to_string(), + rrf_k: 60, + snippet_chars: 220, + }, + rag: RagCfg { + prompt_template_version: "rag-v1".to_string(), + score_gate: 0.30, + explain_default: false, + max_context_tokens: 
8000, + }, + } + } + + /// Read config from disk and merge env overrides on top of it. If the + /// file is missing, defaults are used (so `kb doctor` runs with no + /// prior `kb init`). + pub fn load(path: Option<&Path>) -> anyhow::Result { + let from_disk = match path { + Some(p) if p.exists() => Self::from_file(p)?, + Some(_) => Self::defaults(), + None => { + let p = Self::xdg_config_path(); + if p.exists() { + Self::from_file(&p)? + } else { + Self::defaults() + } + } + }; + let env: HashMap = std::env::vars().collect(); + Ok(from_disk.apply_env(&env)) + } + + pub fn from_file(path: &Path) -> anyhow::Result { + let text = std::fs::read_to_string(path)?; + let cfg: Self = toml::from_str(&text)?; + Ok(cfg) + } + + /// Apply `KB_
_` env overrides. Unknown keys are ignored. + /// + /// The mapping is an explicit grep-friendly whitelist — one match arm + /// per leaf key in `Config`. Booleans accept `1` / `true` / `yes` + /// (case-insensitive) for true and anything else for false. Numeric + /// keys silently keep their prior value if the env value fails to + /// parse, so a malformed `KB_*` cannot crash startup. + pub fn apply_env(mut self, env: &HashMap) -> Self { + for (k, v) in env { + if !k.starts_with("KB_") { + continue; + } + match k.as_str() { + // workspace + "KB_WORKSPACE_ROOT" => self.workspace.root = v.clone(), + + // storage + "KB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(), + "KB_STORAGE_SQLITE" => self.storage.sqlite = v.clone(), + "KB_STORAGE_VECTOR_DIR" => self.storage.vector_dir = v.clone(), + "KB_STORAGE_ASSET_DIR" => self.storage.asset_dir = v.clone(), + "KB_STORAGE_ARTIFACT_DIR" => self.storage.artifact_dir = v.clone(), + "KB_STORAGE_MODEL_DIR" => self.storage.model_dir = v.clone(), + "KB_STORAGE_RUNS_DIR" => self.storage.runs_dir = v.clone(), + "KB_STORAGE_COPY_THRESHOLD_MB" => { + if let Ok(n) = v.parse::() { + self.storage.copy_threshold_mb = n; + } + } + + // indexing + "KB_INDEXING_MAX_PARALLEL_EXTRACTORS" => { + if let Ok(n) = v.parse::() { + self.indexing.max_parallel_extractors = n; + } + } + "KB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => { + if let Ok(n) = v.parse::() { + self.indexing.max_parallel_embeddings = n; + } + } + "KB_INDEXING_WATCH_FILESYSTEM" => { + self.indexing.watch_filesystem = parse_bool(v); + } + + // chunking + "KB_CHUNKING_TARGET_TOKENS" => { + if let Ok(n) = v.parse::() { + self.chunking.target_tokens = n; + } + } + "KB_CHUNKING_OVERLAP_TOKENS" => { + if let Ok(n) = v.parse::() { + self.chunking.overlap_tokens = n; + } + } + "KB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => { + self.chunking.respect_markdown_headings = parse_bool(v); + } + "KB_CHUNKING_CHUNKER_VERSION" => self.chunking.chunker_version = v.clone(), + + // models.embedding + 
"KB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(), + "KB_MODELS_EMBEDDING_MODEL" => self.models.embedding.model = v.clone(), + "KB_MODELS_EMBEDDING_VERSION" => self.models.embedding.version = v.clone(), + "KB_MODELS_EMBEDDING_DIMENSIONS" => { + if let Ok(n) = v.parse::() { + self.models.embedding.dimensions = n; + } + } + "KB_MODELS_EMBEDDING_BATCH_SIZE" => { + if let Ok(n) = v.parse::() { + self.models.embedding.batch_size = n; + } + } + + // models.llm + "KB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(), + "KB_MODELS_LLM_MODEL" => self.models.llm.model = v.clone(), + "KB_MODELS_LLM_CONTEXT_TOKENS" => { + if let Ok(n) = v.parse::() { + self.models.llm.context_tokens = n; + } + } + "KB_MODELS_LLM_ENDPOINT" => self.models.llm.endpoint = v.clone(), + "KB_MODELS_LLM_TEMPERATURE" => { + if let Ok(f) = v.parse::() { + self.models.llm.temperature = f; + } + } + "KB_MODELS_LLM_SEED" => { + if let Ok(n) = v.parse::() { + self.models.llm.seed = n; + } + } + + // search + "KB_SEARCH_DEFAULT_K" => { + if let Ok(n) = v.parse::() { + self.search.default_k = n; + } + } + "KB_SEARCH_HYBRID_FUSION" => self.search.hybrid_fusion = v.clone(), + "KB_SEARCH_RRF_K" => { + if let Ok(n) = v.parse::() { + self.search.rrf_k = n; + } + } + "KB_SEARCH_SNIPPET_CHARS" => { + if let Ok(n) = v.parse::() { + self.search.snippet_chars = n; + } + } + + // rag + "KB_RAG_PROMPT_TEMPLATE_VERSION" => { + self.rag.prompt_template_version = v.clone(); + } + "KB_RAG_SCORE_GATE" => { + if let Ok(f) = v.parse::() { + self.rag.score_gate = f; + } + } + "KB_RAG_EXPLAIN_DEFAULT" => { + self.rag.explain_default = parse_bool(v); + } + "KB_RAG_MAX_CONTEXT_TOKENS" => { + if let Ok(n) = v.parse::() { + self.rag.max_context_tokens = n; + } + } + + // Unknown KB_* keys are silently ignored — see + // `env_unknown_key_is_ignored` test. + _ => {} + } + } + self + } + + /// `~/.config/kb/config.toml` (honors `XDG_CONFIG_HOME`). 
+ pub fn xdg_config_path() -> PathBuf { + if let Ok(custom) = std::env::var("XDG_CONFIG_HOME") { + if !custom.is_empty() { + return PathBuf::from(custom).join("kb").join("config.toml"); + } + } + match dirs::config_dir() { + Some(d) => d.join("kb").join("config.toml"), + None => PathBuf::from("./kb/config.toml"), + } + } + + /// `~/.local/share/kb` (honors `XDG_DATA_HOME`). + pub fn xdg_data_dir() -> PathBuf { + if let Ok(custom) = std::env::var("XDG_DATA_HOME") { + if !custom.is_empty() { + return PathBuf::from(custom).join("kb"); + } + } + match dirs::data_dir() { + Some(d) => d.join("kb"), + None => PathBuf::from("./kb-data"), + } + } + + /// `~/.cache/kb` (honors `XDG_CACHE_HOME`). + pub fn xdg_cache_dir() -> PathBuf { + if let Ok(custom) = std::env::var("XDG_CACHE_HOME") { + if !custom.is_empty() { + return PathBuf::from(custom).join("kb"); + } + } + match dirs::cache_dir() { + Some(d) => d.join("kb"), + None => PathBuf::from("./kb-cache"), + } + } + + /// `~/.local/state/kb` (honors `XDG_STATE_HOME`). + pub fn xdg_state_dir() -> PathBuf { + if let Ok(custom) = std::env::var("XDG_STATE_HOME") { + if !custom.is_empty() { + return PathBuf::from(custom).join("kb"); + } + } + // `dirs` doesn't expose state_dir on all platforms; fall back to + // `$HOME/.local/state/kb` if XDG_STATE_HOME is unset. + if let Some(home) = dirs::home_dir() { + return home.join(".local").join("state").join("kb"); + } + PathBuf::from("./kb-state") + } +} + +/// Parse a permissive boolean — `1` / `true` / `yes` (case-insensitive) +/// for true, anything else for false. Used by `apply_env` for boolean +/// leaves of `Config`. 
+fn parse_bool(s: &str) -> bool { + matches!(s.to_ascii_lowercase().as_str(), "1" | "true" | "yes") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn defaults_are_serde_roundtrip_stable() { + let c = Config::defaults(); + let toml_text = toml::to_string(&c).unwrap(); + let back: Config = toml::from_str(&toml_text).unwrap(); + assert_eq!(c, back); + } + + #[test] + fn defaults_match_design_64_score_gate() { + let c = Config::defaults(); + assert_eq!(c.rag.score_gate, 0.30); + assert_eq!(c.chunking.target_tokens, 500); + assert_eq!(c.models.embedding.dimensions, 384); + assert_eq!(c.search.rrf_k, 60); + } + + #[test] + fn env_override_score_gate() { + let mut env = HashMap::new(); + env.insert("KB_RAG_SCORE_GATE".to_string(), "0.5".to_string()); + let c = Config::defaults().apply_env(&env); + assert!((c.rag.score_gate - 0.5).abs() < 1e-6); + } + + #[test] + fn env_override_search_k() { + let mut env = HashMap::new(); + env.insert("KB_SEARCH_DEFAULT_K".to_string(), "25".to_string()); + let c = Config::defaults().apply_env(&env); + assert_eq!(c.search.default_k, 25); + } + + #[test] + fn env_unknown_key_is_ignored() { + let baseline = Config::defaults(); + let mut env = HashMap::new(); + env.insert("KB_NOPE_FOO".to_string(), "garbage".to_string()); + let c = Config::defaults().apply_env(&env); + assert_eq!(c, baseline); + } + + #[test] + fn env_overrides_chunking_target_tokens() { + let mut env = HashMap::new(); + env.insert("KB_CHUNKING_TARGET_TOKENS".to_string(), "777".to_string()); + let c = Config::defaults().apply_env(&env); + assert_eq!(c.chunking.target_tokens, 777); + } + + #[test] + fn env_overrides_models_llm_endpoint_and_temperature() { + let mut env = HashMap::new(); + env.insert( + "KB_MODELS_LLM_ENDPOINT".to_string(), + "http://10.0.0.1:11434".to_string(), + ); + env.insert("KB_MODELS_LLM_TEMPERATURE".to_string(), "0.7".to_string()); + let c = Config::defaults().apply_env(&env); + assert_eq!(c.models.llm.endpoint, "http://10.0.0.1:11434"); 
+ assert!((c.models.llm.temperature - 0.7).abs() < 1e-6); + } + + #[test] + fn env_overrides_indexing_watch_filesystem_bool() { + let mut env = HashMap::new(); + env.insert( + "KB_INDEXING_WATCH_FILESYSTEM".to_string(), + "true".to_string(), + ); + let c = Config::defaults().apply_env(&env); + assert!(c.indexing.watch_filesystem); + } + + #[test] + fn xdg_paths_honor_env() { + // Must restore env after the test to avoid polluting other tests. + let prev = std::env::var("XDG_CONFIG_HOME").ok(); + // SAFETY: tests in this module run sequentially; we restore below. + unsafe { + std::env::set_var("XDG_CONFIG_HOME", "/tmp/kbtest-xdg-config"); + } + let p = Config::xdg_config_path(); + assert_eq!(p, PathBuf::from("/tmp/kbtest-xdg-config/kb/config.toml")); + // SAFETY: scope-local restore. + unsafe { + match prev { + Some(v) => std::env::set_var("XDG_CONFIG_HOME", v), + None => std::env::remove_var("XDG_CONFIG_HOME"), + } + } + } +} diff --git a/crates/kb-core/Cargo.toml b/crates/kb-core/Cargo.toml new file mode 100644 index 0000000..795aad3 --- /dev/null +++ b/crates/kb-core/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "kb-core" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "kb domain types, traits, and ID recipe (no other kb-* deps)" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +blake3 = { workspace = true } +serde_json_canonicalizer = "0.3" +unicode-normalization = "0.1" diff --git a/crates/kb-core/src/answer.rs b/crates/kb-core/src/answer.rs new file mode 100644 index 0000000..bbf6007 --- /dev/null +++ b/crates/kb-core/src/answer.rs @@ -0,0 +1,66 @@ +//! Answer + RAG types (§3.8). 
+ +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use crate::citation::Citation; +use crate::search::SearchMode; +use crate::versions::PromptTemplateVersion; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Answer { + pub answer: String, + pub citations: Vec, + pub grounded: bool, + pub refusal_reason: Option, + pub model: ModelRef, + pub embedding: Option, + pub prompt_template_version: PromptTemplateVersion, + pub retrieval: AnswerRetrievalSummary, + pub usage: TokenUsage, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AnswerCitation { + pub marker: Option, + pub citation: Citation, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RefusalReason { + ScoreGate, + LlmSelfJudge, + NoIndex, + NoChunks, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ModelRef { + pub id: String, + pub provider: String, + pub dimensions: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AnswerRetrievalSummary { + pub trace_id: TraceId, + pub mode: SearchMode, + pub k: usize, + pub score_gate: f32, + pub top_score: f32, + pub chunks_returned: u32, + pub chunks_used: u32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TokenUsage { + pub prompt_tokens: u32, + pub completion_tokens: u32, + pub latency_ms: u32, +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct TraceId(pub String); diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs new file mode 100644 index 0000000..8532175 --- /dev/null +++ b/crates/kb-core/src/asset.rs @@ -0,0 +1,61 @@ +//! Raw asset, source URI, workspace path (§3.3). 
+ +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use crate::errors::CoreError; +use crate::ids::AssetId; +use crate::media::{Checksum, MediaType}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind", content = "value")] +pub enum SourceUri { + File(PathBuf), + /// `kb://` virtual reference. + Kb(String), +} + +/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always +/// produced via `crate::normalize::to_posix` (filesystem side) or +/// `WorkspacePath::new` (parse side). The inner string is forbidden from +/// containing the `#` character: a workspace path must never collide with +/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the +/// invariant is enforced at construction rather than at every call site. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct WorkspacePath(pub String); + +impl WorkspacePath { + /// Construct a `WorkspacePath` from a string, rejecting any input that + /// contains `#`. Use this on the parser side (e.g. `Citation::parse`) + /// where the input does not flow through `to_posix`. 
+ pub fn new(s: String) -> Result { + if s.contains('#') { + return Err(CoreError::Malformed(format!( + "workspace path must not contain '#': {s:?}" + ))); + } + Ok(Self(s)) + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum AssetStorage { + Copied { path: PathBuf }, + Reference { path: PathBuf, sha: Checksum }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RawAsset { + pub asset_id: AssetId, + pub source_uri: SourceUri, + pub workspace_path: WorkspacePath, + pub media_type: MediaType, + pub byte_len: u64, + pub checksum: Checksum, + #[serde(with = "time::serde::rfc3339")] + pub discovered_at: OffsetDateTime, + pub stored: AssetStorage, +} diff --git a/crates/kb-core/src/chunk.rs b/crates/kb-core/src/chunk.rs new file mode 100644 index 0000000..1c3b0aa --- /dev/null +++ b/crates/kb-core/src/chunk.rs @@ -0,0 +1,19 @@ +//! Chunk (§3.5). + +use serde::{Deserialize, Serialize}; + +use crate::document::SourceSpan; +use crate::ids::{BlockId, ChunkId, DocumentId}; +use crate::versions::ChunkerVersion; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Chunk { + pub chunk_id: ChunkId, + pub doc_id: DocumentId, + pub block_ids: Vec, + pub text: String, + pub heading_path: Vec, + pub source_spans: Vec, + pub token_estimate: usize, + pub chunker_version: ChunkerVersion, +} diff --git a/crates/kb-core/src/citation.rs b/crates/kb-core/src/citation.rs new file mode 100644 index 0000000..80adb67 --- /dev/null +++ b/crates/kb-core/src/citation.rs @@ -0,0 +1,357 @@ +//! Citation (§3.5) — discriminated 5-variant. Each variant has a canonical +//! W3C Media Fragments URI per design §0 Q3. 
+ +use anyhow::{Result, bail}; +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum Citation { + Line { + path: WorkspacePath, + start: u32, + end: u32, + section: Option, + }, + Page { + path: WorkspacePath, + page: u32, + section: Option, + }, + Region { + path: WorkspacePath, + x: u32, + y: u32, + w: u32, + h: u32, + }, + Caption { + path: WorkspacePath, + model: String, + }, + Time { + path: WorkspacePath, + start_ms: u64, + end_ms: u64, + speaker: Option, + }, +} + +impl Citation { + pub fn path(&self) -> &WorkspacePath { + match self { + Citation::Line { path, .. } + | Citation::Page { path, .. } + | Citation::Region { path, .. } + | Citation::Caption { path, .. } + | Citation::Time { path, .. } => path, + } + } + + /// Emit a W3C Media Fragments URI per design §0 Q3. + /// `section` and `speaker` and `caption.model` are NOT part of the URI + /// fragment; they live in the structured wire object. + pub fn to_uri(&self) -> String { + match self { + Citation::Line { path, start, end, .. } => { + if start == end { + format!("{}#L{}", path.0, start) + } else { + format!("{}#L{}-L{}", path.0, start, end) + } + } + Citation::Page { path, page, .. } => format!("{}#p={}", path.0, page), + Citation::Region { + path, x, y, w, h, .. + } => format!("{}#xywh={},{},{},{}", path.0, x, y, w, h), + Citation::Caption { path, .. } => format!("{}#caption", path.0), + Citation::Time { + path, + start_ms, + end_ms, + speaker, + } => { + let s = format_hms_ms(*start_ms); + let e = format_hms_ms(*end_ms); + match speaker { + Some(sp) => format!("{}#t={},{}&speaker={}", path.0, s, e, sp), + None => format!("{}#t={},{}", path.0, s, e), + } + } + } + } + + /// Strict inverse of `to_uri`. 
The `section` / `caption.model` fields + /// are not part of the URI grammar, so a parsed Citation will have + /// `section = None` and `model = ""` for the relevant variants. + /// Round-trip property holds for citations whose non-URI fields are at + /// their default values (see test). + pub fn parse(s: &str) -> Result { + let (path_str, frag) = match s.rsplit_once('#') { + Some(t) => t, + None => bail!("citation has no '#' fragment: {s:?}"), + }; + // `WorkspacePath::new` rejects any remaining `#` on the path side + // (e.g. the input had multiple `#` separators), closing the + // hash-in-path concern at construction rather than at every reader. + let path = WorkspacePath::new(path_str.to_owned())?; + + if let Some(rest) = frag.strip_prefix("L") { + // line range: `L` or `L-L` + if let Some((a, b)) = rest.split_once("-L") { + let start: u32 = a + .parse() + .map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?; + let end: u32 = b + .parse() + .map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?; + return Ok(Citation::Line { + path, + start, + end, + section: None, + }); + } + let n: u32 = rest + .parse() + .map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?; + return Ok(Citation::Line { + path, + start: n, + end: n, + section: None, + }); + } + if let Some(rest) = frag.strip_prefix("p=") { + let page: u32 = rest + .parse() + .map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?; + return Ok(Citation::Page { + path, + page, + section: None, + }); + } + if let Some(rest) = frag.strip_prefix("xywh=") { + let parts: Vec<&str> = rest.split(',').collect(); + if parts.len() != 4 { + bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})"); + } + let x: u32 = parts[0] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?; + let y: u32 = parts[1] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?; + 
let w: u32 = parts[2] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?; + let h: u32 = parts[3] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?; + return Ok(Citation::Region { path, x, y, w, h }); + } + if frag == "caption" { + return Ok(Citation::Caption { + path, + model: String::new(), + }); + } + if let Some(rest) = frag.strip_prefix("t=") { + // `t=,` optionally followed by `&speaker=` + let (range, speaker) = match rest.split_once('&') { + Some((r, kv)) => match kv.strip_prefix("speaker=") { + Some(sp) => (r, Some(sp.to_owned())), + None => bail!("unknown time-fragment param {kv:?} (input {s:?})"), + }, + None => (rest, None), + }; + let (s_str, e_str) = match range.split_once(',') { + Some(t) => t, + None => bail!("time fragment expects ',', got {range:?} (input {s:?})"), + }; + let start_ms = parse_hms_ms(s_str)?; + let end_ms = parse_hms_ms(e_str)?; + return Ok(Citation::Time { + path, + start_ms, + end_ms, + speaker, + }); + } + bail!("unrecognised citation fragment {frag:?} (input {s:?})") + } +} + +/// Format milliseconds as `hh:mm:ss.mmm` (W3C Media Fragments NPT-with-ms). +fn format_hms_ms(ms: u64) -> String { + let hours = ms / 3_600_000; + let minutes = (ms % 3_600_000) / 60_000; + let seconds = (ms % 60_000) / 1000; + let millis = ms % 1000; + format!("{hours:02}:{minutes:02}:{seconds:02}.{millis:03}") +} + +fn parse_hms_ms(s: &str) -> Result { + // Accept `hh:mm:ss.mmm` (the form we emit). Reject malformed input. 
+ let parts: Vec<&str> = s.split(':').collect(); + if parts.len() != 3 { + bail!("time component expects hh:mm:ss.mmm, got {s:?}"); + } + let h: u64 = parts[0] + .parse() + .map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?; + let m: u64 = parts[1] + .parse() + .map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?; + let (sec, ms) = match parts[2].split_once('.') { + Some((s_part, ms_part)) => { + let sec: u64 = s_part + .parse() + .map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?; + // Pad/truncate to exactly 3 digits. + let mut ms_str = ms_part.to_owned(); + while ms_str.len() < 3 { + ms_str.push('0'); + } + ms_str.truncate(3); + let ms: u64 = ms_str + .parse() + .map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?; + (sec, ms) + } + None => { + let sec: u64 = parts[2] + .parse() + .map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?; + (sec, 0) + } + }; + Ok(h * 3_600_000 + m * 60_000 + sec * 1000 + ms) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn p(s: &str) -> WorkspacePath { + WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'") + } + + #[test] + fn line_range_uri_and_roundtrip() { + let c = Citation::Line { + path: p("notes/rust/kb.md"), + start: 12, + end: 34, + section: None, + }; + assert_eq!(c.to_uri(), "notes/rust/kb.md#L12-L34"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn line_single_uri_and_roundtrip() { + let c = Citation::Line { + path: p("a/b.md"), + start: 7, + end: 7, + section: None, + }; + assert_eq!(c.to_uri(), "a/b.md#L7"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn page_uri_and_roundtrip() { + let c = Citation::Page { + path: p("papers/book.pdf"), + page: 23, + section: None, + }; + assert_eq!(c.to_uri(), "papers/book.pdf#p=23"); + let parsed = 
Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn region_uri_and_roundtrip() { + let c = Citation::Region { + path: p("photos/x.png"), + x: 120, + y: 40, + w: 520, + h: 180, + }; + assert_eq!(c.to_uri(), "photos/x.png#xywh=120,40,520,180"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn caption_uri_and_roundtrip() { + let c = Citation::Caption { + path: p("photos/x.png"), + // `model` is not in the URI grammar; round-trip fills it with "". + model: String::new(), + }; + assert_eq!(c.to_uri(), "photos/x.png#caption"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn time_uri_and_roundtrip_with_speaker() { + let c = Citation::Time { + path: p("recordings/r.m4a"), + start_ms: 822_000, + end_ms: 850_000, + speaker: Some("S1".to_string()), + }; + assert_eq!( + c.to_uri(), + "recordings/r.m4a#t=00:13:42.000,00:14:10.000&speaker=S1" + ); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn time_uri_and_roundtrip_without_speaker() { + let c = Citation::Time { + path: p("recordings/r.m4a"), + start_ms: 1_500, + end_ms: 2_750, + speaker: None, + }; + assert_eq!(c.to_uri(), "recordings/r.m4a#t=00:00:01.500,00:00:02.750"); + let parsed = Citation::parse(&c.to_uri()).unwrap(); + assert_eq!(parsed, c); + } + + #[test] + fn parse_rejects_no_fragment() { + assert!(Citation::parse("just/path.md").is_err()); + } + + #[test] + fn parse_rejects_unknown_fragment() { + assert!(Citation::parse("a.md#mystery=1").is_err()); + } + + /// `rsplit_once('#')` would otherwise leave a `#` on the path side when + /// the input contains multiple `#` separators (e.g. someone embeds a + /// fake fragment in the path). The `WorkspacePath::new` constructor + /// closes that hole at construction time. 
+ #[test] + fn parse_path_with_hash_rejected_at_to_posix_layer() { + // `notes/x#evil.md#L7` — rsplit_once strips `#L7`, leaving + // `notes/x#evil.md` on the path side. WorkspacePath::new must reject. + let r = Citation::parse("notes/x#evil.md#L7"); + assert!(r.is_err(), "path with embedded '#' must be rejected"); + } +} diff --git a/crates/kb-core/src/document.rs b/crates/kb-core/src/document.rs new file mode 100644 index 0000000..e0bb295 --- /dev/null +++ b/crates/kb-core/src/document.rs @@ -0,0 +1,177 @@ +//! CanonicalDocument, Block, SourceSpan, Inline, plus the forward-declared +//! OCR / caption / transcript stubs (§3.4 + §3.7a). + +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::ids::{AssetId, BlockId, DocumentId}; +use crate::media::Lang; +use crate::metadata::{Metadata, Provenance}; +use crate::versions::ParserVersion; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CanonicalDocument { + pub doc_id: DocumentId, + pub source_asset_id: AssetId, + pub workspace_path: WorkspacePath, + pub title: String, + pub lang: Lang, + pub blocks: Vec, + pub metadata: Metadata, + pub provenance: Provenance, + pub parser_version: ParserVersion, + pub schema_version: u32, + pub doc_version: u32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum Block { + Heading(HeadingBlock), + Paragraph(TextBlock), + List(ListBlock), + Code(CodeBlock), + Table(TableBlock), + Quote(TextBlock), + ImageRef(ImageRefBlock), + AudioRef(AudioRefBlock), +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CommonBlock { + pub block_id: BlockId, + pub heading_path: Vec, + pub source_span: SourceSpan, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct HeadingBlock { + pub common: CommonBlock, + pub level: u8, + pub text: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TextBlock 
{ + pub common: CommonBlock, + pub text: String, + pub inlines: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ListBlock { + pub common: CommonBlock, + pub ordered: bool, + pub items: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct CodeBlock { + pub common: CommonBlock, + pub lang: Option, + pub code: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TableBlock { + pub common: CommonBlock, + pub headers: Vec, + pub rows: Vec>, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ImageRefBlock { + pub common: CommonBlock, + pub asset_id: Option, + pub src: String, + pub alt: String, + pub ocr: Option, + pub caption: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct AudioRefBlock { + pub common: CommonBlock, + pub asset_id: AssetId, + pub duration_ms: u64, + pub transcript: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum Inline { + Text(String), + Code(String), + Link { text: String, href: String }, + Strong(Vec), + Emph(Vec), +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum SourceSpan { + Line { + start: u32, + end: u32, + }, + Byte { + start: u64, + end: u64, + }, + Page { + page: u32, + char_start: Option, + char_end: Option, + }, + Region { + x: u32, + y: u32, + w: u32, + h: u32, + }, + Time { + start_ms: u64, + end_ms: u64, + }, +} + +// ── Forward-declared stubs (§3.7a). Bodies are final per design. 
──────── + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct OcrText { + pub joined: String, + pub regions: Vec, + pub engine: String, + pub engine_version: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct OcrRegion { + pub bbox: (u32, u32, u32, u32), + pub text: String, + pub confidence: f32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ModelCaption { + pub text: String, + pub model: String, + pub model_version: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Transcript { + pub segments: Vec, + pub engine: String, + pub engine_version: String, + pub language: Lang, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TranscriptSegment { + pub start_ms: u64, + pub end_ms: u64, + pub text: String, + pub speaker: Option, + pub confidence: Option, +} diff --git a/crates/kb-core/src/errors.rs b/crates/kb-core/src/errors.rs new file mode 100644 index 0000000..cb2da46 --- /dev/null +++ b/crates/kb-core/src/errors.rs @@ -0,0 +1,15 @@ +//! `CoreError` (§10). + +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum CoreError { + #[error("invalid id: {0}")] + InvalidId(String), + #[error("invalid citation: {0}")] + InvalidCitation(String), + #[error("invalid source span: {0}")] + InvalidSpan(String), + #[error("malformed input: {0}")] + Malformed(String), +} diff --git a/crates/kb-core/src/ids.rs b/crates/kb-core/src/ids.rs new file mode 100644 index 0000000..46f60ce --- /dev/null +++ b/crates/kb-core/src/ids.rs @@ -0,0 +1,477 @@ +//! Newtype IDs (§3.1) + ID generation recipe (§4.2). +//! +//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the +//! inner hex string; `FromStr` accepts 32 hex characters (mixed case) and +//! normalizes the stored representation to lowercase so equality and hashing +//! are canonical. 
+ +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::document::SourceSpan; +use crate::errors::CoreError; +use crate::versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, + ParserVersion, +}; + +macro_rules! newtype_id { + ($name:ident) => { + #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] + pub struct $name(pub String); + + impl fmt::Display for $name { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } + } + + impl FromStr for $name { + type Err = CoreError; + fn from_str(s: &str) -> Result { + validate_hex32(s)?; + Ok(Self(s.to_ascii_lowercase())) + } + } + }; +} + +newtype_id!(AssetId); +newtype_id!(DocumentId); +newtype_id!(BlockId); +newtype_id!(ChunkId); +newtype_id!(EmbeddingId); +newtype_id!(IndexId); + +fn validate_hex32(s: &str) -> Result<(), CoreError> { + if s.len() != 32 { + return Err(CoreError::InvalidId(format!( + "expected 32 hex chars, got {}", + s.len() + ))); + } + if !s.bytes().all(|b| b.is_ascii_hexdigit()) { + return Err(CoreError::InvalidId(format!( + "non-hex character in {s:?}" + ))); + } + Ok(()) +} + +/// Canonical-JSON + blake3 + hex prefix 32. Per design §4.2. +pub fn id_from(tuple: T) -> String { + let bytes = serde_json_canonicalizer::to_vec(&tuple) + .expect("canonical JSON serialization must not fail for kb-core inputs"); + // The crate exposes `to_vec` for `T: Serialize` returning `Vec`. 
+ let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..32].to_string() +} + +#[derive(Serialize)] +struct AssetTuple<'a> { + kind: &'static str, + asset_blake3: &'a str, +} + +#[derive(Serialize)] +struct DocTuple<'a> { + kind: &'static str, + workspace_path: &'a str, + asset_id: &'a str, + parser_version: &'a str, +} + +#[derive(Serialize)] +struct BlockTuple<'a> { + kind: &'static str, + doc_id: &'a str, + block_kind: &'a str, + heading_path: &'a [String], + ordinal: u32, + source_span: &'a SourceSpan, +} + +#[derive(Serialize)] +struct ChunkTuple<'a> { + kind: &'static str, + doc_id: &'a str, + chunker_version: &'a str, + block_ids: Vec<&'a str>, + policy_hash: &'a str, +} + +#[derive(Serialize)] +struct EmbeddingTuple<'a> { + kind: &'static str, + chunk_id: &'a str, + model_id: &'a str, + model_version: &'a str, + dimensions: usize, +} + +#[derive(Serialize)] +struct IndexTuple<'a> { + kind: &'static str, + collection: &'a str, + embedding_model: &'a str, + dimensions: usize, + index_version: &'a str, + index_kind: &'a str, + index_params_hash: &'a str, +} + +pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId { + AssetId(id_from(AssetTuple { + kind: "asset", + asset_blake3: asset_blake3_full_hex, + })) +} + +pub fn id_for_doc( + workspace_path: &WorkspacePath, + asset: &AssetId, + parser_version: &ParserVersion, +) -> DocumentId { + DocumentId(id_from(DocTuple { + kind: "doc", + workspace_path: &workspace_path.0, + asset_id: &asset.0, + parser_version: &parser_version.0, + })) +} + +pub fn id_for_block( + doc: &DocumentId, + block_kind: &str, + heading_path: &[String], + ordinal: u32, + span: &SourceSpan, +) -> BlockId { + BlockId(id_from(BlockTuple { + kind: "block", + doc_id: &doc.0, + block_kind, + heading_path, + ordinal, + source_span: span, + })) +} + +pub fn id_for_chunk( + doc: &DocumentId, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + policy_hash: &str, +) -> ChunkId { + ChunkId(id_from(ChunkTuple { + kind: "chunk", + 
doc_id: &doc.0, + chunker_version: &chunker_version.0, + block_ids: block_ids.iter().map(|b| b.0.as_str()).collect(), + policy_hash, + })) +} + +pub fn id_for_embedding( + chunk: &ChunkId, + model: &EmbeddingModelId, + version: &EmbeddingVersion, + dims: usize, +) -> EmbeddingId { + EmbeddingId(id_from(EmbeddingTuple { + kind: "embedding", + chunk_id: &chunk.0, + model_id: &model.0, + model_version: &version.0, + dimensions: dims, + })) +} + +pub fn id_for_index( + collection: &str, + model: &EmbeddingModelId, + dims: usize, + version: &IndexVersion, + kind: &str, + params_hash: &str, +) -> IndexId { + IndexId(id_from(IndexTuple { + kind: "index", + collection, + embedding_model: &model.0, + dimensions: dims, + index_version: &version.0, + index_kind: kind, + index_params_hash: params_hash, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn newtype_display_roundtrip() { + let s = "0123456789abcdef0123456789abcdef"; + let id: AssetId = s.parse().unwrap(); + assert_eq!(id.to_string(), s); + } + + #[test] + fn newtype_rejects_short() { + let r: Result = "abc".parse(); + assert!(r.is_err()); + } + + #[test] + fn newtype_rejects_non_hex() { + let r: Result = "ZZZ456789abcdef0123456789abcdef0".parse(); + assert!(r.is_err()); + } + + #[test] + fn newtype_accepts_uppercase_normalizes_to_lowercase() { + let r: Result = "0123456789ABCDEF0123456789ABCDEF".parse(); + let id = r.expect("uppercase hex must be accepted"); + assert_eq!(id.0, "0123456789abcdef0123456789abcdef"); + assert_eq!(id.to_string(), "0123456789abcdef0123456789abcdef"); + } + + #[test] + fn newtype_rejects_invalid_chars_after_uppercase_pass() { + // Mix of upper-hex (would pass) and non-hex `XYZ` (must reject). + let r: Result = "DEADBEEFCAFEBAB1XYZ23456789ABCD0".parse(); + assert!(r.is_err()); + } + + /// Determinism: 1000 runs of `id_from` over the same input yield the same + /// hex. 
+ #[test] + fn id_from_deterministic_1000() { + #[derive(Serialize)] + struct T<'a> { + a: u32, + b: &'a str, + } + let input = T { a: 7, b: "hello" }; + let first = id_from(&input); + for _ in 0..1000 { + assert_eq!(id_from(&input), first); + } + assert_eq!(first.len(), 32); + } + + /// Key order in the source struct does not affect hash (canonical JSON + /// sorts keys alphabetically). + #[test] + fn id_from_key_order_invariant() { + #[derive(Serialize)] + struct A { + a: u32, + b: u32, + } + #[derive(Serialize)] + struct B { + b: u32, + a: u32, + } + assert_eq!(id_from(A { a: 1, b: 2 }), id_from(B { b: 2, a: 1 })); + } + + /// The expected hex below is hand-computed via design §4.2: + /// tuple = { "kind": "asset", "asset_blake3": "deadbeef" } + /// canonical JSON (key-sorted, no whitespace, both keys are pure ASCII): + /// {"asset_blake3":"deadbeef","kind":"asset"} + /// blake3 of those bytes → hex → first 32 chars. + /// Pinned via an independent tool (b3sum, computed once outside the code + /// under test) so a regression in our JCS or hash pipeline is caught. + #[test] + fn id_for_asset_pinned() { + // printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum + // → cec9353553efb238a7919d38d3e148f1... + let id = id_for_asset("deadbeef"); + assert_eq!(id.0, "cec9353553efb238a7919d38d3e148f1"); + } + + /// Independent pin for id_for_doc. + /// canonical JSON: + /// {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d", + /// "kind":"doc", + /// "parser_version":"pulldown-cmark-0.x", + /// "workspace_path":"notes/test.md"} + /// (concatenated, no whitespace). + #[test] + fn id_for_doc_pinned() { + let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string()); + let path = WorkspacePath::new("notes/test.md".to_string()).unwrap(); + let pv = ParserVersion("pulldown-cmark-0.x".to_string()); + let id = id_for_doc(&path, &asset, &pv); + assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db"); + } + + /// Independent pin for id_for_block. 
+ /// inputs: + /// doc=DocumentId("aabbccdd00112233445566778899aabb"), + /// block_kind="paragraph", heading_path=["Intro"], ordinal=3, + /// span=SourceSpan::Line { start: 10, end: 20 } + /// canonical JSON (key-sorted, compact, no whitespace): + /// {"block_kind":"paragraph", + /// "doc_id":"aabbccdd00112233445566778899aabb", + /// "heading_path":["Intro"], + /// "kind":"block", + /// "ordinal":3, + /// "source_span":{"end":20,"kind":"line","start":10}} + /// computed via: + /// printf '{"block_kind":"paragraph","doc_id":"aabbccdd00112233445566778899aabb","heading_path":["Intro"],"kind":"block","ordinal":3,"source_span":{"end":20,"kind":"line","start":10}}' \ + /// | ~/.cargo/bin/b3sum --no-names | cut -c1-32 + /// → 8a7bf22de7ec3293a792028c829b3812 + #[test] + fn id_for_block_pinned() { + let doc = DocumentId("aabbccdd00112233445566778899aabb".to_string()); + let heading = vec!["Intro".to_string()]; + let span = SourceSpan::Line { start: 10, end: 20 }; + + // Sanity check: confirm that the canonical JSON our code produces + // matches the literal we hashed externally. If a future field-order + // change (or rename) silently shifts the hash, this assertion fails + // before the hex comparison and points at the JSON layer directly. + let expected_json = b"{\"block_kind\":\"paragraph\",\"doc_id\":\"aabbccdd00112233445566778899aabb\",\"heading_path\":[\"Intro\"],\"kind\":\"block\",\"ordinal\":3,\"source_span\":{\"end\":20,\"kind\":\"line\",\"start\":10}}"; + let tuple = BlockTuple { + kind: "block", + doc_id: &doc.0, + block_kind: "paragraph", + heading_path: &heading, + ordinal: 3, + source_span: &span, + }; + assert_eq!( + serde_json_canonicalizer::to_vec(&tuple).unwrap(), + expected_json + ); + + let id = id_for_block(&doc, "paragraph", &heading, 3, &span); + assert_eq!(id.0, "8a7bf22de7ec3293a792028c829b3812"); + } + + /// Independent pin for id_for_chunk. 
+ /// inputs: + /// doc=DocumentId("aabbccdd00112233445566778899aabb"), + /// chunker_version=ChunkerVersion("greedy-1.0"), + /// block_ids=[BlockId("a1b2c3d4e5f6789012345678abcdef00")], + /// policy_hash="abc123" + /// canonical JSON (key-sorted, compact, no whitespace): + /// {"block_ids":["a1b2c3d4e5f6789012345678abcdef00"], + /// "chunker_version":"greedy-1.0", + /// "doc_id":"aabbccdd00112233445566778899aabb", + /// "kind":"chunk", + /// "policy_hash":"abc123"} + /// computed via: + /// printf '{"block_ids":["a1b2c3d4e5f6789012345678abcdef00"],"chunker_version":"greedy-1.0","doc_id":"aabbccdd00112233445566778899aabb","kind":"chunk","policy_hash":"abc123"}' \ + /// | ~/.cargo/bin/b3sum --no-names | cut -c1-32 + /// → 8809f627777fe7ca5c4433b97dd88ce9 + #[test] + fn id_for_chunk_pinned() { + let doc = DocumentId("aabbccdd00112233445566778899aabb".to_string()); + let cv = ChunkerVersion("greedy-1.0".to_string()); + let blocks = vec![BlockId("a1b2c3d4e5f6789012345678abcdef00".to_string())]; + + let expected_json = b"{\"block_ids\":[\"a1b2c3d4e5f6789012345678abcdef00\"],\"chunker_version\":\"greedy-1.0\",\"doc_id\":\"aabbccdd00112233445566778899aabb\",\"kind\":\"chunk\",\"policy_hash\":\"abc123\"}"; + let tuple = ChunkTuple { + kind: "chunk", + doc_id: &doc.0, + chunker_version: &cv.0, + block_ids: blocks.iter().map(|b| b.0.as_str()).collect(), + policy_hash: "abc123", + }; + assert_eq!( + serde_json_canonicalizer::to_vec(&tuple).unwrap(), + expected_json + ); + + let id = id_for_chunk(&doc, &cv, &blocks, "abc123"); + assert_eq!(id.0, "8809f627777fe7ca5c4433b97dd88ce9"); + } + + /// Independent pin for id_for_embedding. 
+ /// inputs: + /// chunk=ChunkId("d1e2f3a4b5c6789012345678aabbccdd"), + /// model_id=EmbeddingModelId("BAAI/bge-small-en"), + /// model_version=EmbeddingVersion("v1"), dimensions=384 + /// canonical JSON (key-sorted, compact, no whitespace): + /// {"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd", + /// "dimensions":384, + /// "kind":"embedding", + /// "model_id":"BAAI/bge-small-en", + /// "model_version":"v1"} + /// computed via: + /// printf '{"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd","dimensions":384,"kind":"embedding","model_id":"BAAI/bge-small-en","model_version":"v1"}' \ + /// | ~/.cargo/bin/b3sum --no-names | cut -c1-32 + /// → 71992c457a5da39880a6d17d646ed0fd + #[test] + fn id_for_embedding_pinned() { + let chunk = ChunkId("d1e2f3a4b5c6789012345678aabbccdd".to_string()); + let model = EmbeddingModelId("BAAI/bge-small-en".to_string()); + let version = EmbeddingVersion("v1".to_string()); + + let expected_json = b"{\"chunk_id\":\"d1e2f3a4b5c6789012345678aabbccdd\",\"dimensions\":384,\"kind\":\"embedding\",\"model_id\":\"BAAI/bge-small-en\",\"model_version\":\"v1\"}"; + let tuple = EmbeddingTuple { + kind: "embedding", + chunk_id: &chunk.0, + model_id: &model.0, + model_version: &version.0, + dimensions: 384, + }; + assert_eq!( + serde_json_canonicalizer::to_vec(&tuple).unwrap(), + expected_json + ); + + let id = id_for_embedding(&chunk, &model, &version, 384); + assert_eq!(id.0, "71992c457a5da39880a6d17d646ed0fd"); + } + + /// Independent pin for id_for_index. 
+ /// inputs: + /// collection="default", + /// embedding_model=EmbeddingModelId("BAAI/bge-small-en"), + /// dimensions=384, version=IndexVersion("v1"), + /// kind="hnsw", params_hash="xyz" + /// canonical JSON (key-sorted, compact, no whitespace): + /// {"collection":"default", + /// "dimensions":384, + /// "embedding_model":"BAAI/bge-small-en", + /// "index_kind":"hnsw", + /// "index_params_hash":"xyz", + /// "index_version":"v1", + /// "kind":"index"} + /// computed via: + /// printf '{"collection":"default","dimensions":384,"embedding_model":"BAAI/bge-small-en","index_kind":"hnsw","index_params_hash":"xyz","index_version":"v1","kind":"index"}' \ + /// | ~/.cargo/bin/b3sum --no-names | cut -c1-32 + /// → e733ee2f9936f0e1ac5143cdbf0f2b54 + #[test] + fn id_for_index_pinned() { + let model = EmbeddingModelId("BAAI/bge-small-en".to_string()); + let version = IndexVersion("v1".to_string()); + + let expected_json = b"{\"collection\":\"default\",\"dimensions\":384,\"embedding_model\":\"BAAI/bge-small-en\",\"index_kind\":\"hnsw\",\"index_params_hash\":\"xyz\",\"index_version\":\"v1\",\"kind\":\"index\"}"; + let tuple = IndexTuple { + kind: "index", + collection: "default", + embedding_model: &model.0, + dimensions: 384, + index_version: &version.0, + index_kind: "hnsw", + index_params_hash: "xyz", + }; + assert_eq!( + serde_json_canonicalizer::to_vec(&tuple).unwrap(), + expected_json + ); + + let id = id_for_index("default", &model, 384, &version, "hnsw", "xyz"); + assert_eq!(id.0, "e733ee2f9936f0e1ac5143cdbf0f2b54"); + } +} diff --git a/crates/kb-core/src/ingest.rs b/crates/kb-core/src/ingest.rs new file mode 100644 index 0000000..7636a95 --- /dev/null +++ b/crates/kb-core/src/ingest.rs @@ -0,0 +1,45 @@ +//! IngestReport + IngestItem (mirrored from wire §2.4). 
+ +use serde::{Deserialize, Serialize}; + +use crate::asset::WorkspacePath; +use crate::ids::{AssetId, DocumentId}; +use crate::traits::SourceScope; +use crate::versions::{ChunkerVersion, ParserVersion}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IngestReport { + pub scope: SourceScope, + pub scanned: u32, + pub new: u32, + pub updated: u32, + pub skipped: u32, + pub errors: u32, + pub duration_ms: u32, + /// `None` ↔ wire `items: null` (`--summary-only`). + pub items: Option>, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IngestItem { + pub kind: IngestItemKind, + pub doc_id: Option, + pub doc_path: WorkspacePath, + pub asset_id: Option, + pub byte_len: Option, + pub block_count: Option, + pub chunk_count: Option, + pub parser_version: Option, + pub chunker_version: Option, + pub warnings: Vec, + pub error: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum IngestItemKind { + New, + Updated, + Skipped, + Error, +} diff --git a/crates/kb-core/src/jobs.rs b/crates/kb-core/src/jobs.rs new file mode 100644 index 0000000..8b6231e --- /dev/null +++ b/crates/kb-core/src/jobs.rs @@ -0,0 +1,52 @@ +//! Job repo support types (§3.7a forward-decl, §7.2 JobRepo). 
+ +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::OffsetDateTime; + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum JobKind { + Ingest, + Chunk, + Embed, + Ocr, + Transcribe, + Reindex, + Doctor, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum JobStatus { + Pending, + Running, + Succeeded, + Failed, + Canceled, +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct JobId(pub String); + +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct JobFilter { + pub status: Option, + pub kind: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct JobRow { + pub job_id: JobId, + pub kind: JobKind, + pub status: JobStatus, + pub payload: Value, + pub progress: Option, + pub error: Option, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + #[serde(default, with = "time::serde::rfc3339::option")] + pub finished_at: Option, +} diff --git a/crates/kb-core/src/lib.rs b/crates/kb-core/src/lib.rs new file mode 100644 index 0000000..629d739 --- /dev/null +++ b/crates/kb-core/src/lib.rs @@ -0,0 +1,70 @@ +//! `kb-core` — frozen domain types, traits, and ID recipe. +//! +//! Per design §3, §4, §7. This crate has zero dependencies on any other +//! `kb-*` crate, so every other crate in the workspace can depend on it +//! freely. +//! +//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` for +//! the canonical type bodies — this crate is the byte-for-byte mirror. 
+ +pub mod ids; +pub mod versions; +pub mod media; +pub mod asset; +pub mod document; +pub mod chunk; +pub mod citation; +pub mod metadata; +pub mod search; +pub mod answer; +pub mod ingest; +pub mod jobs; +pub mod vector; +pub mod errors; +pub mod traits; +pub mod normalize; + +// Re-export the most commonly used items at the crate root, mirroring the +// public surface listed in the task spec. + +pub use ids::{ + AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId, + id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding, + id_for_index, id_from, +}; +pub use versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, + ParserVersion, PromptTemplateVersion, SchemaVersion, +}; +pub use media::{AudioType, Checksum, ImageType, Lang, MediaType}; +pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath}; +pub use document::{ + AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock, + HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion, + OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment, +}; +pub use chunk::Chunk; +pub use citation::Citation; +pub use metadata::{ + Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType, + TrustLevel, +}; +pub use search::{ + DocFilter, DocSummary, RetrievalDetail, SearchFilters, SearchHit, + SearchMode, SearchQuery, +}; +pub use answer::{ + Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason, + TokenUsage, TraceId, +}; +pub use ingest::{IngestItem, IngestItemKind, IngestReport}; +pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; +pub use vector::{VectorHit, VectorRecord}; +pub use errors::CoreError; +pub use traits::{ + ChunkPolicy, Chunker, DocumentStore, Embedder, EmbeddingInput, + EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason, + GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector, + SourceScope, TokenChunk, VectorStore, +}; +pub use 
normalize::{nfc, to_posix}; diff --git a/crates/kb-core/src/media.rs b/crates/kb-core/src/media.rs new file mode 100644 index 0000000..263e5cf --- /dev/null +++ b/crates/kb-core/src/media.rs @@ -0,0 +1,44 @@ +//! Media / file-type primitives (§3.3 + §3.7a). + +use serde::{Deserialize, Serialize}; + +/// Full blake3 hex (64 chars) per §3.7a. Stored as `String` for serde +/// simplicity. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct Checksum(pub String); + +/// BCP-47 / ISO-639 language tag (e.g. "ko", "en"). §3.7a. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct Lang(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ImageType { + Png, + Jpeg, + Webp, + Gif, + Tiff, + Other(String), +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum AudioType { + M4a, + Mp3, + Wav, + Flac, + Ogg, + Other(String), +} + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum MediaType { + Markdown, + Pdf, + Image(ImageType), + Audio(AudioType), + Other(String), +} diff --git a/crates/kb-core/src/metadata.rs b/crates/kb-core/src/metadata.rs new file mode 100644 index 0000000..229ee0d --- /dev/null +++ b/crates/kb-core/src/metadata.rs @@ -0,0 +1,68 @@ +//! Metadata + Provenance (§3.6). + +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use time::OffsetDateTime; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Metadata { + pub aliases: Vec, + pub tags: Vec, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + pub source_type: SourceType, + pub trust_level: TrustLevel, + pub user_id_alias: Option, + /// Frontmatter keys we don't recognise are preserved here per §0 Q9. 
+ pub user: Map, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum SourceType { + Markdown, + Note, + Paper, + Reference, + Inbox, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TrustLevel { + Primary, + Secondary, + Generated, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Provenance { + pub events: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ProvenanceEvent { + #[serde(with = "time::serde::rfc3339")] + pub at: OffsetDateTime, + pub agent: String, + pub kind: ProvenanceKind, + pub note: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ProvenanceKind { + Discovered, + Parsed, + Normalized, + Chunked, + OcrApplied, + CaptionApplied, + Transcribed, + Embedded, + Indexed, + Warning, + Error, +} diff --git a/crates/kb-core/src/normalize.rs b/crates/kb-core/src/normalize.rs new file mode 100644 index 0000000..c4c20b9 --- /dev/null +++ b/crates/kb-core/src/normalize.rs @@ -0,0 +1,104 @@ +//! Path / string normalization helpers (§4.1, §6.6). + +use std::path::{Component, Path}; + +use unicode_normalization::UnicodeNormalization; + +use crate::asset::WorkspacePath; +use crate::errors::CoreError; + +/// NFC-normalize a UTF-8 string (§4.1). +pub fn nfc(input: &str) -> String { + input.nfc().collect() +} + +/// Collapse a path to a POSIX-relative `WorkspacePath` per §6.6: +/// - convert all separators to `/` +/// - strip a leading `./` +/// - collapse repeated slashes +/// - NFC-normalize +/// +/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form +/// contains `#`, since `WorkspacePath` is forbidden from colliding with +/// the W3C-Media-Fragments separator that `Citation` URIs depend on. 
+pub fn to_posix(path: &Path) -> Result { + let mut out = String::new(); + let mut first = true; + for comp in path.components() { + match comp { + Component::CurDir => continue, + Component::Normal(s) => { + if !first { + out.push('/'); + } + out.push_str(&s.to_string_lossy()); + first = false; + } + Component::ParentDir => { + if !first { + out.push('/'); + } + out.push_str(".."); + first = false; + } + Component::RootDir => { + if first { + out.push('/'); + } + first = false; + } + Component::Prefix(_) => { + // Windows drive prefixes — `to_string_lossy` keeps form. + out.push_str(&comp.as_os_str().to_string_lossy()); + first = false; + } + } + } + if out.is_empty() { + out.push('.'); + } + WorkspacePath::new(nfc(&out)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn collapses_curdir_and_redundant_slashes() { + let p = Path::new("./a//b.md"); + // `Path::components` already collapses `//` on POSIX; the test + // doc-fixed example asserts the final string is `a/b.md`. + assert_eq!(to_posix(p).unwrap().0, "a/b.md"); + } + + #[test] + fn nfc_normalizes_korean() { + // U+1100 ㄱ + U+1161 ㅏ (NFD) vs U+AC00 가 (NFC). After NFC they + // collapse to the same string; `to_posix` runs NFC after path + // collapse, so the WorkspacePath comes out NFC regardless of input. + let nfd = "\u{1100}\u{1161}.md"; + let nfc_str = "\u{AC00}.md"; + assert_eq!( + to_posix(Path::new(nfd)).unwrap().0, + to_posix(Path::new(nfc_str)).unwrap().0 + ); + assert_eq!(to_posix(Path::new(nfd)).unwrap().0, "\u{AC00}.md"); + } + + #[test] + fn nfc_function_idempotent() { + let s = "\u{AC00}"; + assert_eq!(nfc(s), s); + } + + #[test] + fn to_posix_rejects_hash_in_path() { + // `#` collides with the W3C-Media-Fragments separator used by + // `Citation`; the WorkspacePath invariant rejects it at construction. 
+ let p = Path::new("notes/has#hash.md"); + let err = to_posix(p).expect_err("# in path must be rejected"); + let msg = format!("{err}"); + assert!(msg.contains('#'), "error message should mention '#': {msg}"); + } +} diff --git a/crates/kb-core/src/search.rs b/crates/kb-core/src/search.rs new file mode 100644 index 0000000..9621d61 --- /dev/null +++ b/crates/kb-core/src/search.rs @@ -0,0 +1,90 @@ +//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5). + +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use crate::asset::WorkspacePath; +use crate::citation::Citation; +use crate::ids::{ChunkId, DocumentId}; +use crate::media::Lang; +use crate::metadata::{SourceType, TrustLevel}; +use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion}; + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum SearchMode { + Lexical, + Vector, + Hybrid, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SearchQuery { + pub text: String, + pub mode: SearchMode, + pub k: usize, + pub filters: SearchFilters, +} + +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct SearchFilters { + pub tags_any: Vec, + pub lang: Option, + pub path_glob: Option, + pub trust_min: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SearchHit { + pub rank: u32, + pub chunk_id: ChunkId, + pub doc_id: DocumentId, + pub doc_path: WorkspacePath, + pub heading_path: Vec, + pub section_label: Option, + pub snippet: String, + pub citation: Citation, + pub retrieval: RetrievalDetail, + pub index_version: IndexVersion, + pub embedding_model: Option, + pub chunker_version: ChunkerVersion, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RetrievalDetail { + pub method: SearchMode, + pub fusion_score: f32, + pub lexical_score: Option, + pub vector_score: Option, + pub 
lexical_rank: Option, + pub vector_rank: Option, +} + +/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents). +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct DocFilter { + pub tags_any: Vec, + pub lang: Option, + pub path_glob: Option, + pub trust_min: Option, +} + +/// Internal mirror of wire `doc_summary.v1` (§2.5). +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DocSummary { + pub doc_id: DocumentId, + pub doc_path: WorkspacePath, + pub title: String, + pub lang: Lang, + pub tags: Vec, + pub trust_level: TrustLevel, + pub source_type: SourceType, + pub byte_len: u64, + pub chunk_count: u32, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + pub parser_version: ParserVersion, + pub chunker_version: ChunkerVersion, +} diff --git a/crates/kb-core/src/traits.rs b/crates/kb-core/src/traits.rs new file mode 100644 index 0000000..dcf2024 --- /dev/null +++ b/crates/kb-core/src/traits.rs @@ -0,0 +1,175 @@ +//! Component traits (§7) and their input helper types (§7.1). 
+ +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::asset::RawAsset; +use crate::chunk::Chunk; +use crate::document::{Block, CanonicalDocument}; +use crate::ids::{ChunkId, DocumentId}; +use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus}; +use crate::media::MediaType; +use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery}; +use crate::vector::{VectorHit, VectorRecord}; +use crate::versions::{ + ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion, +}; +use crate::answer::{ModelRef, TokenUsage}; + +// ── Helper input types (§7.1) ───────────────────────────────────────────── + +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct SourceScope { + pub root: PathBuf, + pub include: Vec, + pub exclude: Vec, +} + +/// Forward-declared (§3.7a) — concrete shape decided by extractors. P0 +/// keeps the option-of-config-file slot only. +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct ExtractConfig { + pub config_path: Option, +} + +/// Carries the raw asset bytes context to an `Extractor::extract` call. 
+pub struct ExtractContext<'a> { + pub asset: &'a RawAsset, + pub workspace_root: &'a Path, + pub config: &'a ExtractConfig, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ChunkPolicy { + pub target_tokens: usize, + pub overlap_tokens: usize, + pub respect_markdown_headings: bool, + pub chunker_version: ChunkerVersion, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EmbeddingKind { + Document, + Query, +} + +pub struct EmbeddingInput<'a> { + pub text: &'a str, + pub kind: EmbeddingKind, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct GenerateRequest { + pub system: String, + pub user: String, + pub stop: Vec, + pub max_tokens: usize, + pub temperature: f32, + pub seed: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "kind")] +pub enum TokenChunk { + Token(String), + Done { + finish_reason: FinishReason, + usage: TokenUsage, + }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FinishReason { + Stop, + Length, + Aborted, + Error(String), +} + +// ── Traits (§7.2) ───────────────────────────────────────────────────────── + +pub trait SourceConnector { + fn scan(&self, scope: &SourceScope) -> anyhow::Result>; +} + +pub trait Extractor: Send + Sync { + fn supports(&self, media_type: &MediaType) -> bool; + fn parser_version(&self) -> ParserVersion; + fn extract( + &self, + ctx: &ExtractContext<'_>, + bytes: &[u8], + ) -> anyhow::Result; +} + +pub trait Chunker: Send + Sync { + fn chunker_version(&self) -> ChunkerVersion; + fn policy_hash(&self, policy: &ChunkPolicy) -> String; + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result>; +} + +pub trait Embedder: Send + Sync { + fn model_id(&self) -> EmbeddingModelId; + fn model_version(&self) -> EmbeddingVersion; + fn 
dimensions(&self) -> usize; + fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result>>; +} + +pub trait Retriever: Send + Sync { + fn search(&self, query: &SearchQuery) -> anyhow::Result>; + fn index_version(&self) -> IndexVersion; +} + +pub trait LanguageModel: Send + Sync { + fn model_ref(&self) -> ModelRef; + fn context_tokens(&self) -> usize; + fn generate_stream( + &self, + req: GenerateRequest, + ) -> anyhow::Result> + Send>>; +} + +pub trait DocumentStore { + fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>; + fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>; + fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>; + fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>; + fn get_document(&self, id: &DocumentId) -> anyhow::Result>; + fn get_chunk(&self, id: &ChunkId) -> anyhow::Result>; + fn list_documents(&self, filter: &DocFilter) -> anyhow::Result>; +} + +pub trait VectorStore { + fn ensure_table( + &self, + model: &EmbeddingModelId, + dim: usize, + ) -> anyhow::Result; + fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>; + fn search( + &self, + query_vec: &[f32], + k: usize, + filters: &SearchFilters, + ) -> anyhow::Result>; +} + +pub trait JobRepo { + fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result; + fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>; + fn finish( + &self, + id: &JobId, + status: JobStatus, + error: Option<&str>, + ) -> anyhow::Result<()>; + fn list(&self, filter: &JobFilter) -> anyhow::Result>; +} diff --git a/crates/kb-core/src/vector.rs b/crates/kb-core/src/vector.rs new file mode 100644 index 0000000..e17ab7d --- /dev/null +++ b/crates/kb-core/src/vector.rs @@ -0,0 +1,27 @@ +//! Vector store records (§7.2 VectorStore). 
+ +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::ids::{ChunkId, DocumentId, EmbeddingId}; +use crate::versions::{EmbeddingModelId, EmbeddingVersion}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct VectorRecord { + pub chunk_id: ChunkId, + pub embedding_id: EmbeddingId, + pub vector: Vec, + pub doc_id: DocumentId, + pub text: String, + pub heading_path: Vec, + pub model_id: EmbeddingModelId, + pub model_version: EmbeddingVersion, + pub dimensions: usize, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct VectorHit { + pub chunk_id: ChunkId, + pub score: f32, + pub payload: Value, +} diff --git a/crates/kb-core/src/versions.rs b/crates/kb-core/src/versions.rs new file mode 100644 index 0000000..beda08e --- /dev/null +++ b/crates/kb-core/src/versions.rs @@ -0,0 +1,27 @@ +//! Version / label newtypes (§3.2). + +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct ParserVersion(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct ChunkerVersion(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct EmbeddingModelId(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct EmbeddingVersion(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct IndexVersion(pub String); + +#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct PromptTemplateVersion(pub String); + +/// Wire schema version label (`"answer.v1"`, `"search_hit.v1"`, …). +/// Carried as a `&'static str` because every wire type pins its label at +/// compile time. 
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +pub struct SchemaVersion(pub &'static str); diff --git a/crates/kb-parse-types/Cargo.toml b/crates/kb-parse-types/Cargo.toml new file mode 100644 index 0000000..6f79453 --- /dev/null +++ b/crates/kb-parse-types/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "kb-parse-types" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Parser intermediate representations (no parser libs allowed)" + +[dependencies] +kb-core = { path = "../kb-core" } +serde = { workspace = true } diff --git a/crates/kb-parse-types/src/lib.rs b/crates/kb-parse-types/src/lib.rs new file mode 100644 index 0000000..e09016f --- /dev/null +++ b/crates/kb-parse-types/src/lib.rs @@ -0,0 +1,98 @@ +//! `kb-parse-types` — parser intermediate representations (§3.7b). +//! +//! Depends ONLY on `kb-core`. Must NOT depend on any parser library +//! (`pulldown-cmark`, `pdf-extract`, `image`, `whisper-rs`, …) and must +//! NOT depend on any other `kb-*` crate. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ParsedBlock { + pub kind: ParsedBlockKind, + pub heading_path: Vec, + pub source_span: kb_core::SourceSpan, + pub payload: ParsedPayload, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ParsedBlockKind { + Heading, + Paragraph, + List, + Code, + Table, + Quote, + ImageRef, + AudioRef, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase", tag = "kind")] +pub enum ParsedPayload { + Heading { + level: u8, + text: String, + }, + Paragraph { + text: String, + inlines: Vec, + }, + List { + ordered: bool, + items: Vec>, + }, + Code { + lang: Option, + code: String, + }, + Table { + headers: Vec, + rows: Vec>, + }, + Quote { + text: String, + inlines: Vec, + }, + ImageRef { + src: String, + alt: String, + }, + /// `duration_ms` is filled in by the extractor before chunking — see + /// design §3.7b. + AudioRef { + src: String, + }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct Warning { + pub kind: WarningKind, + pub note: String, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum WarningKind { + MalformedFrontmatter, + MalformedTable, + EncodingFallback, + ExtractFailed, +} + +// Forward-declared (P6/P7/P8). Bodies stay minimal for now. 
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct ParsedImageRegion; + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ParsedPdfPage { + pub page: u32, + pub text: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ParsedAudioSegment { + pub start_ms: u64, + pub end_ms: u64, + pub text: String, +} diff --git a/docs/spec/ai-generation-guidelines.md b/docs/spec/ai-generation-guidelines.md new file mode 100644 index 0000000..b0040fd --- /dev/null +++ b/docs/spec/ai-generation-guidelines.md @@ -0,0 +1,12 @@ +# AI generation guidelines + +When implementing tasks against this codebase: + +- Treat the frozen design doc as the single source of truth. Do not invent + new fields, traits, or enum variants. +- Prefer editing existing files to creating new ones; reuse types from + `kb-core` instead of duplicating shapes. +- For each task, follow the task spec under `tasks/p/p-.md`. + +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §11 + §12. diff --git a/docs/spec/canonical-document.md b/docs/spec/canonical-document.md new file mode 100644 index 0000000..0926fc2 --- /dev/null +++ b/docs/spec/canonical-document.md @@ -0,0 +1,7 @@ +# CanonicalDocument + +Medium-agnostic representation of a document with `Block`s, `SourceSpan`s, +and provenance. + +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.4 + §3.7a. diff --git a/docs/spec/chunk-policy.md b/docs/spec/chunk-policy.md new file mode 100644 index 0000000..fe204fc --- /dev/null +++ b/docs/spec/chunk-policy.md @@ -0,0 +1,8 @@ +# Chunk policy + +`ChunkPolicy` carries `target_tokens`, `overlap_tokens`, +`respect_markdown_headings`, and `chunker_version`. Chunkers expose a +`policy_hash` so chunk IDs include the policy. 
+ +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.5 + §7.1 + §7.2. diff --git a/docs/spec/citation-policy.md b/docs/spec/citation-policy.md new file mode 100644 index 0000000..3ef0f82 --- /dev/null +++ b/docs/spec/citation-policy.md @@ -0,0 +1,7 @@ +# Citation policy + +Citations use W3C Media Fragments URIs to locate evidence inside a +document. Five variants: `Line`, `Page`, `Region`, `Caption`, `Time`. + +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.5 + §0 Q3. diff --git a/docs/spec/domain-model.md b/docs/spec/domain-model.md new file mode 100644 index 0000000..98ef0ec --- /dev/null +++ b/docs/spec/domain-model.md @@ -0,0 +1,6 @@ +# Domain model + +The domain types live in `kb-core` and mirror the frozen design exactly. + +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3. diff --git a/docs/spec/ids.md b/docs/spec/ids.md new file mode 100644 index 0000000..19bbc15 --- /dev/null +++ b/docs/spec/ids.md @@ -0,0 +1,6 @@ +# ID recipe + +All `kb-*` IDs are 32 hex chars: the first 32 of `blake3(canonical_json(tuple))`. + +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §4. diff --git a/docs/spec/module-boundaries.md b/docs/spec/module-boundaries.md new file mode 100644 index 0000000..7225608 --- /dev/null +++ b/docs/spec/module-boundaries.md @@ -0,0 +1,8 @@ +# Module boundaries + +`kb-core` is leaf — every other crate depends on it. Parsers depend on +`kb-parse-types` (not on `kb-normalize`); `kb-normalize` depends on +`kb-parse-types` (not on parsers). UI crates depend only on `kb-app`. 
+ +Canonical source: +[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §8. diff --git a/docs/wire-schema/v1/answer.schema.json b/docs/wire-schema/v1/answer.schema.json new file mode 100644 index 0000000..7428cf1 --- /dev/null +++ b/docs/wire-schema/v1/answer.schema.json @@ -0,0 +1,31 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/answer.schema.json", + "title": "Answer v1", + "description": "Stub schema — declares the schema_version label and the required fields per design §2.3.", + "type": "object", + "required": [ + "schema_version", + "answer", + "citations", + "grounded", + "model", + "prompt_template_version", + "retrieval", + "usage", + "created_at" + ], + "properties": { + "schema_version": { "const": "answer.v1" }, + "answer": { "type": "string" }, + "citations": { "type": "array" }, + "grounded": { "type": "boolean" }, + "refusal_reason": { "type": ["string", "null"] }, + "model": { "type": "object" }, + "embedding": { "type": ["object", "null"] }, + "prompt_template_version": { "type": "string" }, + "retrieval": { "type": "object" }, + "usage": { "type": "object" }, + "created_at": { "type": "string" } + } +} diff --git a/docs/wire-schema/v1/chunk_inspection.schema.json b/docs/wire-schema/v1/chunk_inspection.schema.json new file mode 100644 index 0000000..0771e22 --- /dev/null +++ b/docs/wire-schema/v1/chunk_inspection.schema.json @@ -0,0 +1,32 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/chunk_inspection.schema.json", + "title": "ChunkInspection v1", + "description": "Stub schema — declares the schema_version label and the required fields per design §2.6.", + "type": "object", + "required": [ + "schema_version", + "chunk_id", + "doc_id", + "doc_path", + "heading_path", + "text", + "source_spans", + "block_ids", + "token_estimate", + "chunker_version" + ], + "properties": { + 
"schema_version": { "const": "chunk_inspection.v1" }, + "chunk_id": { "type": "string" }, + "doc_id": { "type": "string" }, + "doc_path": { "type": "string" }, + "heading_path": { "type": "array", "items": { "type": "string" } }, + "text": { "type": "string" }, + "source_spans": { "type": "array" }, + "block_ids": { "type": "array", "items": { "type": "string" } }, + "token_estimate": { "type": "integer", "minimum": 0 }, + "chunker_version": { "type": "string" }, + "embeddings": { "type": "array" } + } +} diff --git a/docs/wire-schema/v1/citation.schema.json b/docs/wire-schema/v1/citation.schema.json new file mode 100644 index 0000000..90ebe0f --- /dev/null +++ b/docs/wire-schema/v1/citation.schema.json @@ -0,0 +1,19 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/citation.schema.json", + "title": "Citation v1", + "description": "Stub schema — declares the schema_version label and the always-present fields. Variant-discriminated property validation lands in a later phase.", + "type": "object", + "required": ["schema_version", "kind", "path", "uri"], + "properties": { + "schema_version": { "const": "citation.v1" }, + "kind": { "enum": ["line", "page", "region", "caption", "time"] }, + "path": { "type": "string" }, + "uri": { "type": "string" }, + "line": { "type": "object" }, + "page": { "type": "object" }, + "region": { "type": "object" }, + "caption": { "type": "object" }, + "time": { "type": "object" } + } +} diff --git a/docs/wire-schema/v1/doc_summary.schema.json b/docs/wire-schema/v1/doc_summary.schema.json new file mode 100644 index 0000000..d757e8f --- /dev/null +++ b/docs/wire-schema/v1/doc_summary.schema.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/doc_summary.schema.json", + "title": "DocSummary v1", + "description": "Stub schema — declares the schema_version label and the required fields per design §2.5.", + "type": "object", 
+ "required": [ + "schema_version", + "doc_id", + "doc_path", + "title", + "lang", + "tags", + "trust_level", + "source_type", + "byte_len", + "chunk_count", + "created_at", + "updated_at", + "parser_version", + "chunker_version" + ], + "properties": { + "schema_version": { "const": "doc_summary.v1" }, + "doc_id": { "type": "string" }, + "doc_path": { "type": "string" }, + "title": { "type": "string" }, + "lang": { "type": "string" }, + "tags": { "type": "array", "items": { "type": "string" } }, + "trust_level": { "type": "string" }, + "source_type": { "type": "string" }, + "byte_len": { "type": "integer", "minimum": 0 }, + "chunk_count": { "type": "integer", "minimum": 0 }, + "created_at": { "type": "string" }, + "updated_at": { "type": "string" }, + "parser_version": { "type": "string" }, + "chunker_version": { "type": "string" } + } +} diff --git a/docs/wire-schema/v1/doctor.schema.json b/docs/wire-schema/v1/doctor.schema.json new file mode 100644 index 0000000..212c165 --- /dev/null +++ b/docs/wire-schema/v1/doctor.schema.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/doctor.schema.json", + "title": "DoctorReport v1", + "description": "Stub schema — declares the schema_version label and the required fields per design §2.7.", + "type": "object", + "required": ["schema_version", "ok", "checks"], + "properties": { + "schema_version": { "const": "doctor.v1" }, + "ok": { "type": "boolean" }, + "checks": { + "type": "array", + "items": { + "type": "object", + "required": ["name", "ok", "detail"], + "properties": { + "name": { "type": "string" }, + "ok": { "type": "boolean" }, + "detail": { "type": "string" }, + "hint": { "type": ["string", "null"] } + } + } + } + } +} diff --git a/docs/wire-schema/v1/ingest_report.schema.json b/docs/wire-schema/v1/ingest_report.schema.json new file mode 100644 index 0000000..be25ad0 --- /dev/null +++ b/docs/wire-schema/v1/ingest_report.schema.json @@ -0,0 +1,28 
@@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/ingest_report.schema.json", + "title": "IngestReport v1", + "description": "Stub schema — declares the schema_version label and the required fields per design §2.4.", + "type": "object", + "required": [ + "schema_version", + "scope", + "scanned", + "new", + "updated", + "skipped", + "errors", + "duration_ms" + ], + "properties": { + "schema_version": { "const": "ingest_report.v1" }, + "scope": { "type": "object" }, + "scanned": { "type": "integer", "minimum": 0 }, + "new": { "type": "integer", "minimum": 0 }, + "updated": { "type": "integer", "minimum": 0 }, + "skipped": { "type": "integer", "minimum": 0 }, + "errors": { "type": "integer", "minimum": 0 }, + "duration_ms": { "type": "integer", "minimum": 0 }, + "items": { "type": ["array", "null"] } + } +} diff --git a/docs/wire-schema/v1/search_hit.schema.json b/docs/wire-schema/v1/search_hit.schema.json new file mode 100644 index 0000000..01b8a96 --- /dev/null +++ b/docs/wire-schema/v1/search_hit.schema.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kb.local/wire/v1/search_hit.schema.json", + "title": "SearchHit v1", + "description": "Stub schema — declares the schema_version label and the required top-level fields per design §2.2.", + "type": "object", + "required": [ + "schema_version", + "rank", + "score", + "chunk_id", + "doc_id", + "doc_path", + "heading_path", + "snippet", + "citation", + "retrieval", + "index_version", + "chunker_version" + ], + "properties": { + "schema_version": { "const": "search_hit.v1" }, + "rank": { "type": "integer", "minimum": 1 }, + "score": { "type": "number" }, + "chunk_id": { "type": "string" }, + "doc_id": { "type": "string" }, + "doc_path": { "type": "string" }, + "heading_path": { "type": "array", "items": { "type": "string" } }, + "section_label": { "type": ["string", "null"] }, + "snippet": { "type": "string" }, + 
"snippet_full_text": { "type": "boolean" }, + "citation": { "type": "object" }, + "retrieval": { "type": "object" }, + "index_version": { "type": "string" }, + "embedding_model": { "type": ["string", "null"] }, + "chunker_version": { "type": "string" } + } +} diff --git a/fixtures/audio/.gitkeep b/fixtures/audio/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/embed/.gitkeep b/fixtures/embed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/eval/.gitkeep b/fixtures/eval/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/image/.gitkeep b/fixtures/image/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/markdown/code-and-table.md b/fixtures/markdown/code-and-table.md new file mode 100644 index 0000000..6a4a59f --- /dev/null +++ b/fixtures/markdown/code-and-table.md @@ -0,0 +1,12 @@ +# Code And Table + +```rust +fn main() { + println!("hi"); +} +``` + +| col a | col b | +|-------|-------| +| 1 | 2 | +| 3 | 4 | diff --git a/fixtures/markdown/nested-headings.md b/fixtures/markdown/nested-headings.md new file mode 100644 index 0000000..43f9d72 --- /dev/null +++ b/fixtures/markdown/nested-headings.md @@ -0,0 +1,15 @@ +# Top + +intro + +## Section A + +body of A + +### Sub A.1 + +deeper + +## Section B + +body of B diff --git a/fixtures/markdown/simple-note.md b/fixtures/markdown/simple-note.md new file mode 100644 index 0000000..72bc4bf --- /dev/null +++ b/fixtures/markdown/simple-note.md @@ -0,0 +1,3 @@ +# Simple Note + +A short paragraph. 
diff --git a/fixtures/pdf/.gitkeep b/fixtures/pdf/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/rag/.gitkeep b/fixtures/rag/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/search/hybrid/.gitkeep b/fixtures/search/hybrid/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/search/lexical/.gitkeep b/fixtures/search/lexical/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/source-fs/.gitkeep b/fixtures/source-fs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/vector/.gitkeep b/fixtures/vector/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/migrations/V001__init.sql b/migrations/V001__init.sql new file mode 100644 index 0000000..2db2d5e --- /dev/null +++ b/migrations/V001__init.sql @@ -0,0 +1,15 @@ +-- V001__init.sql — schema bootstrap. +-- Per design §5.1 + §5.9. Only the meta + migrations tables land here; +-- data tables (assets, documents, blocks, chunks, fts5, …) ship in later +-- phase-specific migrations (P1-6 / P2-1 / P3-3). + +CREATE TABLE schema_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL +); + +CREATE TABLE migrations ( + id INTEGER PRIMARY KEY, + applied_at TEXT NOT NULL, + description TEXT NOT NULL +);