feat(p0-1): workspace skeleton + frozen contracts #5
3
.gitignore
vendored
@@ -1 +1,4 @@
|
||||
.superpowers/
|
||||
/target/
|
||||
**/*.rs.bk
|
||||
Cargo.lock.bak
|
||||
|
||||
934
Cargo.lock
generated
Normal file
@@ -0,0 +1,934 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
"anstyle-query",
|
||||
"anstyle-wincon",
|
||||
"colorchoice",
|
||||
"is_terminal_polyfill",
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
|
||||
dependencies = [
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"once_cell_polyfill",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.102"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
||||
|
||||
[[package]]
|
||||
name = "arrayref"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||
|
||||
[[package]]
|
||||
name = "blake3"
|
||||
version = "1.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"arrayvec",
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"constant_time_eq",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.61"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
||||
|
||||
[[package]]
|
||||
name = "constant_time_eq"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs"
|
||||
version = "5.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
|
||||
dependencies = [
|
||||
"dirs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"option-ext",
|
||||
"redox_users",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
||||
|
||||
[[package]]
|
||||
name = "kb-app"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-cli"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"kb-app",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-config"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs",
|
||||
"kb-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-core"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_json_canonicalizer",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-types"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"kb-core",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.186"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||
|
||||
[[package]]
|
||||
name = "libredox"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell_polyfill"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"libredox",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||
|
||||
[[package]]
|
||||
name = "ryu-js"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.149"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"serde",
|
||||
"serde_core",
|
||||
"zmij",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json_canonicalizer"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe52319a927259afbfa5180c5157cd8167edfd3e8c254f9558c7fef44c5649f2"
|
||||
dependencies = [
|
||||
"ryu-js",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.15.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "symlink"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.117"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||
dependencies = [
|
||||
"thiserror-impl 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
|
||||
dependencies = [
|
||||
"thiserror-impl 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde_core",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
|
||||
dependencies = [
|
||||
"tinyvec_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec_macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_write",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_write"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-appender"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"symlink",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
||||
dependencies = [
|
||||
"log",
|
||||
"once_cell",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
version = "0.1.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
|
||||
dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.1+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.61.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zmij"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||
26
Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[workspace]
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/kb-core",
|
||||
"crates/kb-parse-types",
|
||||
"crates/kb-config",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kb"
|
||||
version = "0.1.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
thiserror = "2"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
|
||||
uuid = { version = "1", features = ["v7", "serde"] }
|
||||
blake3 = "1"
|
||||
tracing = "0.1"
|
||||
20
crates/kb-app/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "kb-app"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Facade — orchestrates components for kb-cli/tui/desktop"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json"] }
|
||||
tracing-appender = "0.2"
|
||||
toml = "0.8"
|
||||
dirs = "5"
|
||||
39
crates/kb-app/src/doctor_signal.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
//! Signal types used by `kb-cli`'s `exit_code` mapping (§10).
|
||||
//!
|
||||
//! These are *not* errors per se: a doctor failure is normal output, just
|
||||
//! signalled out-of-band so the CLI can exit with the right status.
|
||||
|
||||
use std::fmt;
|
||||
|
||||
/// Signal that `doctor` completed but at least one check failed.
///
/// `kb-cli` returns this from its doctor handler and maps it to exit
/// status 3 in `exit_code`.
#[derive(Debug)]
pub struct DoctorUnhealthy;

impl fmt::Display for DoctorUnhealthy {
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(out, "doctor unhealthy")
    }
}

// Implemented so the signal can travel inside an `anyhow::Error`.
impl std::error::Error for DoctorUnhealthy {}
|
||||
|
||||
/// Signal that an answer was produced but was not grounded (a refusal).
///
/// `kb-cli` raises this after printing the answer and maps it to exit
/// status 1 in `exit_code`.
#[derive(Debug)]
pub struct RefusalSignal;

impl fmt::Display for RefusalSignal {
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(out, "refusal")
    }
}

// Implemented so the signal can travel inside an `anyhow::Error`.
impl std::error::Error for RefusalSignal {}
|
||||
|
||||
/// Signal for a "no hit" outcome.
///
/// `kb-cli` maps it to exit status 1 in `exit_code`; nothing in the P0
/// code raises it yet.
#[derive(Debug)]
pub struct NoHitSignal;

impl fmt::Display for NoHitSignal {
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(out, "no hit")
    }
}

// Implemented so the signal can travel inside an `anyhow::Error`.
impl std::error::Error for NoHitSignal {}
|
||||
186
crates/kb-app/src/lib.rs
Normal file
@@ -0,0 +1,186 @@
|
||||
//! `kb-app` — facade that downstream `kb-cli` / `kb-tui` / `kb-desktop`
|
||||
//! depend on (§7, §8).
|
||||
//!
|
||||
//! P0 implementations stub out — the signatures are frozen so that later
|
||||
//! phases swap in real bodies without breaking call sites.
|
||||
//!
|
||||
//! ## Wire-schema convention
|
||||
//!
|
||||
//! `kb-app` returns pure domain types (`IngestReport`, `DocSummary`,
|
||||
//! `Chunk`, `SearchHit`, `Answer`, …) re-exported from `kb-core`. These do
|
||||
//! NOT carry a `schema_version` field. The CLI (`kb-cli/src/wire.rs`) is
|
||||
//! responsible for wrapping each Ok-path return value with the matching
|
||||
//! `*.v1` envelope before emitting JSON on stdout in `--json` mode. The
|
||||
//! sole exception is [`DoctorReport`], whose `schema_version` is part of
|
||||
//! the struct because the doctor wire object IS its own structured
|
||||
//! surface (no domain-side equivalent in `kb-core`). When adding a new
|
||||
//! facade function in a later phase, remember: keep the return type pure,
|
||||
//! and add a matching `wire_*` helper in `kb-cli/src/wire.rs`.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::bail;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use kb_core::{
|
||||
Answer, CanonicalDocument, Chunk, ChunkId, DocFilter, DocSummary, DocumentId,
|
||||
IngestReport, SearchHit, SearchMode, SearchQuery, SourceScope,
|
||||
};
|
||||
|
||||
pub mod doctor_signal;
|
||||
pub mod logging;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AskOpts {
|
||||
pub k: usize,
|
||||
pub explain: bool,
|
||||
pub mode: SearchMode,
|
||||
pub temperature: Option<f32>,
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DoctorReport {
|
||||
/// Wire schema version label (`"doctor.v1"`).
|
||||
pub schema_version: String,
|
||||
pub ok: bool,
|
||||
pub checks: Vec<DoctorCheck>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DoctorCheck {
|
||||
pub name: String,
|
||||
pub ok: bool,
|
||||
pub detail: String,
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// Create XDG dirs and write a starter `config.toml`. Idempotent unless
|
||||
/// `force=true` (which overwrites an existing config).
|
||||
pub fn init_workspace(force: bool) -> anyhow::Result<()> {
|
||||
let cfg_path = kb_config::Config::xdg_config_path();
|
||||
let data_dir = kb_config::Config::xdg_data_dir();
|
||||
let cache_dir = kb_config::Config::xdg_cache_dir();
|
||||
let state_dir = kb_config::Config::xdg_state_dir();
|
||||
|
||||
for d in [
|
||||
cfg_path.parent().map(PathBuf::from).unwrap_or_default(),
|
||||
data_dir.clone(),
|
||||
cache_dir,
|
||||
state_dir.clone(),
|
||||
state_dir.join("logs"),
|
||||
] {
|
||||
if !d.as_os_str().is_empty() {
|
||||
std::fs::create_dir_all(&d)?;
|
||||
}
|
||||
}
|
||||
|
||||
let workspace_root = expand_tilde(&kb_config::Config::defaults().workspace.root);
|
||||
std::fs::create_dir_all(&workspace_root)?;
|
||||
|
||||
if !cfg_path.exists() || force {
|
||||
let cfg = kb_config::Config::defaults();
|
||||
let toml_text = toml::to_string_pretty(&cfg)?;
|
||||
std::fs::write(&cfg_path, toml_text)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn expand_tilde(s: &str) -> PathBuf {
|
||||
if let Some(rest) = s.strip_prefix("~/") {
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home.join(rest);
|
||||
}
|
||||
}
|
||||
if s == "~" {
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home;
|
||||
}
|
||||
}
|
||||
PathBuf::from(s)
|
||||
}
|
||||
|
||||
pub fn ingest(_scope: SourceScope, _summary_only: bool) -> anyhow::Result<IngestReport> {
|
||||
bail!("not yet wired (P1-2)")
|
||||
}
|
||||
|
||||
pub fn list_docs(_filter: DocFilter) -> anyhow::Result<Vec<DocSummary>> {
|
||||
bail!("not yet wired (P1-5)")
|
||||
}
|
||||
|
||||
pub fn inspect_doc(_id: &DocumentId) -> anyhow::Result<CanonicalDocument> {
|
||||
bail!("not yet wired (P1-5)")
|
||||
}
|
||||
|
||||
pub fn inspect_chunk(_id: &ChunkId) -> anyhow::Result<Chunk> {
|
||||
bail!("not yet wired (P1-5)")
|
||||
}
|
||||
|
||||
pub fn search(_query: SearchQuery) -> anyhow::Result<Vec<SearchHit>> {
|
||||
bail!("not yet wired (P3-1/P4-1)")
|
||||
}
|
||||
|
||||
pub fn ask(_query: &str, _opts: AskOpts) -> anyhow::Result<Answer> {
|
||||
bail!("not yet wired (P5-1)")
|
||||
}
|
||||
|
||||
/// Run the doctor checks. P0 emits `config_loaded` + `data_dir_writable`
|
||||
/// (downstream checks land in later phases).
|
||||
pub fn doctor() -> anyhow::Result<DoctorReport> {
|
||||
tracing::debug!("doctor() invoked");
|
||||
let mut checks = Vec::new();
|
||||
|
||||
// config_loaded — defaults always load; from-file is best-effort.
|
||||
let cfg_path = kb_config::Config::xdg_config_path();
|
||||
let (config_ok, config_detail) = if cfg_path.exists() {
|
||||
match kb_config::Config::from_file(&cfg_path) {
|
||||
Ok(_) => (true, cfg_path.display().to_string()),
|
||||
Err(e) => (false, format!("{} ({e})", cfg_path.display())),
|
||||
}
|
||||
} else {
|
||||
// Defaults are always loadable; report the path that would be read.
|
||||
(true, format!("{} (defaults)", cfg_path.display()))
|
||||
};
|
||||
checks.push(DoctorCheck {
|
||||
name: "config_loaded".to_string(),
|
||||
ok: config_ok,
|
||||
detail: config_detail,
|
||||
hint: if config_ok {
|
||||
None
|
||||
} else {
|
||||
Some("run `kb init` to seed config".to_string())
|
||||
},
|
||||
});
|
||||
|
||||
// data_dir_writable — try to create the dir and write a probe file.
|
||||
let data_dir = kb_config::Config::xdg_data_dir();
|
||||
let writable = (|| -> anyhow::Result<()> {
|
||||
std::fs::create_dir_all(&data_dir)?;
|
||||
let probe = data_dir.join(".kb-doctor-probe");
|
||||
std::fs::write(&probe, b"ok")?;
|
||||
std::fs::remove_file(&probe).ok();
|
||||
Ok(())
|
||||
})();
|
||||
let (data_ok, data_detail, data_hint) = match writable {
|
||||
Ok(()) => (true, data_dir.display().to_string(), None),
|
||||
Err(e) => (
|
||||
false,
|
||||
format!("{} ({e})", data_dir.display()),
|
||||
Some("ensure XDG_DATA_HOME is writable".to_string()),
|
||||
),
|
||||
};
|
||||
checks.push(DoctorCheck {
|
||||
name: "data_dir_writable".to_string(),
|
||||
ok: data_ok,
|
||||
detail: data_detail,
|
||||
hint: data_hint,
|
||||
});
|
||||
|
||||
let ok = checks.iter().all(|c| c.ok);
|
||||
Ok(DoctorReport {
|
||||
schema_version: "doctor.v1".to_string(),
|
||||
ok,
|
||||
checks,
|
||||
})
|
||||
}
|
||||
43
crates/kb-app/src/logging.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! Tracing initialization helper for `kb-cli`.
|
||||
//!
|
||||
//! Daily-rolling file appender at `~/.local/state/kb/logs/` per task spec.
|
||||
//! Returns a `WorkerGuard` that the caller must keep alive until program
|
||||
//! exit (so buffered log lines flush).
|
||||
|
||||
use anyhow::Result;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{EnvFilter, fmt, prelude::*};
|
||||
|
||||
/// Verbosity requested by the caller of [`init`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogLevel {
    /// Filter taken from the default environment variable via
    /// `EnvFilter::try_from_default_env`, falling back to `warn`.
    Default,
    /// Fixed `info` filter.
    Verbose,
    /// Fixed `debug` filter.
    Debug,
}
|
||||
|
||||
/// Initialize tracing. Returns a guard to keep alive until exit. Idempotent
|
||||
/// — a second call is a no-op (the second `try_init` is dropped silently
|
||||
/// but the guard is still returned so the caller can keep it alive).
|
||||
|
|
||||
pub fn init(level: LogLevel) -> Result<WorkerGuard> {
|
||||
let log_dir = kb_config::Config::xdg_state_dir().join("logs");
|
||||
std::fs::create_dir_all(&log_dir)?;
|
||||
|
||||
let file_appender = tracing_appender::rolling::daily(&log_dir, "kb.log");
|
||||
let (nb, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let env_filter = match level {
|
||||
LogLevel::Default => EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("warn")),
|
||||
LogLevel::Verbose => EnvFilter::new("info"),
|
||||
LogLevel::Debug => EnvFilter::new("debug"),
|
||||
};
|
||||
|
||||
let registry = tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(fmt::layer().with_writer(nb).with_ansi(false));
|
||||
|
||||
// `try_init` rather than `init` so a second call (e.g. in tests) is a
|
||||
// no-op.
|
||||
let _ = registry.try_init();
|
||||
|
||||
Ok(guard)
|
||||
}
|
||||
20
crates/kb-cli/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "kb-cli"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "kb command-line interface"
|
||||
|
||||
[[bin]]
|
||||
name = "kb"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
kb-app = { path = "../kb-app" }
|
||||
anyhow = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
355
crates/kb-cli/src/main.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
//! `kb` — command-line interface. Each subcommand maps 1:1 to a `kb-app`
|
||||
//! function. Exit codes per design §10.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::process::ExitCode;
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
|
||||
use kb_app::doctor_signal::{DoctorUnhealthy, NoHitSignal, RefusalSignal};
|
||||
|
||||
mod wire;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "kb", version, about = "personal local knowledge base")]
|
||||
struct Cli {
|
||||
/// Path to a non-default `config.toml`.
|
||||
#[arg(long, global = true)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Show anyhow chain on errors.
|
||||
#[arg(long, global = true)]
|
||||
verbose: bool,
|
||||
|
||||
/// Show tracing target/level on errors.
|
||||
#[arg(long, global = true)]
|
||||
debug: bool,
|
||||
|
||||
/// Emit machine-readable wire JSON (`*.v1`).
|
||||
#[arg(long, global = true)]
|
||||
json: bool,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Cmd,
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Cmd {
|
||||
/// Initialise XDG dirs + workspace + `config.toml`.
|
||||
Init {
|
||||
/// Overwrite an existing `config.toml`.
|
||||
#[arg(long)]
|
||||
force: bool,
|
||||
},
|
||||
|
||||
/// Scan the workspace and ingest new/updated documents.
|
||||
Ingest {
|
||||
/// Workspace root override.
|
||||
#[arg(long)]
|
||||
root: Option<PathBuf>,
|
||||
|
||||
/// Suppress the per-file `items` list.
|
||||
#[arg(long)]
|
||||
summary_only: bool,
|
||||
},
|
||||
|
||||
/// Listing subcommands.
|
||||
List {
|
||||
#[command(subcommand)]
|
||||
what: ListWhat,
|
||||
},
|
||||
|
||||
/// Inspect documents or chunks by ID.
|
||||
Inspect {
|
||||
#[command(subcommand)]
|
||||
what: InspectWhat,
|
||||
},
|
||||
|
||||
/// Lexical / vector / hybrid search over chunks.
|
||||
Search {
|
||||
query: String,
|
||||
|
||||
#[arg(long, default_value_t = 10)]
|
||||
k: usize,
|
||||
|
||||
#[arg(long, value_enum, default_value_t = ModeFlag::Hybrid)]
|
||||
mode: ModeFlag,
|
||||
|
||||
#[arg(long)]
|
||||
explain: bool,
|
||||
},
|
||||
|
||||
/// Retrieval-augmented question answering.
|
||||
Ask {
|
||||
query: String,
|
||||
|
||||
#[arg(long, default_value_t = 8)]
|
||||
k: usize,
|
||||
|
||||
#[arg(long, value_enum, default_value_t = ModeFlag::Hybrid)]
|
||||
mode: ModeFlag,
|
||||
|
||||
#[arg(long)]
|
||||
explain: bool,
|
||||
|
||||
#[arg(long)]
|
||||
temperature: Option<f32>,
|
||||
|
||||
#[arg(long)]
|
||||
seed: Option<u64>,
|
||||
},
|
||||
|
||||
/// Health check.
|
||||
Doctor,
|
||||
|
||||
/// Eval suite (placeholder; lands in P9).
|
||||
Eval {
|
||||
#[command(subcommand)]
|
||||
what: EvalWhat,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum ListWhat {
|
||||
/// List documents currently indexed.
|
||||
Docs,
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum InspectWhat {
|
||||
/// Inspect a single document by ID.
|
||||
Doc { id: String },
|
||||
/// Inspect a single chunk by ID.
|
||||
Chunk { id: String },
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum EvalWhat {
|
||||
/// Run an eval suite (placeholder for P9).
|
||||
Run {
|
||||
#[arg(long)]
|
||||
suite: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
|
||||
enum ModeFlag {
|
||||
Lexical,
|
||||
Vector,
|
||||
Hybrid,
|
||||
}
|
||||
|
||||
impl From<ModeFlag> for kb_core::SearchMode {
|
||||
fn from(m: ModeFlag) -> Self {
|
||||
match m {
|
||||
ModeFlag::Lexical => kb_core::SearchMode::Lexical,
|
||||
ModeFlag::Vector => kb_core::SearchMode::Vector,
|
||||
ModeFlag::Hybrid => kb_core::SearchMode::Hybrid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> ExitCode {
|
||||
let cli = Cli::parse();
|
||||
let level = if cli.debug {
|
||||
kb_app::logging::LogLevel::Debug
|
||||
} else if cli.verbose {
|
||||
kb_app::logging::LogLevel::Verbose
|
||||
} else {
|
||||
kb_app::logging::LogLevel::Default
|
||||
};
|
||||
// Fail-soft: if logging init errors (e.g. XDG state dir is read-only),
|
||||
// proceed without a guard rather than crashing — `kb` is still usable.
|
||||
let _log_guard = kb_app::logging::init(level).ok();
|
||||
match run(&cli) {
|
||||
Ok(()) => ExitCode::from(0),
|
||||
Err(e) => {
|
||||
let code = exit_code(&e);
|
||||
// Refusals at exit code 1 print to stdout (already done by the
|
||||
// caller); errors go to stderr.
|
||||
if code != 1 {
|
||||
eprintln!("error: {e}");
|
||||
if cli.verbose {
|
||||
for cause in e.chain().skip(1) {
|
||||
eprintln!(" caused by: {cause}");
|
||||
}
|
||||
}
|
||||
}
|
||||
ExitCode::from(code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn exit_code(err: &anyhow::Error) -> u8 {
|
||||
if err.downcast_ref::<RefusalSignal>().is_some() {
|
||||
return 1;
|
||||
}
|
||||
if err.downcast_ref::<NoHitSignal>().is_some() {
|
||||
return 1;
|
||||
}
|
||||
if err.downcast_ref::<DoctorUnhealthy>().is_some() {
|
||||
return 3;
|
||||
}
|
||||
2
|
||||
}
|
||||
|
||||
fn run(cli: &Cli) -> anyhow::Result<()> {
|
||||
match &cli.command {
|
||||
Cmd::Init { force } => {
|
||||
kb_app::init_workspace(*force)?;
|
||||
if !cli.json {
|
||||
println!(
|
||||
"created {}",
|
||||
kb_config::Config::xdg_config_path().display()
|
||||
);
|
||||
println!("created {}", kb_config::Config::xdg_data_dir().display());
|
||||
println!("created {}", kb_config::Config::xdg_state_dir().display());
|
||||
println!("hint edit the config above, then `kb ingest`");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Cmd::Ingest {
|
||||
root,
|
||||
summary_only,
|
||||
} => {
|
||||
let cfg = kb_config::Config::load(cli.config.as_deref())?;
|
||||
let scope = kb_core::SourceScope {
|
||||
root: root.clone().unwrap_or_else(|| PathBuf::from(&cfg.workspace.root)),
|
||||
include: cfg.workspace.include.clone(),
|
||||
exclude: cfg.workspace.exclude.clone(),
|
||||
};
|
||||
let report = kb_app::ingest(scope, *summary_only)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_ingest(&report))?);
|
||||
} else {
|
||||
println!(
|
||||
"scanned {} new {} updated {} skipped {} errors {} ({} ms)",
|
||||
report.scanned,
|
||||
report.new,
|
||||
report.updated,
|
||||
report.skipped,
|
||||
report.errors,
|
||||
report.duration_ms
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Cmd::List { what } => match what {
|
||||
ListWhat::Docs => {
|
||||
let docs = kb_app::list_docs(kb_core::DocFilter::default())?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_doc_summaries(&docs))?);
|
||||
} else {
|
||||
for d in &docs {
|
||||
println!("{}\t{}", d.doc_id, d.doc_path.0);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
|
||||
Cmd::Inspect { what } => match what {
|
||||
InspectWhat::Doc { id } => {
|
||||
let doc_id: kb_core::DocumentId = id.parse()?;
|
||||
let doc = kb_app::inspect_doc(&doc_id)?;
|
||||
// Inspect doc emits a `CanonicalDocument` — there's no §2
|
||||
// wire schema for it (P1-5 will decide whether this also
|
||||
// becomes a tagged wrapper or stays as the raw domain
|
||||
// object). Until then keep raw JSON, matching pre-P0-1
|
||||
// behaviour.
|
||||
println!("{}", serde_json::to_string(&doc)?);
|
||||
Ok(())
|
||||
}
|
||||
InspectWhat::Chunk { id } => {
|
||||
let chunk_id: kb_core::ChunkId = id.parse()?;
|
||||
let chunk = kb_app::inspect_chunk(&chunk_id)?;
|
||||
println!("{}", serde_json::to_string(&wire::wire_chunk_inspection(&chunk))?);
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
|
||||
Cmd::Search {
|
||||
query,
|
||||
k,
|
||||
mode,
|
||||
explain: _,
|
||||
} => {
|
||||
let q = kb_core::SearchQuery {
|
||||
text: query.clone(),
|
||||
mode: (*mode).into(),
|
||||
k: *k,
|
||||
filters: kb_core::SearchFilters::default(),
|
||||
};
|
||||
let hits = kb_app::search(q)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_search_hits(&hits))?);
|
||||
} else {
|
||||
for h in &hits {
|
||||
println!("{:>2}. {:.2} {}", h.rank, h.retrieval.fusion_score, h.doc_path.0);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Cmd::Ask {
|
||||
query,
|
||||
k,
|
||||
mode,
|
||||
explain,
|
||||
temperature,
|
||||
seed,
|
||||
} => {
|
||||
let opts = kb_app::AskOpts {
|
||||
k: *k,
|
||||
explain: *explain,
|
||||
mode: (*mode).into(),
|
||||
temperature: *temperature,
|
||||
seed: *seed,
|
||||
};
|
||||
let ans = kb_app::ask(query, opts)?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_answer(&ans))?);
|
||||
} else {
|
||||
println!("{}", ans.answer);
|
||||
}
|
||||
// Refusal → exit 1.
|
||||
if !ans.grounded {
|
||||
return Err(RefusalSignal.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Cmd::Doctor => {
|
||||
let report = kb_app::doctor()?;
|
||||
if cli.json {
|
||||
println!("{}", serde_json::to_string(&wire::wire_doctor(&report))?);
|
||||
} else {
|
||||
for c in &report.checks {
|
||||
let mark = if c.ok { "✓" } else { "✗" };
|
||||
println!("{mark} {:<20} {}", c.name, c.detail);
|
||||
if let (false, Some(hint)) = (c.ok, c.hint.as_ref()) {
|
||||
println!(" hint: {hint}");
|
||||
}
|
||||
}
|
||||
if !report.ok {
|
||||
println!();
|
||||
let failed = report.checks.iter().filter(|c| !c.ok).count();
|
||||
println!("{failed} check(s) failed.");
|
||||
}
|
||||
}
|
||||
if !report.ok {
|
||||
return Err(DoctorUnhealthy.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Cmd::Eval { what } => match what {
|
||||
EvalWhat::Run { suite: _ } => {
|
||||
anyhow::bail!("not yet wired (P9-3)")
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
175
crates/kb-cli/src/wire.rs
Normal file
@@ -0,0 +1,175 @@
|
||||
//! CLI-side wire-schema wrappers.
|
||||
//!
|
||||
//! Convention (per design §2): every JSON object emitted on stdout in
|
||||
//! `--json` mode MUST carry a top-level `schema_version` of the form
|
||||
//! `"<object>.v1"`. The kb-core types are pure domain types and do NOT
|
||||
//! carry `schema_version` themselves; the CLI wraps them on emit. The one
|
||||
//! exception is `DoctorReport`, where `schema_version` is part of the wire
|
||||
//! type because the doctor wire object IS its own structured surface.
|
||||
//!
|
||||
//! Future tasks (P1-5, P3, P4, P5) replacing stub `bail!` paths must call
|
||||
//! these helpers from the relevant CLI subcommand handler before
|
||||
//! `serde_json::to_string`.
|
||||
//!
|
||||
|
claude-reviewer-01
commented
## 👍 strength — wire convention 정착
quality review 후 `kb-cli/src/wire.rs` 도입한 게 정확한 결정. 도메인 타입 (`kb-core`)에 `schema_version` 필드를 박지 않고 CLI emit 시점에 wrap 하는 방향이 design §2 의도와 부합 (도메인 ↔ wire 분리).
특히 `wire_search_hit`이 `retrieval.fusion_score` → 최상위 `score` promote 하는 로직 (§2.2 wire schema와 부합)을 한 곳에 모아둔 것 — P1-5/P3/P4/P5 어느 task가 이 함수를 호출하든 spec drift 없이 동일한 wire shape 보장.
나중에 wire schema 갱신할 때 단일 진입점이라 영향 범위 명확.
|
||||
//! Each helper is total (returns `serde_json::Value`, never an error) — the
|
||||
//! input is a fully-typed `serde::Serialize` value, so the only way to fail
|
||||
//! is OOM, which would have killed the process anyway.
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use kb_app::DoctorReport;
|
||||
use kb_core::{Answer, Chunk, DocSummary, IngestReport, SearchHit};
|
||||
|
||||
/// Insert `schema_version` into an object-shaped `Value`. Helper for the
|
||||
/// "serialize, then tag" pattern used by all the per-type wrappers below.
|
||||
fn tag_object(mut v: Value, schema_version: &str) -> Value {
|
||||
if let Value::Object(ref mut map) = v {
|
||||
map.insert(
|
||||
"schema_version".to_string(),
|
||||
Value::String(schema_version.to_string()),
|
||||
);
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
/// Wrap an [`IngestReport`] as `ingest_report.v1`.
|
||||
pub fn wire_ingest(r: &IngestReport) -> Value {
|
||||
let v = serde_json::to_value(r).expect("IngestReport serializes");
|
||||
tag_object(v, "ingest_report.v1")
|
||||
}
|
||||
|
||||
/// Wrap a single [`DocSummary`] as `doc_summary.v1`.
|
||||
pub fn wire_doc_summary(d: &DocSummary) -> Value {
|
||||
let v = serde_json::to_value(d).expect("DocSummary serializes");
|
||||
tag_object(v, "doc_summary.v1")
|
||||
}
|
||||
|
||||
/// Wrap a list of [`DocSummary`] values as a JSON array of `doc_summary.v1`
|
||||
/// objects (one tag per element, per design §2.5 — there is no list-envelope
|
||||
/// schema; the list shape is `[{schema_version: "doc_summary.v1", ...}, ...]`).
|
||||
pub fn wire_doc_summaries(d: &[DocSummary]) -> Value {
|
||||
Value::Array(d.iter().map(wire_doc_summary).collect())
|
||||
}
|
||||
|
||||
/// Wrap a [`Chunk`] as `chunk_inspection.v1` (§2.6). NOTE: the wire schema
|
||||
/// requires `doc_path`, which the kb-core `Chunk` does not currently carry —
|
||||
/// when P1-5 wires the Ok-path, the implementation should either enrich
|
||||
/// `Chunk` or pass `doc_path` alongside. For now this helper emits whatever
|
||||
/// fields `Chunk` serializes with, plus the `schema_version` tag.
|
||||
pub fn wire_chunk_inspection(c: &Chunk) -> Value {
|
||||
let v = serde_json::to_value(c).expect("Chunk serializes");
|
||||
tag_object(v, "chunk_inspection.v1")
|
||||
}
|
||||
|
||||
/// Wrap a single [`SearchHit`] as `search_hit.v1`.
|
||||
pub fn wire_search_hit(h: &SearchHit) -> Value {
|
||||
let mut v = serde_json::to_value(h).expect("SearchHit serializes");
|
||||
// Promote `retrieval.fusion_score` to a top-level `score` per §2.2.
|
||||
if let Value::Object(ref mut map) = v {
|
||||
if let Some(Value::Object(retrieval)) = map.get("retrieval") {
|
||||
if let Some(score) = retrieval.get("fusion_score").cloned() {
|
||||
map.insert("score".to_string(), score);
|
||||
}
|
||||
}
|
||||
}
|
||||
tag_object(v, "search_hit.v1")
|
||||
}
|
||||
|
||||
/// Wrap a list of [`SearchHit`] values as a JSON array of `search_hit.v1`
|
||||
/// objects (one tag per element, per design §2.2).
|
||||
pub fn wire_search_hits(hits: &[SearchHit]) -> Value {
|
||||
Value::Array(hits.iter().map(wire_search_hit).collect())
|
||||
}
|
||||
|
||||
/// Wrap an [`Answer`] as `answer.v1`.
|
||||
pub fn wire_answer(a: &Answer) -> Value {
|
||||
let v = serde_json::to_value(a).expect("Answer serializes");
|
||||
tag_object(v, "answer.v1")
|
||||
}
|
||||
|
||||
/// Idempotent pass-through for [`DoctorReport`] — the type already carries
|
||||
/// `schema_version: "doctor.v1"` (struct-field convention, the one
|
||||
/// exception called out in the module doc above). This helper exists so
|
||||
/// every `--json` branch in `kb-cli` goes through `wire::*`, keeping the
|
||||
/// emit pattern uniform.
|
||||
pub fn wire_doctor(d: &DoctorReport) -> Value {
|
||||
// Round-trip through `to_value` to confirm the field is serialized;
|
||||
// then re-tag (no-op when the field is already present, defensive
|
||||
// when a future refactor drops the struct-field).
|
||||
let v = serde_json::to_value(d).expect("DoctorReport serializes");
|
||||
if let Value::Object(ref map) = v {
|
||||
if matches!(
|
||||
map.get("schema_version"),
|
||||
Some(Value::String(s)) if s == "doctor.v1"
|
||||
) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
tag_object(v, "doctor.v1")
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Read the top-level `schema_version` string of a value, if any.
    fn schema_of(v: &Value) -> Option<&str> {
        v.get("schema_version").and_then(Value::as_str)
    }

    #[test]
    fn doctor_round_trip_preserves_schema_version() {
        let report = DoctorReport {
            schema_version: "doctor.v1".to_string(),
            ok: true,
            checks: Vec::new(),
        };
        let wired = wire_doctor(&report);
        assert_eq!(schema_of(&wired), Some("doctor.v1"));
        // Sanity: ok/checks are preserved.
        assert_eq!(wired.get("ok").and_then(Value::as_bool), Some(true));
        assert!(wired.get("checks").and_then(Value::as_array).is_some());
    }

    #[test]
    fn ingest_wrapper_tags_schema_version() {
        use kb_core::SourceScope;

        let scope = SourceScope {
            root: std::path::PathBuf::from("/tmp"),
            include: vec![],
            exclude: vec![],
        };
        let report = IngestReport {
            scope,
            scanned: 0,
            new: 0,
            updated: 0,
            skipped: 0,
            errors: 0,
            duration_ms: 0,
            items: None,
        };
        let wired = wire_ingest(&report);
        assert_eq!(schema_of(&wired), Some("ingest_report.v1"));
        assert!(wired.get("items").is_some());
    }

    // The two list tests below only cover the empty-slice case.
    #[test]
    fn doc_summaries_wraps_each_element() {
        let wired = wire_doc_summaries(&[]);
        assert!(wired.is_array());
        assert_eq!(wired.as_array().unwrap().len(), 0);
    }

    #[test]
    fn search_hits_wraps_each_element() {
        let wired = wire_search_hits(&[]);
        assert!(wired.is_array());
        assert_eq!(wired.as_array().unwrap().len(), 0);
    }

    #[test]
    fn tag_object_inserts_into_object() {
        let tagged = tag_object(Value::Object(serde_json::Map::new()), "x.v1");
        assert_eq!(schema_of(&tagged), Some("x.v1"));
    }
}
|
||||
17
crates/kb-config/Cargo.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "kb-config"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Config schema + XDG path resolution"
|
||||
|
||||
[dependencies]
|
||||
# kb-core::CoreError reserved for P1-* config errors
|
||||
|
claude-reviewer-01
commented
## 🟡 minor — 사용 안 하는 dep 선언
`kb-core` (line 11) 와 `thiserror` (line 13) 가 declare 됐지만 `kb-config/src/lib.rs`에서 import되지 않음. 동일 패턴이 `kb-parse-types/Cargo.toml` (`thiserror`) 와 `kb-app/Cargo.toml` (`thiserror`) 에도 있음.
빌드 시간 증가 + 미래 grep으로 "왜 의존성 있지?" 혼란 유발. 두 옵션:
1. 제거 (가장 깔끔).
2. `# reserved for P1-*: <reason>` 주석 — 의도가 분명할 때만.
`kb-config`의 경우 `kb-core::CoreError`를 결국 사용할 가능성 높으므로 옵션 2 + 주석 권장.
|
||||
kb-core = { path = "../kb-core" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
toml = "0.8"
|
||||
dirs = "5"
|
||||
489
crates/kb-config/src/lib.rs
Normal file
@@ -0,0 +1,489 @@
|
||||
//! `kb-config` — `Config` schema and XDG path resolution (§6).
|
||||
//!
|
||||
//! Layer order (`Config::load`): defaults → file → env (`KB_<SECTION>_<KEY>`).
|
||||
//! CLI overrides land later, applied by `kb-cli` after `Config::load`.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Config {
|
||||
pub schema_version: u32,
|
||||
pub workspace: WorkspaceCfg,
|
||||
pub storage: StorageCfg,
|
||||
pub indexing: IndexingCfg,
|
||||
pub chunking: ChunkingCfg,
|
||||
pub models: ModelsCfg,
|
||||
pub search: SearchCfg,
|
||||
pub rag: RagCfg,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspaceCfg {
|
||||
pub root: String,
|
||||
pub include: Vec<String>,
|
||||
pub exclude: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StorageCfg {
|
||||
pub data_dir: String,
|
||||
pub sqlite: String,
|
||||
pub vector_dir: String,
|
||||
pub asset_dir: String,
|
||||
pub artifact_dir: String,
|
||||
pub model_dir: String,
|
||||
pub runs_dir: String,
|
||||
pub copy_threshold_mb: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IndexingCfg {
|
||||
pub max_parallel_extractors: u32,
|
||||
pub max_parallel_embeddings: u32,
|
||||
pub watch_filesystem: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ChunkingCfg {
|
||||
pub target_tokens: usize,
|
||||
pub overlap_tokens: usize,
|
||||
pub respect_markdown_headings: bool,
|
||||
pub chunker_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelsCfg {
|
||||
pub embedding: EmbeddingModelCfg,
|
||||
pub llm: LlmCfg,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EmbeddingModelCfg {
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub version: String,
|
||||
pub dimensions: usize,
|
||||
pub batch_size: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct LlmCfg {
|
||||
pub provider: String,
|
||||
pub model: String,
|
||||
pub context_tokens: usize,
|
||||
pub endpoint: String,
|
||||
pub temperature: f32,
|
||||
pub seed: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SearchCfg {
|
||||
pub default_k: usize,
|
||||
pub hybrid_fusion: String,
|
||||
pub rrf_k: u32,
|
||||
pub snippet_chars: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RagCfg {
|
||||
pub prompt_template_version: String,
|
||||
pub score_gate: f32,
|
||||
pub explain_default: bool,
|
||||
pub max_context_tokens: usize,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Defaults per design §6.4.
|
||||
pub fn defaults() -> Self {
|
||||
Self {
|
||||
schema_version: 1,
|
||||
workspace: WorkspaceCfg {
|
||||
root: "~/KnowledgeBase".to_string(),
|
||||
include: vec!["**/*.md".to_string()],
|
||||
exclude: vec![
|
||||
".git/**".to_string(),
|
||||
"node_modules/**".to_string(),
|
||||
".obsidian/**".to_string(),
|
||||
],
|
||||
},
|
||||
storage: StorageCfg {
|
||||
data_dir: "${XDG_DATA_HOME:-~/.local/share}/kb".to_string(),
|
||||
sqlite: "{data_dir}/kb.sqlite".to_string(),
|
||||
vector_dir: "{data_dir}/lancedb".to_string(),
|
||||
asset_dir: "{data_dir}/assets".to_string(),
|
||||
artifact_dir: "{data_dir}/artifacts".to_string(),
|
||||
model_dir: "{data_dir}/models".to_string(),
|
||||
runs_dir: "{data_dir}/runs".to_string(),
|
||||
copy_threshold_mb: 100,
|
||||
},
|
||||
indexing: IndexingCfg {
|
||||
max_parallel_extractors: 2,
|
||||
max_parallel_embeddings: 1,
|
||||
watch_filesystem: false,
|
||||
},
|
||||
chunking: ChunkingCfg {
|
||||
target_tokens: 500,
|
||||
overlap_tokens: 80,
|
||||
respect_markdown_headings: true,
|
||||
chunker_version: "md-heading-v1".to_string(),
|
||||
},
|
||||
models: ModelsCfg {
|
||||
embedding: EmbeddingModelCfg {
|
||||
provider: "fastembed".to_string(),
|
||||
model: "multilingual-e5-small".to_string(),
|
||||
version: "v1".to_string(),
|
||||
dimensions: 384,
|
||||
batch_size: 64,
|
||||
},
|
||||
llm: LlmCfg {
|
||||
provider: "ollama".to_string(),
|
||||
model: "qwen2.5:14b-instruct".to_string(),
|
||||
context_tokens: 32768,
|
||||
endpoint: "http://127.0.0.1:11434".to_string(),
|
||||
temperature: 0.0,
|
||||
seed: 0,
|
||||
},
|
||||
},
|
||||
search: SearchCfg {
|
||||
default_k: 10,
|
||||
hybrid_fusion: "rrf".to_string(),
|
||||
rrf_k: 60,
|
||||
snippet_chars: 220,
|
||||
},
|
||||
rag: RagCfg {
|
||||
prompt_template_version: "rag-v1".to_string(),
|
||||
score_gate: 0.30,
|
||||
explain_default: false,
|
||||
max_context_tokens: 8000,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Read config from disk and merge env overrides on top of it. If the
|
||||
/// file is missing, defaults are used (so `kb doctor` runs with no
|
||||
/// prior `kb init`).
|
||||
pub fn load(path: Option<&Path>) -> anyhow::Result<Self> {
|
||||
let from_disk = match path {
|
||||
Some(p) if p.exists() => Self::from_file(p)?,
|
||||
Some(_) => Self::defaults(),
|
||||
None => {
|
||||
let p = Self::xdg_config_path();
|
||||
if p.exists() {
|
||||
Self::from_file(&p)?
|
||||
} else {
|
||||
Self::defaults()
|
||||
}
|
||||
}
|
||||
};
|
||||
let env: HashMap<String, String> = std::env::vars().collect();
|
||||
Ok(from_disk.apply_env(&env))
|
||||
}
|
||||
|
||||
pub fn from_file(path: &Path) -> anyhow::Result<Self> {
|
||||
let text = std::fs::read_to_string(path)?;
|
||||
let cfg: Self = toml::from_str(&text)?;
|
||||
Ok(cfg)
|
||||
}
|
||||
|
||||
/// Apply `KB_<SECTION>_<KEY>` env overrides. Unknown keys are ignored.
|
||||
///
|
||||
/// The mapping is an explicit grep-friendly whitelist — one match arm
|
||||
/// per leaf key in `Config`. Booleans accept `1` / `true` / `yes`
|
||||
/// (case-insensitive) for true and anything else for false. Numeric
|
||||
/// keys silently keep their prior value if the env value fails to
|
||||
/// parse, so a malformed `KB_*` cannot crash startup.
|
||||
pub fn apply_env(mut self, env: &HashMap<String, String>) -> Self {
|
||||
for (k, v) in env {
|
||||
|
claude-reviewer-01
commented
🟡 nice-to-fix —
|
||||
if !k.starts_with("KB_") {
|
||||
continue;
|
||||
}
|
||||
match k.as_str() {
|
||||
// workspace
|
||||
"KB_WORKSPACE_ROOT" => self.workspace.root = v.clone(),
|
||||
|
||||
// storage
|
||||
"KB_STORAGE_DATA_DIR" => self.storage.data_dir = v.clone(),
|
||||
"KB_STORAGE_SQLITE" => self.storage.sqlite = v.clone(),
|
||||
"KB_STORAGE_VECTOR_DIR" => self.storage.vector_dir = v.clone(),
|
||||
"KB_STORAGE_ASSET_DIR" => self.storage.asset_dir = v.clone(),
|
||||
"KB_STORAGE_ARTIFACT_DIR" => self.storage.artifact_dir = v.clone(),
|
||||
"KB_STORAGE_MODEL_DIR" => self.storage.model_dir = v.clone(),
|
||||
"KB_STORAGE_RUNS_DIR" => self.storage.runs_dir = v.clone(),
|
||||
"KB_STORAGE_COPY_THRESHOLD_MB" => {
|
||||
if let Ok(n) = v.parse::<u64>() {
|
||||
self.storage.copy_threshold_mb = n;
|
||||
}
|
||||
}
|
||||
|
||||
// indexing
|
||||
"KB_INDEXING_MAX_PARALLEL_EXTRACTORS" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.indexing.max_parallel_extractors = n;
|
||||
}
|
||||
}
|
||||
"KB_INDEXING_MAX_PARALLEL_EMBEDDINGS" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.indexing.max_parallel_embeddings = n;
|
||||
}
|
||||
}
|
||||
"KB_INDEXING_WATCH_FILESYSTEM" => {
|
||||
self.indexing.watch_filesystem = parse_bool(v);
|
||||
}
|
||||
|
||||
// chunking
|
||||
"KB_CHUNKING_TARGET_TOKENS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.chunking.target_tokens = n;
|
||||
}
|
||||
}
|
||||
"KB_CHUNKING_OVERLAP_TOKENS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.chunking.overlap_tokens = n;
|
||||
}
|
||||
}
|
||||
"KB_CHUNKING_RESPECT_MARKDOWN_HEADINGS" => {
|
||||
self.chunking.respect_markdown_headings = parse_bool(v);
|
||||
}
|
||||
"KB_CHUNKING_CHUNKER_VERSION" => self.chunking.chunker_version = v.clone(),
|
||||
|
||||
// models.embedding
|
||||
"KB_MODELS_EMBEDDING_PROVIDER" => self.models.embedding.provider = v.clone(),
|
||||
"KB_MODELS_EMBEDDING_MODEL" => self.models.embedding.model = v.clone(),
|
||||
"KB_MODELS_EMBEDDING_VERSION" => self.models.embedding.version = v.clone(),
|
||||
"KB_MODELS_EMBEDDING_DIMENSIONS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.models.embedding.dimensions = n;
|
||||
}
|
||||
}
|
||||
"KB_MODELS_EMBEDDING_BATCH_SIZE" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.models.embedding.batch_size = n;
|
||||
}
|
||||
}
|
||||
|
||||
// models.llm
|
||||
"KB_MODELS_LLM_PROVIDER" => self.models.llm.provider = v.clone(),
|
||||
"KB_MODELS_LLM_MODEL" => self.models.llm.model = v.clone(),
|
||||
"KB_MODELS_LLM_CONTEXT_TOKENS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.models.llm.context_tokens = n;
|
||||
}
|
||||
}
|
||||
"KB_MODELS_LLM_ENDPOINT" => self.models.llm.endpoint = v.clone(),
|
||||
"KB_MODELS_LLM_TEMPERATURE" => {
|
||||
if let Ok(f) = v.parse::<f32>() {
|
||||
self.models.llm.temperature = f;
|
||||
}
|
||||
}
|
||||
"KB_MODELS_LLM_SEED" => {
|
||||
if let Ok(n) = v.parse::<u64>() {
|
||||
self.models.llm.seed = n;
|
||||
}
|
||||
}
|
||||
|
||||
// search
|
||||
"KB_SEARCH_DEFAULT_K" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.search.default_k = n;
|
||||
}
|
||||
}
|
||||
"KB_SEARCH_HYBRID_FUSION" => self.search.hybrid_fusion = v.clone(),
|
||||
"KB_SEARCH_RRF_K" => {
|
||||
if let Ok(n) = v.parse::<u32>() {
|
||||
self.search.rrf_k = n;
|
||||
}
|
||||
}
|
||||
"KB_SEARCH_SNIPPET_CHARS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.search.snippet_chars = n;
|
||||
}
|
||||
}
|
||||
|
||||
// rag
|
||||
"KB_RAG_PROMPT_TEMPLATE_VERSION" => {
|
||||
self.rag.prompt_template_version = v.clone();
|
||||
}
|
||||
"KB_RAG_SCORE_GATE" => {
|
||||
if let Ok(f) = v.parse::<f32>() {
|
||||
self.rag.score_gate = f;
|
||||
}
|
||||
}
|
||||
"KB_RAG_EXPLAIN_DEFAULT" => {
|
||||
self.rag.explain_default = parse_bool(v);
|
||||
}
|
||||
"KB_RAG_MAX_CONTEXT_TOKENS" => {
|
||||
if let Ok(n) = v.parse::<usize>() {
|
||||
self.rag.max_context_tokens = n;
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown KB_* keys are silently ignored — see
|
||||
// `env_unknown_key_is_ignored` test.
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// `~/.config/kb/config.toml` (honors `XDG_CONFIG_HOME`).
|
||||
pub fn xdg_config_path() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_CONFIG_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb").join("config.toml");
|
||||
}
|
||||
}
|
||||
match dirs::config_dir() {
|
||||
Some(d) => d.join("kb").join("config.toml"),
|
||||
None => PathBuf::from("./kb/config.toml"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.local/share/kb` (honors `XDG_DATA_HOME`).
|
||||
pub fn xdg_data_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_DATA_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
match dirs::data_dir() {
|
||||
Some(d) => d.join("kb"),
|
||||
None => PathBuf::from("./kb-data"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.cache/kb` (honors `XDG_CACHE_HOME`).
|
||||
pub fn xdg_cache_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_CACHE_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
match dirs::cache_dir() {
|
||||
Some(d) => d.join("kb"),
|
||||
None => PathBuf::from("./kb-cache"),
|
||||
}
|
||||
}
|
||||
|
||||
/// `~/.local/state/kb` (honors `XDG_STATE_HOME`).
|
||||
pub fn xdg_state_dir() -> PathBuf {
|
||||
if let Ok(custom) = std::env::var("XDG_STATE_HOME") {
|
||||
if !custom.is_empty() {
|
||||
return PathBuf::from(custom).join("kb");
|
||||
}
|
||||
}
|
||||
// `dirs` doesn't expose state_dir on all platforms; fall back to
|
||||
// `$HOME/.local/state/kb` if XDG_STATE_HOME is unset.
|
||||
if let Some(home) = dirs::home_dir() {
|
||||
return home.join(".local").join("state").join("kb");
|
||||
}
|
||||
PathBuf::from("./kb-state")
|
||||
}
|
||||
}
|
||||
|
||||
/// Permissive boolean parser used by `apply_env` for boolean config keys.
///
/// Returns `true` exactly when `s` equals `1`, `true` or `yes`, compared
/// ASCII-case-insensitively. Every other input, including the empty string
/// and values with surrounding whitespace, yields `false`.
fn parse_bool(s: &str) -> bool {
    const TRUTHY: [&str; 3] = ["1", "true", "yes"];
    TRUTHY.iter().any(|accepted| s.eq_ignore_ascii_case(accepted))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializing the defaults to TOML and reading them back is lossless.
    #[test]
    fn defaults_are_serde_roundtrip_stable() {
        let c = Config::defaults();
        let toml_text = toml::to_string(&c).unwrap();
        let back: Config = toml::from_str(&toml_text).unwrap();
        assert_eq!(c, back);
    }

    #[test]
    fn defaults_match_design_64_score_gate() {
        let c = Config::defaults();
        assert_eq!(c.rag.score_gate, 0.30);
        assert_eq!(c.chunking.target_tokens, 500);
        assert_eq!(c.models.embedding.dimensions, 384);
        assert_eq!(c.search.rrf_k, 60);
    }

    #[test]
    fn env_override_score_gate() {
        let mut env = HashMap::new();
        env.insert("KB_RAG_SCORE_GATE".to_string(), "0.5".to_string());
        let c = Config::defaults().apply_env(&env);
        assert!((c.rag.score_gate - 0.5).abs() < 1e-6);
    }

    #[test]
    fn env_override_search_k() {
        let mut env = HashMap::new();
        env.insert("KB_SEARCH_DEFAULT_K".to_string(), "25".to_string());
        let c = Config::defaults().apply_env(&env);
        assert_eq!(c.search.default_k, 25);
    }

    #[test]
    fn env_unknown_key_is_ignored() {
        let baseline = Config::defaults();
        let mut env = HashMap::new();
        env.insert("KB_NOPE_FOO".to_string(), "garbage".to_string());
        let c = Config::defaults().apply_env(&env);
        assert_eq!(c, baseline);
    }

    /// A numeric override that fails to parse must leave the prior value.
    #[test]
    fn env_malformed_number_keeps_prior_value() {
        let baseline = Config::defaults();
        let mut env = HashMap::new();
        env.insert("KB_SEARCH_DEFAULT_K".to_string(), "not-a-number".to_string());
        let c = Config::defaults().apply_env(&env);
        assert_eq!(c, baseline);
    }

    #[test]
    fn env_overrides_chunking_target_tokens() {
        let mut env = HashMap::new();
        env.insert("KB_CHUNKING_TARGET_TOKENS".to_string(), "777".to_string());
        let c = Config::defaults().apply_env(&env);
        assert_eq!(c.chunking.target_tokens, 777);
    }

    #[test]
    fn env_overrides_models_llm_endpoint_and_temperature() {
        let mut env = HashMap::new();
        env.insert(
            "KB_MODELS_LLM_ENDPOINT".to_string(),
            "http://10.0.0.1:11434".to_string(),
        );
        env.insert("KB_MODELS_LLM_TEMPERATURE".to_string(), "0.7".to_string());
        let c = Config::defaults().apply_env(&env);
        assert_eq!(c.models.llm.endpoint, "http://10.0.0.1:11434");
        assert!((c.models.llm.temperature - 0.7).abs() < 1e-6);
    }

    #[test]
    fn env_overrides_indexing_watch_filesystem_bool() {
        let mut env = HashMap::new();
        env.insert(
            "KB_INDEXING_WATCH_FILESYSTEM".to_string(),
            "true".to_string(),
        );
        let c = Config::defaults().apply_env(&env);
        assert!(c.indexing.watch_filesystem);
    }

    #[test]
    fn parse_bool_accepts_documented_spellings() {
        for truthy in ["1", "true", "TRUE", "Yes"] {
            assert!(parse_bool(truthy), "{truthy:?} should be true");
        }
        for falsy in ["", "0", "no", "on", " true"] {
            assert!(!parse_bool(falsy), "{falsy:?} should be false");
        }
    }

    #[test]
    fn xdg_paths_honor_env() {
        // Remember the previous value as an OS string so that even a
        // non-Unicode value is restored faithfully.
        let prev = std::env::var_os("XDG_CONFIG_HOME");

        // SAFETY: the default test harness runs tests on several threads,
        // so this is only sound while no other test in this binary reads or
        // writes the process environment concurrently.
        // TODO(review): confirm, or serialise env-touching tests behind a
        // shared lock.
        unsafe {
            std::env::set_var("XDG_CONFIG_HOME", "/tmp/kbtest-xdg-config");
        }
        let p = Config::xdg_config_path();

        // Restore *before* asserting so that a failing assertion cannot
        // leak the override into other tests.
        // SAFETY: same condition as above.
        unsafe {
            match prev {
                Some(v) => std::env::set_var("XDG_CONFIG_HOME", v),
                None => std::env::remove_var("XDG_CONFIG_HOME"),
            }
        }

        assert_eq!(p, PathBuf::from("/tmp/kbtest-xdg-config/kb/config.toml"));
    }
}
|
||||
18
crates/kb-core/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "kb-core"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "kb domain types, traits, and ID recipe (no other kb-* deps)"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde_json_canonicalizer = "0.3"
|
||||
unicode-normalization = "0.1"
|
||||
66
crates/kb-core/src/answer.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
//! Answer + RAG types (§3.8).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::citation::Citation;
|
||||
use crate::search::SearchMode;
|
||||
use crate::versions::PromptTemplateVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Answer {
|
||||
pub answer: String,
|
||||
pub citations: Vec<AnswerCitation>,
|
||||
pub grounded: bool,
|
||||
pub refusal_reason: Option<RefusalReason>,
|
||||
pub model: ModelRef,
|
||||
pub embedding: Option<ModelRef>,
|
||||
pub prompt_template_version: PromptTemplateVersion,
|
||||
pub retrieval: AnswerRetrievalSummary,
|
||||
pub usage: TokenUsage,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AnswerCitation {
|
||||
pub marker: Option<String>,
|
||||
pub citation: Citation,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum RefusalReason {
|
||||
ScoreGate,
|
||||
LlmSelfJudge,
|
||||
NoIndex,
|
||||
NoChunks,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelRef {
|
||||
pub id: String,
|
||||
pub provider: String,
|
||||
pub dimensions: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AnswerRetrievalSummary {
|
||||
pub trace_id: TraceId,
|
||||
pub mode: SearchMode,
|
||||
pub k: usize,
|
||||
pub score_gate: f32,
|
||||
pub top_score: f32,
|
||||
pub chunks_returned: u32,
|
||||
pub chunks_used: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TokenUsage {
|
||||
pub prompt_tokens: u32,
|
||||
pub completion_tokens: u32,
|
||||
pub latency_ms: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TraceId(pub String);
|
||||
61
crates/kb-core/src/asset.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Raw asset, source URI, workspace path (§3.3).
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::errors::CoreError;
|
||||
use crate::ids::AssetId;
|
||||
use crate::media::{Checksum, MediaType};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind", content = "value")]
|
||||
pub enum SourceUri {
|
||||
File(PathBuf),
|
||||
/// `kb://` virtual reference.
|
||||
Kb(String),
|
||||
}
|
||||
|
||||
/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
|
||||
/// produced via `crate::normalize::to_posix` (filesystem side) or
|
||||
/// `WorkspacePath::new` (parse side). The inner string is forbidden from
|
||||
/// containing the `#` character: a workspace path must never collide with
|
||||
/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the
|
||||
/// invariant is enforced at construction rather than at every call site.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspacePath(pub String);
|
||||
|
||||
impl WorkspacePath {
|
||||
/// Construct a `WorkspacePath` from a string, rejecting any input that
|
||||
/// contains `#`. Use this on the parser side (e.g. `Citation::parse`)
|
||||
/// where the input does not flow through `to_posix`.
|
||||
pub fn new(s: String) -> Result<Self, CoreError> {
|
||||
if s.contains('#') {
|
||||
return Err(CoreError::Malformed(format!(
|
||||
"workspace path must not contain '#': {s:?}"
|
||||
)));
|
||||
}
|
||||
Ok(Self(s))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum AssetStorage {
|
||||
Copied { path: PathBuf },
|
||||
Reference { path: PathBuf, sha: Checksum },
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RawAsset {
|
||||
pub asset_id: AssetId,
|
||||
pub source_uri: SourceUri,
|
||||
pub workspace_path: WorkspacePath,
|
||||
pub media_type: MediaType,
|
||||
pub byte_len: u64,
|
||||
pub checksum: Checksum,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub discovered_at: OffsetDateTime,
|
||||
pub stored: AssetStorage,
|
||||
}
|
||||
19
crates/kb-core/src/chunk.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! Chunk (§3.5).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::document::SourceSpan;
|
||||
use crate::ids::{BlockId, ChunkId, DocumentId};
|
||||
use crate::versions::ChunkerVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Chunk {
|
||||
pub chunk_id: ChunkId,
|
||||
pub doc_id: DocumentId,
|
||||
pub block_ids: Vec<BlockId>,
|
||||
pub text: String,
|
||||
pub heading_path: Vec<String>,
|
||||
pub source_spans: Vec<SourceSpan>,
|
||||
pub token_estimate: usize,
|
||||
pub chunker_version: ChunkerVersion,
|
||||
}
|
||||
357
crates/kb-core/src/citation.rs
Normal file
@@ -0,0 +1,357 @@
|
||||
//! Citation (§3.5) — discriminated 5-variant. Each variant has a canonical
|
||||
//! W3C Media Fragments URI per design §0 Q3.
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Citation {
|
||||
Line {
|
||||
path: WorkspacePath,
|
||||
start: u32,
|
||||
end: u32,
|
||||
section: Option<String>,
|
||||
},
|
||||
Page {
|
||||
path: WorkspacePath,
|
||||
page: u32,
|
||||
section: Option<String>,
|
||||
},
|
||||
Region {
|
||||
path: WorkspacePath,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
},
|
||||
Caption {
|
||||
path: WorkspacePath,
|
||||
model: String,
|
||||
},
|
||||
Time {
|
||||
path: WorkspacePath,
|
||||
start_ms: u64,
|
||||
end_ms: u64,
|
||||
speaker: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
impl Citation {
|
||||
pub fn path(&self) -> &WorkspacePath {
|
||||
match self {
|
||||
Citation::Line { path, .. }
|
||||
| Citation::Page { path, .. }
|
||||
| Citation::Region { path, .. }
|
||||
| Citation::Caption { path, .. }
|
||||
| Citation::Time { path, .. } => path,
|
||||
}
|
||||
}
|
||||
|
||||
/// Emit a W3C Media Fragments URI per design §0 Q3.
|
||||
/// `section` and `speaker` and `caption.model` are NOT part of the URI
|
||||
/// fragment; they live in the structured wire object.
|
||||
pub fn to_uri(&self) -> String {
|
||||
match self {
|
||||
Citation::Line { path, start, end, .. } => {
|
||||
if start == end {
|
||||
format!("{}#L{}", path.0, start)
|
||||
} else {
|
||||
format!("{}#L{}-L{}", path.0, start, end)
|
||||
}
|
||||
}
|
||||
Citation::Page { path, page, .. } => format!("{}#p={}", path.0, page),
|
||||
Citation::Region {
|
||||
path, x, y, w, h, ..
|
||||
} => format!("{}#xywh={},{},{},{}", path.0, x, y, w, h),
|
||||
Citation::Caption { path, .. } => format!("{}#caption", path.0),
|
||||
Citation::Time {
|
||||
path,
|
||||
start_ms,
|
||||
end_ms,
|
||||
speaker,
|
||||
} => {
|
||||
let s = format_hms_ms(*start_ms);
|
||||
let e = format_hms_ms(*end_ms);
|
||||
match speaker {
|
||||
Some(sp) => format!("{}#t={},{}&speaker={}", path.0, s, e, sp),
|
||||
None => format!("{}#t={},{}", path.0, s, e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Strict inverse of `to_uri`. The `section` / `caption.model` fields
|
||||
/// are not part of the URI grammar, so a parsed Citation will have
|
||||
/// `section = None` and `model = ""` for the relevant variants.
|
||||
/// Round-trip property holds for citations whose non-URI fields are at
|
||||
/// their default values (see test).
|
||||
pub fn parse(s: &str) -> Result<Self> {
|
||||
|
claude-reviewer-01
commented
🟡 minor — parse error 메시지에 입력 인용 부족
같은 패턴으로 다 바꾸면 디버깅 비용이 한 번에 줄어듦. 회귀 위험 0. ## 🟡 minor — parse error 메시지에 입력 인용 부족
`citation.rs`의 `bail!("bad line start")` / `bail!("unknown fragment")` 등 (8군데) 모두 입력 문자열을 인용하지 않음. 사용자/디버거 입장에서 "bad line start"만 받으면 `s` 값을 추측해야 함.
```rust
bail!("bad line start in {a:?}")
```
같은 패턴으로 다 바꾸면 디버깅 비용이 한 번에 줄어듦. 회귀 위험 0.
|
||||
let (path_str, frag) = match s.rsplit_once('#') {
|
||||
|
claude-reviewer-01
commented
🟡 nice-to-fix — 경로에
|
||||
Some(t) => t,
|
||||
None => bail!("citation has no '#' fragment: {s:?}"),
|
||||
};
|
||||
// `WorkspacePath::new` rejects any remaining `#` on the path side
|
||||
// (e.g. the input had multiple `#` separators), closing the
|
||||
// hash-in-path concern at construction rather than at every reader.
|
||||
let path = WorkspacePath::new(path_str.to_owned())?;
|
||||
|
||||
if let Some(rest) = frag.strip_prefix("L") {
|
||||
// line range: `L<a>` or `L<a>-L<b>`
|
||||
if let Some((a, b)) = rest.split_once("-L") {
|
||||
let start: u32 = a
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?;
|
||||
let end: u32 = b
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start,
|
||||
end,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
let n: u32 = rest
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start: n,
|
||||
end: n,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("p=") {
|
||||
let page: u32 = rest
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Page {
|
||||
path,
|
||||
page,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("xywh=") {
|
||||
let parts: Vec<&str> = rest.split(',').collect();
|
||||
if parts.len() != 4 {
|
||||
bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})");
|
||||
}
|
||||
let x: u32 = parts[0]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?;
|
||||
let y: u32 = parts[1]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?;
|
||||
let w: u32 = parts[2]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?;
|
||||
let h: u32 = parts[3]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?;
|
||||
return Ok(Citation::Region { path, x, y, w, h });
|
||||
}
|
||||
if frag == "caption" {
|
||||
return Ok(Citation::Caption {
|
||||
path,
|
||||
model: String::new(),
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("t=") {
|
||||
// `t=<start>,<end>` optionally followed by `&speaker=<id>`
|
||||
let (range, speaker) = match rest.split_once('&') {
|
||||
Some((r, kv)) => match kv.strip_prefix("speaker=") {
|
||||
Some(sp) => (r, Some(sp.to_owned())),
|
||||
None => bail!("unknown time-fragment param {kv:?} (input {s:?})"),
|
||||
},
|
||||
None => (rest, None),
|
||||
};
|
||||
let (s_str, e_str) = match range.split_once(',') {
|
||||
Some(t) => t,
|
||||
None => bail!("time fragment expects '<start>,<end>', got {range:?} (input {s:?})"),
|
||||
};
|
||||
let start_ms = parse_hms_ms(s_str)?;
|
||||
let end_ms = parse_hms_ms(e_str)?;
|
||||
return Ok(Citation::Time {
|
||||
path,
|
||||
start_ms,
|
||||
end_ms,
|
||||
speaker,
|
||||
});
|
||||
}
|
||||
bail!("unrecognised citation fragment {frag:?} (input {s:?})")
|
||||
}
|
||||
}
|
||||
|
||||
/// Render a millisecond count as `hh:mm:ss.mmm` (W3C Media Fragments
/// NPT-with-ms).
///
/// Hours are zero-padded to at least two digits and grow beyond that when
/// needed; minutes and seconds use two digits, milliseconds three.
fn format_hms_ms(ms: u64) -> String {
    const MS_PER_SECOND: u64 = 1_000;
    const MS_PER_MINUTE: u64 = 60 * MS_PER_SECOND;
    const MS_PER_HOUR: u64 = 60 * MS_PER_MINUTE;

    let (hours, below_hour) = (ms / MS_PER_HOUR, ms % MS_PER_HOUR);
    let (minutes, below_minute) = (below_hour / MS_PER_MINUTE, below_hour % MS_PER_MINUTE);
    let (seconds, millis) = (below_minute / MS_PER_SECOND, below_minute % MS_PER_SECOND);

    format!("{hours:02}:{minutes:02}:{seconds:02}.{millis:03}")
}
|
||||
|
||||
fn parse_hms_ms(s: &str) -> Result<u64> {
|
||||
// Accept `hh:mm:ss.mmm` (the form we emit). Reject malformed input.
|
||||
let parts: Vec<&str> = s.split(':').collect();
|
||||
if parts.len() != 3 {
|
||||
bail!("time component expects hh:mm:ss.mmm, got {s:?}");
|
||||
}
|
||||
let h: u64 = parts[0]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?;
|
||||
let m: u64 = parts[1]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?;
|
||||
let (sec, ms) = match parts[2].split_once('.') {
|
||||
Some((s_part, ms_part)) => {
|
||||
let sec: u64 = s_part
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?;
|
||||
// Pad/truncate to exactly 3 digits.
|
||||
let mut ms_str = ms_part.to_owned();
|
||||
while ms_str.len() < 3 {
|
||||
ms_str.push('0');
|
||||
}
|
||||
ms_str.truncate(3);
|
||||
let ms: u64 = ms_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?;
|
||||
(sec, ms)
|
||||
}
|
||||
None => {
|
||||
let sec: u64 = parts[2]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?;
|
||||
(sec, 0)
|
||||
}
|
||||
};
|
||||
Ok(h * 3_600_000 + m * 60_000 + sec * 1000 + ms)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `WorkspacePath` for a test; the literal must be `#`-free.
    fn p(s: &str) -> WorkspacePath {
        WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'")
    }

    /// Check that `citation` renders as `expected_uri` and that parsing
    /// that URI yields the same citation again.
    fn assert_uri_roundtrip(citation: &Citation, expected_uri: &str) {
        let uri = citation.to_uri();
        assert_eq!(uri, expected_uri);
        let reparsed = Citation::parse(&uri).unwrap();
        assert_eq!(&reparsed, citation);
    }

    #[test]
    fn line_range_uri_and_roundtrip() {
        let citation = Citation::Line {
            path: p("notes/rust/kb.md"),
            start: 12,
            end: 34,
            section: None,
        };
        assert_uri_roundtrip(&citation, "notes/rust/kb.md#L12-L34");
    }

    #[test]
    fn line_single_uri_and_roundtrip() {
        let citation = Citation::Line {
            path: p("a/b.md"),
            start: 7,
            end: 7,
            section: None,
        };
        assert_uri_roundtrip(&citation, "a/b.md#L7");
    }

    #[test]
    fn page_uri_and_roundtrip() {
        let citation = Citation::Page {
            path: p("papers/book.pdf"),
            page: 23,
            section: None,
        };
        assert_uri_roundtrip(&citation, "papers/book.pdf#p=23");
    }

    #[test]
    fn region_uri_and_roundtrip() {
        let citation = Citation::Region {
            path: p("photos/x.png"),
            x: 120,
            y: 40,
            w: 520,
            h: 180,
        };
        assert_uri_roundtrip(&citation, "photos/x.png#xywh=120,40,520,180");
    }

    #[test]
    fn caption_uri_and_roundtrip() {
        let citation = Citation::Caption {
            path: p("photos/x.png"),
            // `model` is not in the URI grammar; round-trip fills it with "".
            model: String::new(),
        };
        assert_uri_roundtrip(&citation, "photos/x.png#caption");
    }

    #[test]
    fn time_uri_and_roundtrip_with_speaker() {
        let citation = Citation::Time {
            path: p("recordings/r.m4a"),
            start_ms: 822_000,
            end_ms: 850_000,
            speaker: Some("S1".to_string()),
        };
        assert_uri_roundtrip(
            &citation,
            "recordings/r.m4a#t=00:13:42.000,00:14:10.000&speaker=S1",
        );
    }

    #[test]
    fn time_uri_and_roundtrip_without_speaker() {
        let citation = Citation::Time {
            path: p("recordings/r.m4a"),
            start_ms: 1_500,
            end_ms: 2_750,
            speaker: None,
        };
        assert_uri_roundtrip(&citation, "recordings/r.m4a#t=00:00:01.500,00:00:02.750");
    }

    #[test]
    fn parse_rejects_no_fragment() {
        assert!(Citation::parse("just/path.md").is_err());
    }

    #[test]
    fn parse_rejects_unknown_fragment() {
        assert!(Citation::parse("a.md#mystery=1").is_err());
    }

    /// With several `#` in the input, `rsplit_once('#')` leaves a `#` on
    /// the path side (e.g. a fake fragment embedded in the path). The
    /// `WorkspacePath::new` constructor rejects that at construction time.
    #[test]
    fn parse_path_with_hash_rejected_at_to_posix_layer() {
        // `notes/x#evil.md#L7`: the split strips `#L7` and leaves
        // `notes/x#evil.md` as the path, which must be refused.
        let outcome = Citation::parse("notes/x#evil.md#L7");
        assert!(outcome.is_err(), "path with embedded '#' must be rejected");
    }
}
|
||||
177
crates/kb-core/src/document.rs
Normal file
@@ -0,0 +1,177 @@
|
||||
//! CanonicalDocument, Block, SourceSpan, Inline, plus the forward-declared
|
||||
//! OCR / caption / transcript stubs (§3.4 + §3.7a).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::ids::{AssetId, BlockId, DocumentId};
|
||||
use crate::media::Lang;
|
||||
use crate::metadata::{Metadata, Provenance};
|
||||
use crate::versions::ParserVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CanonicalDocument {
|
||||
pub doc_id: DocumentId,
|
||||
pub source_asset_id: AssetId,
|
||||
pub workspace_path: WorkspacePath,
|
||||
pub title: String,
|
||||
pub lang: Lang,
|
||||
pub blocks: Vec<Block>,
|
||||
pub metadata: Metadata,
|
||||
pub provenance: Provenance,
|
||||
pub parser_version: ParserVersion,
|
||||
pub schema_version: u32,
|
||||
pub doc_version: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Block {
|
||||
Heading(HeadingBlock),
|
||||
Paragraph(TextBlock),
|
||||
List(ListBlock),
|
||||
Code(CodeBlock),
|
||||
Table(TableBlock),
|
||||
Quote(TextBlock),
|
||||
ImageRef(ImageRefBlock),
|
||||
AudioRef(AudioRefBlock),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CommonBlock {
|
||||
pub block_id: BlockId,
|
||||
pub heading_path: Vec<String>,
|
||||
pub source_span: SourceSpan,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct HeadingBlock {
|
||||
pub common: CommonBlock,
|
||||
pub level: u8,
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TextBlock {
|
||||
pub common: CommonBlock,
|
||||
pub text: String,
|
||||
pub inlines: Vec<Inline>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ListBlock {
|
||||
pub common: CommonBlock,
|
||||
pub ordered: bool,
|
||||
pub items: Vec<TextBlock>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CodeBlock {
|
||||
pub common: CommonBlock,
|
||||
pub lang: Option<String>,
|
||||
pub code: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TableBlock {
|
||||
pub common: CommonBlock,
|
||||
pub headers: Vec<String>,
|
||||
pub rows: Vec<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ImageRefBlock {
|
||||
pub common: CommonBlock,
|
||||
pub asset_id: Option<AssetId>,
|
||||
pub src: String,
|
||||
pub alt: String,
|
||||
pub ocr: Option<OcrText>,
|
||||
pub caption: Option<ModelCaption>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AudioRefBlock {
|
||||
pub common: CommonBlock,
|
||||
pub asset_id: AssetId,
|
||||
pub duration_ms: u64,
|
||||
pub transcript: Option<Transcript>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Inline {
|
||||
Text(String),
|
||||
Code(String),
|
||||
Link { text: String, href: String },
|
||||
Strong(Vec<Inline>),
|
||||
Emph(Vec<Inline>),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum SourceSpan {
|
||||
Line {
|
||||
start: u32,
|
||||
end: u32,
|
||||
},
|
||||
Byte {
|
||||
start: u64,
|
||||
end: u64,
|
||||
},
|
||||
Page {
|
||||
page: u32,
|
||||
char_start: Option<u32>,
|
||||
char_end: Option<u32>,
|
||||
},
|
||||
Region {
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
},
|
||||
Time {
|
||||
start_ms: u64,
|
||||
end_ms: u64,
|
||||
},
|
||||
}
|
||||
|
||||
// ── Forward-declared stubs (§3.7a). Bodies are final per design. ────────
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct OcrText {
|
||||
pub joined: String,
|
||||
pub regions: Vec<OcrRegion>,
|
||||
pub engine: String,
|
||||
pub engine_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct OcrRegion {
|
||||
pub bbox: (u32, u32, u32, u32),
|
||||
pub text: String,
|
||||
pub confidence: f32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelCaption {
|
||||
pub text: String,
|
||||
pub model: String,
|
||||
pub model_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Transcript {
|
||||
pub segments: Vec<TranscriptSegment>,
|
||||
pub engine: String,
|
||||
pub engine_version: String,
|
||||
pub language: Lang,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TranscriptSegment {
|
||||
pub start_ms: u64,
|
||||
pub end_ms: u64,
|
||||
pub text: String,
|
||||
pub speaker: Option<String>,
|
||||
pub confidence: Option<f32>,
|
||||
}
|
||||
15
crates/kb-core/src/errors.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
//! `CoreError` (§10).
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum CoreError {
|
||||
#[error("invalid id: {0}")]
|
||||
InvalidId(String),
|
||||
#[error("invalid citation: {0}")]
|
||||
InvalidCitation(String),
|
||||
#[error("invalid source span: {0}")]
|
||||
InvalidSpan(String),
|
||||
#[error("malformed input: {0}")]
|
||||
Malformed(String),
|
||||
}
|
||||
477
crates/kb-core/src/ids.rs
Normal file
@@ -0,0 +1,477 @@
|
||||
//! Newtype IDs (§3.1) + ID generation recipe (§4.2).
|
||||
//!
|
||||
//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the
|
||||
//! inner hex string; `FromStr` accepts 32 hex characters (mixed case) and
|
||||
//! normalizes the stored representation to lowercase so equality and hashing
|
||||
//! are canonical.
|
||||
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::document::SourceSpan;
|
||||
use crate::errors::CoreError;
|
||||
use crate::versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
|
||||
ParserVersion,
|
||||
};
|
||||
|
||||
macro_rules! newtype_id {
|
||||
($name:ident) => {
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct $name(pub String);
|
||||
|
||||
impl fmt::Display for $name {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for $name {
|
||||
type Err = CoreError;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
validate_hex32(s)?;
|
||||
Ok(Self(s.to_ascii_lowercase()))
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
newtype_id!(AssetId);
|
||||
newtype_id!(DocumentId);
|
||||
newtype_id!(BlockId);
|
||||
newtype_id!(ChunkId);
|
||||
newtype_id!(EmbeddingId);
|
||||
newtype_id!(IndexId);
|
||||
|
||||
fn validate_hex32(s: &str) -> Result<(), CoreError> {
|
||||
if s.len() != 32 {
|
||||
return Err(CoreError::InvalidId(format!(
|
||||
"expected 32 hex chars, got {}",
|
||||
s.len()
|
||||
)));
|
||||
|
claude-reviewer-01
commented
🟡 nice-to-fix —
|
||||
}
|
||||
if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
|
||||
return Err(CoreError::InvalidId(format!(
|
||||
"non-hex character in {s:?}"
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Canonical-JSON + blake3 + hex prefix 32. Per design §4.2.
|
||||
pub fn id_from<T: Serialize>(tuple: T) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(&tuple)
|
||||
.expect("canonical JSON serialization must not fail for kb-core inputs");
|
||||
// The crate exposes `to_vec` for `T: Serialize` returning `Vec<u8>`.
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..32].to_string()
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct AssetTuple<'a> {
|
||||
kind: &'static str,
|
||||
asset_blake3: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct DocTuple<'a> {
|
||||
kind: &'static str,
|
||||
workspace_path: &'a str,
|
||||
asset_id: &'a str,
|
||||
parser_version: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct BlockTuple<'a> {
|
||||
kind: &'static str,
|
||||
doc_id: &'a str,
|
||||
block_kind: &'a str,
|
||||
heading_path: &'a [String],
|
||||
ordinal: u32,
|
||||
source_span: &'a SourceSpan,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct ChunkTuple<'a> {
|
||||
kind: &'static str,
|
||||
doc_id: &'a str,
|
||||
chunker_version: &'a str,
|
||||
block_ids: Vec<&'a str>,
|
||||
policy_hash: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct EmbeddingTuple<'a> {
|
||||
kind: &'static str,
|
||||
chunk_id: &'a str,
|
||||
model_id: &'a str,
|
||||
model_version: &'a str,
|
||||
dimensions: usize,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IndexTuple<'a> {
|
||||
kind: &'static str,
|
||||
collection: &'a str,
|
||||
embedding_model: &'a str,
|
||||
dimensions: usize,
|
||||
index_version: &'a str,
|
||||
index_kind: &'a str,
|
||||
index_params_hash: &'a str,
|
||||
}
|
||||
|
||||
pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId {
|
||||
AssetId(id_from(AssetTuple {
|
||||
kind: "asset",
|
||||
asset_blake3: asset_blake3_full_hex,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_doc(
|
||||
workspace_path: &WorkspacePath,
|
||||
asset: &AssetId,
|
||||
parser_version: &ParserVersion,
|
||||
) -> DocumentId {
|
||||
DocumentId(id_from(DocTuple {
|
||||
kind: "doc",
|
||||
workspace_path: &workspace_path.0,
|
||||
asset_id: &asset.0,
|
||||
parser_version: &parser_version.0,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_block(
|
||||
doc: &DocumentId,
|
||||
block_kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
span: &SourceSpan,
|
||||
) -> BlockId {
|
||||
BlockId(id_from(BlockTuple {
|
||||
kind: "block",
|
||||
doc_id: &doc.0,
|
||||
block_kind,
|
||||
heading_path,
|
||||
ordinal,
|
||||
source_span: span,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_chunk(
|
||||
doc: &DocumentId,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
policy_hash: &str,
|
||||
) -> ChunkId {
|
||||
ChunkId(id_from(ChunkTuple {
|
||||
kind: "chunk",
|
||||
doc_id: &doc.0,
|
||||
chunker_version: &chunker_version.0,
|
||||
block_ids: block_ids.iter().map(|b| b.0.as_str()).collect(),
|
||||
policy_hash,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_embedding(
|
||||
chunk: &ChunkId,
|
||||
model: &EmbeddingModelId,
|
||||
version: &EmbeddingVersion,
|
||||
dims: usize,
|
||||
) -> EmbeddingId {
|
||||
EmbeddingId(id_from(EmbeddingTuple {
|
||||
kind: "embedding",
|
||||
chunk_id: &chunk.0,
|
||||
model_id: &model.0,
|
||||
model_version: &version.0,
|
||||
dimensions: dims,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_index(
|
||||
collection: &str,
|
||||
model: &EmbeddingModelId,
|
||||
dims: usize,
|
||||
version: &IndexVersion,
|
||||
kind: &str,
|
||||
params_hash: &str,
|
||||
) -> IndexId {
|
||||
IndexId(id_from(IndexTuple {
|
||||
kind: "index",
|
||||
collection,
|
||||
embedding_model: &model.0,
|
||||
dimensions: dims,
|
||||
index_version: &version.0,
|
||||
index_kind: kind,
|
||||
index_params_hash: params_hash,
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Parsing a valid lowercase id and displaying it yields the input back.
#[test]
fn newtype_display_roundtrip() {
    let text = "0123456789abcdef0123456789abcdef";
    let parsed = text.parse::<AssetId>().unwrap();
    assert_eq!(parsed.to_string(), text);
}
|
||||
|
||||
/// Input shorter than 32 characters is rejected.
#[test]
fn newtype_rejects_short() {
    let outcome = "abc".parse::<AssetId>();
    assert!(outcome.is_err());
}
|
||||
|
||||
/// Input of the right length (32) that contains non-hex characters is
/// rejected.
#[test]
fn newtype_rejects_non_hex() {
    let outcome = "ZZZ456789abcdef0123456789abcdef0".parse::<AssetId>();
    assert!(outcome.is_err());
}
|
||||
|
||||
/// Uppercase hex is accepted, and both the stored string and the `Display`
/// output are lowercase.
#[test]
fn newtype_accepts_uppercase_normalizes_to_lowercase() {
    let id = "0123456789ABCDEF0123456789ABCDEF"
        .parse::<AssetId>()
        .expect("uppercase hex must be accepted");

    let lowered = "0123456789abcdef0123456789abcdef";
    assert_eq!(id.0, lowered);
    assert_eq!(id.to_string(), lowered);
}
|
||||
|
||||
/// Accepting uppercase hex must not let other letters through.
#[test]
fn newtype_rejects_invalid_chars_after_uppercase_pass() {
    // 32 characters: valid uppercase hex everywhere except the `XYZ` run.
    let outcome = "DEADBEEFCAFEBAB1XYZ23456789ABCD0".parse::<AssetId>();
    assert!(outcome.is_err());
}
|
||||
|
||||
/// `id_from` is deterministic: 1000 repeated calls on the same input all
/// return the first result, which is 32 characters long.
#[test]
fn id_from_deterministic_1000() {
    #[derive(Serialize)]
    struct Sample<'a> {
        a: u32,
        b: &'a str,
    }

    let input = Sample { a: 7, b: "hello" };
    let first = id_from(&input);
    assert!((0..1000).all(|_| id_from(&input) == first));
    assert_eq!(first.len(), 32);
}
|
||||
|
||||
/// Field declaration order does not influence the id: two structs with the
/// same keys and values, declared in opposite order, hash identically
/// because canonical JSON sorts object keys.
#[test]
fn id_from_key_order_invariant() {
    #[derive(Serialize)]
    struct Forward {
        a: u32,
        b: u32,
    }

    #[derive(Serialize)]
    struct Reversed {
        b: u32,
        a: u32,
    }

    let from_forward = id_from(Forward { a: 1, b: 2 });
    let from_reversed = id_from(Reversed { b: 2, a: 1 });
    assert_eq!(from_forward, from_reversed);
}
|
||||
|
||||
/// The expected hex below is hand-computed via design §4.2:
|
||||
/// tuple = { "kind": "asset", "asset_blake3": "deadbeef" }
|
||||
/// canonical JSON (key-sorted, no whitespace, both keys are pure ASCII):
|
||||
|
claude-reviewer-01
commented
👍 strength — pinned hex test 구조가 정확
외부 도구(`b3sum`)로 계산한 hex를 literal로 박았고, 주석에 canonical-JSON 입력 + `b3sum` 명령까지 명시. self-referential 아니므로 JCS 또는 blake3 파이프라인 회귀가 진짜로 잡힘. 이게 ID recipe test의 올바른 형태.
같은 패턴을 `id_for_block` / `id_for_chunk` / `id_for_embedding` / `id_for_index`에도 4개 더 추가하면 모든 ID 함수가 외부 검증 안전망에 들어감 — P0-1 spec test plan의 "each `id_for_*` recipe matches design §4.2 byte-for-byte" 항목을 더 강하게 만족.
|
||||
/// {"asset_blake3":"deadbeef","kind":"asset"}
|
||||
/// blake3 of those bytes → hex → first 32 chars.
|
||||
/// Pinned via an independent tool (b3sum, computed once outside the code
|
||||
/// under test) so a regression in our JCS or hash pipeline is caught.
|
||||
#[test]
fn id_for_asset_pinned() {
    // Reference value computed outside the code under test:
    //   printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum
    //   → cec9353553efb238a7919d38d3e148f1...
    let expected = "cec9353553efb238a7919d38d3e148f1";
    let AssetId(actual) = id_for_asset("deadbeef");
    assert_eq!(actual, expected);
}
|
||||
|
||||
/// Pins `id_for_doc` to a fixed expected value.
///
/// Canonical JSON for the inputs below (shown wrapped; the hashed bytes
/// contain no whitespace):
///   {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d",
///    "kind":"doc",
///    "parser_version":"pulldown-cmark-0.x",
///    "workspace_path":"notes/test.md"}
#[test]
fn id_for_doc_pinned() {
    let path = WorkspacePath::new(String::from("notes/test.md")).unwrap();
    let asset = AssetId(String::from("6cb0ef0eb89c63b8b6e76ec53dca6e7d"));
    let parser = ParserVersion(String::from("pulldown-cmark-0.x"));

    let DocumentId(actual) = id_for_doc(&path, &asset, &parser);
    assert_eq!(actual, "8547fe58cb42d593fd761d77242401db");
}
|
||||
|
||||
/// Pins `id_for_block` to an externally computed value.
///
/// Inputs:
///   doc = DocumentId("aabbccdd00112233445566778899aabb"),
///   block_kind = "paragraph", heading_path = ["Intro"], ordinal = 3,
///   span = SourceSpan::Line { start: 10, end: 20 }
///
/// Reference computation:
///   printf '{"block_kind":"paragraph","doc_id":"aabbccdd00112233445566778899aabb","heading_path":["Intro"],"kind":"block","ordinal":3,"source_span":{"end":20,"kind":"line","start":10}}' \
///     | ~/.cargo/bin/b3sum --no-names | cut -c1-32
///   → 8a7bf22de7ec3293a792028c829b3812
#[test]
fn id_for_block_pinned() {
    let doc = DocumentId(String::from("aabbccdd00112233445566778899aabb"));
    let heading = vec![String::from("Intro")];
    let span = SourceSpan::Line { start: 10, end: 20 };

    // First confirm the canonical JSON itself equals the literal that was
    // hashed externally. A field rename or reorder then fails here, at the
    // JSON layer, before the less informative hex comparison below.
    let expected_json = br#"{"block_kind":"paragraph","doc_id":"aabbccdd00112233445566778899aabb","heading_path":["Intro"],"kind":"block","ordinal":3,"source_span":{"end":20,"kind":"line","start":10}}"#;
    let canonical = serde_json_canonicalizer::to_vec(&BlockTuple {
        kind: "block",
        doc_id: &doc.0,
        block_kind: "paragraph",
        heading_path: &heading,
        ordinal: 3,
        source_span: &span,
    })
    .unwrap();
    assert_eq!(canonical, expected_json);

    let BlockId(actual) = id_for_block(&doc, "paragraph", &heading, 3, &span);
    assert_eq!(actual, "8a7bf22de7ec3293a792028c829b3812");
}
|
||||
|
||||
/// Pins `id_for_chunk` to an externally computed value.
///
/// Inputs:
///   doc = DocumentId("aabbccdd00112233445566778899aabb"),
///   chunker_version = ChunkerVersion("greedy-1.0"),
///   block_ids = [BlockId("a1b2c3d4e5f6789012345678abcdef00")],
///   policy_hash = "abc123"
///
/// Reference computation:
///   printf '{"block_ids":["a1b2c3d4e5f6789012345678abcdef00"],"chunker_version":"greedy-1.0","doc_id":"aabbccdd00112233445566778899aabb","kind":"chunk","policy_hash":"abc123"}' \
///     | ~/.cargo/bin/b3sum --no-names | cut -c1-32
///   → 8809f627777fe7ca5c4433b97dd88ce9
#[test]
fn id_for_chunk_pinned() {
    let doc = DocumentId(String::from("aabbccdd00112233445566778899aabb"));
    let chunker = ChunkerVersion(String::from("greedy-1.0"));
    let blocks = vec![BlockId(String::from("a1b2c3d4e5f6789012345678abcdef00"))];

    // Check the canonical JSON against the externally hashed literal first.
    let expected_json = br#"{"block_ids":["a1b2c3d4e5f6789012345678abcdef00"],"chunker_version":"greedy-1.0","doc_id":"aabbccdd00112233445566778899aabb","kind":"chunk","policy_hash":"abc123"}"#;
    let canonical = serde_json_canonicalizer::to_vec(&ChunkTuple {
        kind: "chunk",
        doc_id: &doc.0,
        chunker_version: &chunker.0,
        block_ids: blocks.iter().map(|b| b.0.as_str()).collect(),
        policy_hash: "abc123",
    })
    .unwrap();
    assert_eq!(canonical, expected_json);

    let ChunkId(actual) = id_for_chunk(&doc, &chunker, &blocks, "abc123");
    assert_eq!(actual, "8809f627777fe7ca5c4433b97dd88ce9");
}
|
||||
|
||||
/// Pins `id_for_embedding` to an externally computed value.
///
/// Inputs:
///   chunk = ChunkId("d1e2f3a4b5c6789012345678aabbccdd"),
///   model_id = EmbeddingModelId("BAAI/bge-small-en"),
///   model_version = EmbeddingVersion("v1"), dimensions = 384
///
/// Reference computation:
///   printf '{"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd","dimensions":384,"kind":"embedding","model_id":"BAAI/bge-small-en","model_version":"v1"}' \
///     | ~/.cargo/bin/b3sum --no-names | cut -c1-32
///   → 71992c457a5da39880a6d17d646ed0fd
#[test]
fn id_for_embedding_pinned() {
    let chunk = ChunkId(String::from("d1e2f3a4b5c6789012345678aabbccdd"));
    let model = EmbeddingModelId(String::from("BAAI/bge-small-en"));
    let version = EmbeddingVersion(String::from("v1"));

    // Check the canonical JSON against the externally hashed literal first.
    let expected_json = br#"{"chunk_id":"d1e2f3a4b5c6789012345678aabbccdd","dimensions":384,"kind":"embedding","model_id":"BAAI/bge-small-en","model_version":"v1"}"#;
    let canonical = serde_json_canonicalizer::to_vec(&EmbeddingTuple {
        kind: "embedding",
        chunk_id: &chunk.0,
        model_id: &model.0,
        model_version: &version.0,
        dimensions: 384,
    })
    .unwrap();
    assert_eq!(canonical, expected_json);

    let EmbeddingId(actual) = id_for_embedding(&chunk, &model, &version, 384);
    assert_eq!(actual, "71992c457a5da39880a6d17d646ed0fd");
}
|
||||
|
||||
/// Pins `id_for_index` to an externally computed value.
///
/// Inputs:
///   collection = "default",
///   embedding_model = EmbeddingModelId("BAAI/bge-small-en"),
///   dimensions = 384, version = IndexVersion("v1"),
///   kind = "hnsw", params_hash = "xyz"
///
/// Reference computation:
///   printf '{"collection":"default","dimensions":384,"embedding_model":"BAAI/bge-small-en","index_kind":"hnsw","index_params_hash":"xyz","index_version":"v1","kind":"index"}' \
///     | ~/.cargo/bin/b3sum --no-names | cut -c1-32
///   → e733ee2f9936f0e1ac5143cdbf0f2b54
#[test]
fn id_for_index_pinned() {
    let model = EmbeddingModelId(String::from("BAAI/bge-small-en"));
    let version = IndexVersion(String::from("v1"));

    // Check the canonical JSON against the externally hashed literal first.
    let expected_json = br#"{"collection":"default","dimensions":384,"embedding_model":"BAAI/bge-small-en","index_kind":"hnsw","index_params_hash":"xyz","index_version":"v1","kind":"index"}"#;
    let canonical = serde_json_canonicalizer::to_vec(&IndexTuple {
        kind: "index",
        collection: "default",
        embedding_model: &model.0,
        dimensions: 384,
        index_version: &version.0,
        index_kind: "hnsw",
        index_params_hash: "xyz",
    })
    .unwrap();
    assert_eq!(canonical, expected_json);

    let IndexId(actual) = id_for_index("default", &model, 384, &version, "hnsw", "xyz");
    assert_eq!(actual, "e733ee2f9936f0e1ac5143cdbf0f2b54");
}
|
||||
}
|
||||
45
crates/kb-core/src/ingest.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
//! IngestReport + IngestItem (mirrored from wire §2.4).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::ids::{AssetId, DocumentId};
|
||||
use crate::traits::SourceScope;
|
||||
use crate::versions::{ChunkerVersion, ParserVersion};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IngestReport {
|
||||
pub scope: SourceScope,
|
||||
pub scanned: u32,
|
||||
pub new: u32,
|
||||
pub updated: u32,
|
||||
pub skipped: u32,
|
||||
pub errors: u32,
|
||||
pub duration_ms: u32,
|
||||
/// `None` ↔ wire `items: null` (`--summary-only`).
|
||||
pub items: Option<Vec<IngestItem>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IngestItem {
|
||||
pub kind: IngestItemKind,
|
||||
pub doc_id: Option<DocumentId>,
|
||||
pub doc_path: WorkspacePath,
|
||||
pub asset_id: Option<AssetId>,
|
||||
pub byte_len: Option<u64>,
|
||||
pub block_count: Option<u32>,
|
||||
pub chunk_count: Option<u32>,
|
||||
pub parser_version: Option<ParserVersion>,
|
||||
pub chunker_version: Option<ChunkerVersion>,
|
||||
pub warnings: Vec<String>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum IngestItemKind {
|
||||
New,
|
||||
Updated,
|
||||
Skipped,
|
||||
Error,
|
||||
}
|
||||
52
crates/kb-core/src/jobs.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
//! Job repo support types (§3.7a forward-decl, §7.2 JobRepo).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum JobKind {
|
||||
Ingest,
|
||||
Chunk,
|
||||
Embed,
|
||||
Ocr,
|
||||
Transcribe,
|
||||
Reindex,
|
||||
Doctor,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum JobStatus {
|
||||
Pending,
|
||||
Running,
|
||||
Succeeded,
|
||||
Failed,
|
||||
Canceled,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct JobId(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct JobFilter {
|
||||
pub status: Option<JobStatus>,
|
||||
pub kind: Option<JobKind>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct JobRow {
|
||||
pub job_id: JobId,
|
||||
pub kind: JobKind,
|
||||
pub status: JobStatus,
|
||||
pub payload: Value,
|
||||
pub progress: Option<Value>,
|
||||
pub error: Option<String>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub updated_at: OffsetDateTime,
|
||||
#[serde(default, with = "time::serde::rfc3339::option")]
|
||||
pub finished_at: Option<OffsetDateTime>,
|
||||
}
|
||||
70
crates/kb-core/src/lib.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
//! `kb-core` — frozen domain types, traits, and ID recipe.
|
||||
|
claude-reviewer-01
commented
👍 strength — module 분할 깔끔
`kb-core` 16 모듈이 design §3 sub-section 경계를 그대로 따라감 (`ids` / `citation` / `document` / `chunk` / `metadata` / `search` / `answer` / `ingest` / `jobs` / `vector` / `errors` / `traits` / `media` / `asset` / `versions` / `normalize`). 거대 dump 파일 없음. 가장 큰 `citation.rs`도 316 lines로 hold-in-context 가능.
`lib.rs` 자체가 70 lines, `pub mod` + 큐레이션된 re-export만 있고 로직 없음. P1-* component task가 `kb_core::*`로 단일 facade를 통해 import할 수 있어 downstream 의존성 그래프가 안 흔들림.
|
||||
//!
|
||||
//! Per design §3, §4, §7. This crate has zero dependencies on any other
|
||||
//! `kb-*` crate, so every other crate in the workspace can depend on it
|
||||
//! freely.
|
||||
//!
|
||||
//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` for
|
||||
//! the canonical type bodies — this crate is the byte-for-byte mirror.
|
||||
|
||||
pub mod ids;
|
||||
pub mod versions;
|
||||
pub mod media;
|
||||
pub mod asset;
|
||||
pub mod document;
|
||||
pub mod chunk;
|
||||
pub mod citation;
|
||||
pub mod metadata;
|
||||
pub mod search;
|
||||
pub mod answer;
|
||||
pub mod ingest;
|
||||
pub mod jobs;
|
||||
pub mod vector;
|
||||
pub mod errors;
|
||||
pub mod traits;
|
||||
pub mod normalize;
|
||||
|
||||
// Re-export the most commonly used items at the crate root, mirroring the
|
||||
// public surface listed in the task spec.
|
||||
|
||||
pub use ids::{
|
||||
AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId,
|
||||
id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding,
|
||||
id_for_index, id_from,
|
||||
};
|
||||
pub use versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
|
||||
ParserVersion, PromptTemplateVersion, SchemaVersion,
|
||||
};
|
||||
pub use media::{AudioType, Checksum, ImageType, Lang, MediaType};
|
||||
pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath};
|
||||
pub use document::{
|
||||
AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock,
|
||||
HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion,
|
||||
OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment,
|
||||
};
|
||||
pub use chunk::Chunk;
|
||||
pub use citation::Citation;
|
||||
pub use metadata::{
|
||||
Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType,
|
||||
TrustLevel,
|
||||
};
|
||||
pub use search::{
|
||||
DocFilter, DocSummary, RetrievalDetail, SearchFilters, SearchHit,
|
||||
SearchMode, SearchQuery,
|
||||
};
|
||||
pub use answer::{
|
||||
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason,
|
||||
TokenUsage, TraceId,
|
||||
};
|
||||
pub use ingest::{IngestItem, IngestItemKind, IngestReport};
|
||||
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
pub use vector::{VectorHit, VectorRecord};
|
||||
pub use errors::CoreError;
|
||||
pub use traits::{
|
||||
ChunkPolicy, Chunker, DocumentStore, Embedder, EmbeddingInput,
|
||||
EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason,
|
||||
GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector,
|
||||
SourceScope, TokenChunk, VectorStore,
|
||||
};
|
||||
pub use normalize::{nfc, to_posix};
|
||||
44
crates/kb-core/src/media.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
//! Media / file-type primitives (§3.3 + §3.7a).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Full blake3 hex (64 chars) per §3.7a. Stored as `String` for serde
|
||||
/// simplicity.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Checksum(pub String);
|
||||
|
||||
/// BCP-47 / ISO-639 language tag (e.g. "ko", "en"). §3.7a.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Lang(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ImageType {
|
||||
Png,
|
||||
Jpeg,
|
||||
Webp,
|
||||
Gif,
|
||||
Tiff,
|
||||
Other(String),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum AudioType {
|
||||
M4a,
|
||||
Mp3,
|
||||
Wav,
|
||||
Flac,
|
||||
Ogg,
|
||||
Other(String),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum MediaType {
|
||||
Markdown,
|
||||
Pdf,
|
||||
Image(ImageType),
|
||||
Audio(AudioType),
|
||||
Other(String),
|
||||
}
|
||||
68
crates/kb-core/src/metadata.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
//! Metadata + Provenance (§3.6).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Metadata {
|
||||
pub aliases: Vec<String>,
|
||||
pub tags: Vec<String>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub updated_at: OffsetDateTime,
|
||||
pub source_type: SourceType,
|
||||
pub trust_level: TrustLevel,
|
||||
pub user_id_alias: Option<String>,
|
||||
/// Frontmatter keys we don't recognise are preserved here per §0 Q9.
|
||||
pub user: Map<String, Value>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum SourceType {
|
||||
Markdown,
|
||||
Note,
|
||||
Paper,
|
||||
Reference,
|
||||
Inbox,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum TrustLevel {
|
||||
Primary,
|
||||
Secondary,
|
||||
Generated,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Provenance {
|
||||
pub events: Vec<ProvenanceEvent>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ProvenanceEvent {
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub at: OffsetDateTime,
|
||||
pub agent: String,
|
||||
pub kind: ProvenanceKind,
|
||||
pub note: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ProvenanceKind {
|
||||
Discovered,
|
||||
Parsed,
|
||||
Normalized,
|
||||
Chunked,
|
||||
OcrApplied,
|
||||
CaptionApplied,
|
||||
Transcribed,
|
||||
Embedded,
|
||||
Indexed,
|
||||
Warning,
|
||||
Error,
|
||||
}
|
||||
104
crates/kb-core/src/normalize.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
//! Path / string normalization helpers (§4.1, §6.6).
|
||||
|
||||
use std::path::{Component, Path};
|
||||
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::errors::CoreError;
|
||||
|
||||
/// NFC-normalize a UTF-8 string (§4.1).
|
||||
pub fn nfc(input: &str) -> String {
|
||||
input.nfc().collect()
|
||||
}
|
||||
|
||||
/// Collapse a path to a POSIX-relative `WorkspacePath` per §6.6:
|
||||
/// - convert all separators to `/`
|
||||
/// - strip a leading `./`
|
||||
/// - collapse repeated slashes
|
||||
/// - NFC-normalize
|
||||
///
|
||||
/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form
|
||||
/// contains `#`, since `WorkspacePath` is forbidden from colliding with
|
||||
/// the W3C-Media-Fragments separator that `Citation` URIs depend on.
|
||||
pub fn to_posix(path: &Path) -> Result<WorkspacePath, CoreError> {
|
||||
let mut out = String::new();
|
||||
let mut first = true;
|
||||
for comp in path.components() {
|
||||
match comp {
|
||||
Component::CurDir => continue,
|
||||
Component::Normal(s) => {
|
||||
if !first {
|
||||
out.push('/');
|
||||
}
|
||||
out.push_str(&s.to_string_lossy());
|
||||
first = false;
|
||||
}
|
||||
Component::ParentDir => {
|
||||
if !first {
|
||||
out.push('/');
|
||||
}
|
||||
out.push_str("..");
|
||||
first = false;
|
||||
}
|
||||
Component::RootDir => {
|
||||
if first {
|
||||
out.push('/');
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
Component::Prefix(_) => {
|
||||
// Windows drive prefixes — `to_string_lossy` keeps form.
|
||||
out.push_str(&comp.as_os_str().to_string_lossy());
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push('.');
|
||||
}
|
||||
WorkspacePath::new(nfc(&out))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A leading `./` and a doubled separator both vanish from the output.
    #[test]
    fn collapses_curdir_and_redundant_slashes() {
        // `Path::components` already skips the empty component that `//`
        // would produce, so the expected string is simply `a/b.md`.
        let input = Path::new("./a//b.md");
        let got = to_posix(input).unwrap();
        assert_eq!(got.0, "a/b.md");
    }

    /// Decomposed and precomposed Hangul inputs yield the same NFC path.
    #[test]
    fn nfc_normalizes_korean() {
        // U+1100 + U+1161 is the decomposed (NFD) spelling of U+AC00 가.
        // `to_posix` applies NFC after assembling the path, so both
        // spellings must end up as the precomposed form.
        let decomposed = "\u{1100}\u{1161}.md";
        let precomposed = "\u{AC00}.md";
        let from_decomposed = to_posix(Path::new(decomposed)).unwrap().0;
        let from_precomposed = to_posix(Path::new(precomposed)).unwrap().0;
        assert_eq!(from_decomposed, from_precomposed);
        assert_eq!(to_posix(Path::new(decomposed)).unwrap().0, "\u{AC00}.md");
    }

    /// Text that is already in NFC comes back unchanged.
    #[test]
    fn nfc_function_idempotent() {
        let already_nfc = "\u{AC00}";
        assert_eq!(nfc(already_nfc), already_nfc);
    }

    /// A `#` anywhere in the path makes `to_posix` fail.
    #[test]
    fn to_posix_rejects_hash_in_path() {
        // `#` is the W3C-Media-Fragments separator that `Citation` relies
        // on, so `WorkspacePath` refuses it at construction time.
        let input = Path::new("notes/has#hash.md");
        let err = to_posix(input).expect_err("# in path must be rejected");
        let msg = err.to_string();
        assert!(msg.contains('#'), "error message should mention '#': {msg}");
    }
}
|
||||
90
crates/kb-core/src/search.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::citation::Citation;
|
||||
use crate::ids::{ChunkId, DocumentId};
|
||||
use crate::media::Lang;
|
||||
use crate::metadata::{SourceType, TrustLevel};
|
||||
use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion};
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum SearchMode {
|
||||
Lexical,
|
||||
Vector,
|
||||
Hybrid,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SearchQuery {
|
||||
pub text: String,
|
||||
pub mode: SearchMode,
|
||||
pub k: usize,
|
||||
pub filters: SearchFilters,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SearchFilters {
|
||||
pub tags_any: Vec<String>,
|
||||
pub lang: Option<Lang>,
|
||||
pub path_glob: Option<String>,
|
||||
pub trust_min: Option<TrustLevel>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SearchHit {
|
||||
pub rank: u32,
|
||||
pub chunk_id: ChunkId,
|
||||
pub doc_id: DocumentId,
|
||||
pub doc_path: WorkspacePath,
|
||||
pub heading_path: Vec<String>,
|
||||
pub section_label: Option<String>,
|
||||
pub snippet: String,
|
||||
pub citation: Citation,
|
||||
pub retrieval: RetrievalDetail,
|
||||
pub index_version: IndexVersion,
|
||||
pub embedding_model: Option<EmbeddingModelId>,
|
||||
pub chunker_version: ChunkerVersion,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RetrievalDetail {
|
||||
pub method: SearchMode,
|
||||
pub fusion_score: f32,
|
||||
pub lexical_score: Option<f32>,
|
||||
pub vector_score: Option<f32>,
|
||||
pub lexical_rank: Option<u32>,
|
||||
pub vector_rank: Option<u32>,
|
||||
}
|
||||
|
||||
/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents).
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DocFilter {
|
||||
pub tags_any: Vec<String>,
|
||||
pub lang: Option<Lang>,
|
||||
pub path_glob: Option<String>,
|
||||
pub trust_min: Option<TrustLevel>,
|
||||
}
|
||||
|
||||
/// Internal mirror of wire `doc_summary.v1` (§2.5).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct DocSummary {
|
||||
pub doc_id: DocumentId,
|
||||
pub doc_path: WorkspacePath,
|
||||
pub title: String,
|
||||
pub lang: Lang,
|
||||
pub tags: Vec<String>,
|
||||
pub trust_level: TrustLevel,
|
||||
pub source_type: SourceType,
|
||||
pub byte_len: u64,
|
||||
pub chunk_count: u32,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub updated_at: OffsetDateTime,
|
||||
pub parser_version: ParserVersion,
|
||||
pub chunker_version: ChunkerVersion,
|
||||
}
|
||||
175
crates/kb-core/src/traits.rs
Normal file
@@ -0,0 +1,175 @@
|
||||
//! Component traits (§7) and their input helper types (§7.1).
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::asset::RawAsset;
|
||||
use crate::chunk::Chunk;
|
||||
use crate::document::{Block, CanonicalDocument};
|
||||
use crate::ids::{ChunkId, DocumentId};
|
||||
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
use crate::media::MediaType;
|
||||
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
||||
use crate::vector::{VectorHit, VectorRecord};
|
||||
use crate::versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
|
||||
};
|
||||
use crate::answer::{ModelRef, TokenUsage};
|
||||
|
||||
// ── Helper input types (§7.1) ─────────────────────────────────────────────
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SourceScope {
|
||||
pub root: PathBuf,
|
||||
pub include: Vec<String>,
|
||||
pub exclude: Vec<String>,
|
||||
}
|
||||
|
||||
/// Forward-declared (§3.7a) — concrete shape decided by extractors. P0
|
||||
/// keeps the option-of-config-file slot only.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ExtractConfig {
|
||||
pub config_path: Option<PathBuf>,
|
||||
}
|
||||
|
||||
/// Carries the raw asset bytes context to an `Extractor::extract` call.
|
||||
pub struct ExtractContext<'a> {
|
||||
pub asset: &'a RawAsset,
|
||||
pub workspace_root: &'a Path,
|
||||
pub config: &'a ExtractConfig,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ChunkPolicy {
|
||||
pub target_tokens: usize,
|
||||
pub overlap_tokens: usize,
|
||||
pub respect_markdown_headings: bool,
|
||||
pub chunker_version: ChunkerVersion,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum EmbeddingKind {
|
||||
Document,
|
||||
Query,
|
||||
}
|
||||
|
||||
pub struct EmbeddingInput<'a> {
|
||||
pub text: &'a str,
|
||||
pub kind: EmbeddingKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct GenerateRequest {
|
||||
pub system: String,
|
||||
pub user: String,
|
||||
pub stop: Vec<String>,
|
||||
pub max_tokens: usize,
|
||||
pub temperature: f32,
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case", tag = "kind")]
|
||||
pub enum TokenChunk {
|
||||
Token(String),
|
||||
Done {
|
||||
finish_reason: FinishReason,
|
||||
usage: TokenUsage,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum FinishReason {
|
||||
Stop,
|
||||
Length,
|
||||
Aborted,
|
||||
Error(String),
|
||||
}
|
||||
|
||||
// ── Traits (§7.2) ─────────────────────────────────────────────────────────
|
||||
|
||||
pub trait SourceConnector {
|
||||
fn scan(&self, scope: &SourceScope) -> anyhow::Result<Vec<RawAsset>>;
|
||||
}
|
||||
|
||||
pub trait Extractor: Send + Sync {
|
||||
fn supports(&self, media_type: &MediaType) -> bool;
|
||||
fn parser_version(&self) -> ParserVersion;
|
||||
fn extract(
|
||||
&self,
|
||||
ctx: &ExtractContext<'_>,
|
||||
bytes: &[u8],
|
||||
) -> anyhow::Result<CanonicalDocument>;
|
||||
}
|
||||
|
||||
pub trait Chunker: Send + Sync {
|
||||
fn chunker_version(&self) -> ChunkerVersion;
|
||||
fn policy_hash(&self, policy: &ChunkPolicy) -> String;
|
||||
fn chunk(
|
||||
&self,
|
||||
doc: &CanonicalDocument,
|
||||
policy: &ChunkPolicy,
|
||||
) -> anyhow::Result<Vec<Chunk>>;
|
||||
}
|
||||
|
||||
pub trait Embedder: Send + Sync {
|
||||
fn model_id(&self) -> EmbeddingModelId;
|
||||
fn model_version(&self) -> EmbeddingVersion;
|
||||
fn dimensions(&self) -> usize;
|
||||
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>>;
|
||||
}
|
||||
|
||||
pub trait Retriever: Send + Sync {
|
||||
fn search(&self, query: &SearchQuery) -> anyhow::Result<Vec<SearchHit>>;
|
||||
fn index_version(&self) -> IndexVersion;
|
||||
}
|
||||
|
||||
pub trait LanguageModel: Send + Sync {
|
||||
fn model_ref(&self) -> ModelRef;
|
||||
fn context_tokens(&self) -> usize;
|
||||
fn generate_stream(
|
||||
&self,
|
||||
req: GenerateRequest,
|
||||
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>>;
|
||||
}
|
||||
|
||||
pub trait DocumentStore {
|
||||
fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>;
|
||||
fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>;
|
||||
fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>;
|
||||
fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>;
|
||||
fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
|
||||
fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
|
||||
fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
|
||||
}
|
||||
|
||||
pub trait VectorStore {
|
||||
fn ensure_table(
|
||||
&self,
|
||||
model: &EmbeddingModelId,
|
||||
dim: usize,
|
||||
) -> anyhow::Result<crate::ids::IndexId>;
|
||||
fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>;
|
||||
fn search(
|
||||
&self,
|
||||
query_vec: &[f32],
|
||||
k: usize,
|
||||
filters: &SearchFilters,
|
||||
) -> anyhow::Result<Vec<VectorHit>>;
|
||||
}
|
||||
|
||||
pub trait JobRepo {
|
||||
fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result<JobId>;
|
||||
fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>;
|
||||
fn finish(
|
||||
&self,
|
||||
id: &JobId,
|
||||
status: JobStatus,
|
||||
error: Option<&str>,
|
||||
) -> anyhow::Result<()>;
|
||||
fn list(&self, filter: &JobFilter) -> anyhow::Result<Vec<JobRow>>;
|
||||
}
|
||||
27
crates/kb-core/src/vector.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
//! Vector store records (§7.2 VectorStore).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::ids::{ChunkId, DocumentId, EmbeddingId};
|
||||
use crate::versions::{EmbeddingModelId, EmbeddingVersion};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct VectorRecord {
|
||||
pub chunk_id: ChunkId,
|
||||
pub embedding_id: EmbeddingId,
|
||||
pub vector: Vec<f32>,
|
||||
pub doc_id: DocumentId,
|
||||
pub text: String,
|
||||
pub heading_path: Vec<String>,
|
||||
pub model_id: EmbeddingModelId,
|
||||
pub model_version: EmbeddingVersion,
|
||||
pub dimensions: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct VectorHit {
|
||||
pub chunk_id: ChunkId,
|
||||
pub score: f32,
|
||||
pub payload: Value,
|
||||
}
|
||||
27
crates/kb-core/src/versions.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
//! Version / label newtypes (§3.2).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ParserVersion(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ChunkerVersion(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EmbeddingModelId(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct EmbeddingVersion(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IndexVersion(pub String);
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct PromptTemplateVersion(pub String);
|
||||
|
||||
/// Wire schema version label (`"answer.v1"`, `"search_hit.v1"`, …).
|
||||
/// Carried as a `&'static str` because every wire type pins its label at
|
||||
/// compile time.
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SchemaVersion(pub &'static str);
|
||||
12
crates/kb-parse-types/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "kb-parse-types"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Parser intermediate representations (no parser libs allowed)"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
serde = { workspace = true }
|
||||
98
crates/kb-parse-types/src/lib.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
//! `kb-parse-types` — parser intermediate representations (§3.7b).
|
||||
//!
|
||||
//! Depends ONLY on `kb-core`. Must NOT depend on any parser library
|
||||
//! (`pulldown-cmark`, `pdf-extract`, `image`, `whisper-rs`, …) and must
|
||||
//! NOT depend on any other `kb-*` crate.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ParsedBlock {
|
||||
pub kind: ParsedBlockKind,
|
||||
pub heading_path: Vec<String>,
|
||||
pub source_span: kb_core::SourceSpan,
|
||||
pub payload: ParsedPayload,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ParsedBlockKind {
|
||||
Heading,
|
||||
Paragraph,
|
||||
List,
|
||||
Code,
|
||||
Table,
|
||||
Quote,
|
||||
ImageRef,
|
||||
AudioRef,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum ParsedPayload {
|
||||
Heading {
|
||||
level: u8,
|
||||
text: String,
|
||||
},
|
||||
Paragraph {
|
||||
text: String,
|
||||
inlines: Vec<kb_core::Inline>,
|
||||
},
|
||||
List {
|
||||
ordered: bool,
|
||||
items: Vec<Vec<kb_core::Inline>>,
|
||||
},
|
||||
Code {
|
||||
lang: Option<String>,
|
||||
code: String,
|
||||
},
|
||||
Table {
|
||||
headers: Vec<String>,
|
||||
rows: Vec<Vec<String>>,
|
||||
},
|
||||
Quote {
|
||||
text: String,
|
||||
inlines: Vec<kb_core::Inline>,
|
||||
},
|
||||
ImageRef {
|
||||
src: String,
|
||||
alt: String,
|
||||
},
|
||||
/// `duration_ms` is filled in by the extractor before chunking — see
|
||||
/// design §3.7b.
|
||||
AudioRef {
|
||||
src: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Warning {
|
||||
pub kind: WarningKind,
|
||||
pub note: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum WarningKind {
|
||||
MalformedFrontmatter,
|
||||
MalformedTable,
|
||||
EncodingFallback,
|
||||
ExtractFailed,
|
||||
}
|
||||
|
||||
// Forward-declared (P6/P7/P8). Bodies stay minimal for now.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ParsedImageRegion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ParsedPdfPage {
|
||||
pub page: u32,
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ParsedAudioSegment {
|
||||
pub start_ms: u64,
|
||||
pub end_ms: u64,
|
||||
pub text: String,
|
||||
}
|
||||
12
docs/spec/ai-generation-guidelines.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# AI generation guidelines
|
||||
|
||||
When implementing tasks against this codebase:
|
||||
|
||||
- Treat the frozen design doc as the single source of truth. Do not invent
|
||||
new fields, traits, or enum variants.
|
||||
- Prefer editing existing files to creating new ones; reuse types from
|
||||
`kb-core` instead of duplicating shapes.
|
||||
- For each task, follow the task spec under `tasks/p<N>/p<N>-<i>.md`.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §11 + §12.
|
||||
7
docs/spec/canonical-document.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# CanonicalDocument
|
||||
|
||||
Medium-agnostic representation of a document with `Block`s, `SourceSpan`s,
|
||||
and provenance.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.4 + §3.7a.
|
||||
8
docs/spec/chunk-policy.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Chunk policy
|
||||
|
||||
`ChunkPolicy` carries `target_tokens`, `overlap_tokens`,
|
||||
`respect_markdown_headings`, and `chunker_version`. Chunkers expose a
|
||||
`policy_hash` so chunk IDs include the policy.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.5 + §7.1 + §7.2.
|
||||
7
docs/spec/citation-policy.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Citation policy
|
||||
|
||||
Citations use W3C Media Fragments URIs to locate evidence inside a
|
||||
document. Five variants: `Line`, `Page`, `Region`, `Caption`, `Time`.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.5 + §0 Q3.
|
||||
6
docs/spec/domain-model.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Domain model
|
||||
|
||||
The domain types live in `kb-core` and mirror the frozen design exactly.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §3.
|
||||
6
docs/spec/ids.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# ID recipe
|
||||
|
||||
All `kb-*` IDs are 32 hex characters: the first 32 characters of the hex-encoded `blake3(canonical_json(tuple))` digest.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §4.
|
||||
8
docs/spec/module-boundaries.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Module boundaries
|
||||
|
||||
`kb-core` is the leaf crate — every other crate depends on it. Parsers depend on
|
||||
`kb-parse-types` (not on `kb-normalize`); `kb-normalize` depends on
|
||||
`kb-parse-types` (not on parsers). UI crates depend only on `kb-app`.
|
||||
|
||||
Canonical source:
|
||||
[docs/superpowers/specs/2026-04-27-kb-final-form-design.md](../superpowers/specs/2026-04-27-kb-final-form-design.md), §8.
|
||||
31
docs/wire-schema/v1/answer.schema.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/answer.schema.json",
|
||||
"title": "Answer v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required fields per design §2.3.",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"answer",
|
||||
"citations",
|
||||
"grounded",
|
||||
"model",
|
||||
"prompt_template_version",
|
||||
"retrieval",
|
||||
"usage",
|
||||
"created_at"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": { "const": "answer.v1" },
|
||||
"answer": { "type": "string" },
|
||||
"citations": { "type": "array" },
|
||||
"grounded": { "type": "boolean" },
|
||||
"refusal_reason": { "type": ["string", "null"] },
|
||||
"model": { "type": "object" },
|
||||
"embedding": { "type": ["object", "null"] },
|
||||
"prompt_template_version": { "type": "string" },
|
||||
"retrieval": { "type": "object" },
|
||||
"usage": { "type": "object" },
|
||||
"created_at": { "type": "string" }
|
||||
}
|
||||
}
|
||||
32
docs/wire-schema/v1/chunk_inspection.schema.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/chunk_inspection.schema.json",
|
||||
"title": "ChunkInspection v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required fields per design §2.6.",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"chunk_id",
|
||||
"doc_id",
|
||||
"doc_path",
|
||||
"heading_path",
|
||||
"text",
|
||||
"source_spans",
|
||||
"block_ids",
|
||||
"token_estimate",
|
||||
"chunker_version"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": { "const": "chunk_inspection.v1" },
|
||||
"chunk_id": { "type": "string" },
|
||||
"doc_id": { "type": "string" },
|
||||
"doc_path": { "type": "string" },
|
||||
"heading_path": { "type": "array", "items": { "type": "string" } },
|
||||
"text": { "type": "string" },
|
||||
"source_spans": { "type": "array" },
|
||||
"block_ids": { "type": "array", "items": { "type": "string" } },
|
||||
"token_estimate": { "type": "integer", "minimum": 0 },
|
||||
"chunker_version": { "type": "string" },
|
||||
"embeddings": { "type": "array" }
|
||||
}
|
||||
}
|
||||
19
docs/wire-schema/v1/citation.schema.json
Normal file
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/citation.schema.json",
|
||||
"title": "Citation v1",
|
||||
"description": "Stub schema — declares the schema_version label and the always-present fields. Variant-discriminated property validation lands in a later phase.",
|
||||
"type": "object",
|
||||
"required": ["schema_version", "kind", "path", "uri"],
|
||||
"properties": {
|
||||
"schema_version": { "const": "citation.v1" },
|
||||
"kind": { "enum": ["line", "page", "region", "caption", "time"] },
|
||||
"path": { "type": "string" },
|
||||
"uri": { "type": "string" },
|
||||
"line": { "type": "object" },
|
||||
"page": { "type": "object" },
|
||||
"region": { "type": "object" },
|
||||
"caption": { "type": "object" },
|
||||
"time": { "type": "object" }
|
||||
}
|
||||
}
|
||||
39
docs/wire-schema/v1/doc_summary.schema.json
Normal file
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/doc_summary.schema.json",
|
||||
"title": "DocSummary v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required fields per design §2.5.",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"doc_id",
|
||||
"doc_path",
|
||||
"title",
|
||||
"lang",
|
||||
"tags",
|
||||
"trust_level",
|
||||
"source_type",
|
||||
"byte_len",
|
||||
"chunk_count",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
"parser_version",
|
||||
"chunker_version"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": { "const": "doc_summary.v1" },
|
||||
"doc_id": { "type": "string" },
|
||||
"doc_path": { "type": "string" },
|
||||
"title": { "type": "string" },
|
||||
"lang": { "type": "string" },
|
||||
"tags": { "type": "array", "items": { "type": "string" } },
|
||||
"trust_level": { "type": "string" },
|
||||
"source_type": { "type": "string" },
|
||||
"byte_len": { "type": "integer", "minimum": 0 },
|
||||
"chunk_count": { "type": "integer", "minimum": 0 },
|
||||
"created_at": { "type": "string" },
|
||||
"updated_at": { "type": "string" },
|
||||
"parser_version": { "type": "string" },
|
||||
"chunker_version": { "type": "string" }
|
||||
}
|
||||
}
|
||||
25
docs/wire-schema/v1/doctor.schema.json
Normal file
@@ -0,0 +1,25 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/doctor.schema.json",
|
||||
"title": "DoctorReport v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required fields per design §2.7.",
|
||||
"type": "object",
|
||||
"required": ["schema_version", "ok", "checks"],
|
||||
"properties": {
|
||||
"schema_version": { "const": "doctor.v1" },
|
||||
"ok": { "type": "boolean" },
|
||||
"checks": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["name", "ok", "detail"],
|
||||
"properties": {
|
||||
"name": { "type": "string" },
|
||||
"ok": { "type": "boolean" },
|
||||
"detail": { "type": "string" },
|
||||
"hint": { "type": ["string", "null"] }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
28
docs/wire-schema/v1/ingest_report.schema.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/ingest_report.schema.json",
|
||||
"title": "IngestReport v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required fields per design §2.4.",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"scope",
|
||||
"scanned",
|
||||
"new",
|
||||
"updated",
|
||||
"skipped",
|
||||
"errors",
|
||||
"duration_ms"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": { "const": "ingest_report.v1" },
|
||||
"scope": { "type": "object" },
|
||||
"scanned": { "type": "integer", "minimum": 0 },
|
||||
"new": { "type": "integer", "minimum": 0 },
|
||||
"updated": { "type": "integer", "minimum": 0 },
|
||||
"skipped": { "type": "integer", "minimum": 0 },
|
||||
"errors": { "type": "integer", "minimum": 0 },
|
||||
"duration_ms": { "type": "integer", "minimum": 0 },
|
||||
"items": { "type": ["array", "null"] }
|
||||
}
|
||||
}
|
||||
38
docs/wire-schema/v1/search_hit.schema.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://kb.local/wire/v1/search_hit.schema.json",
|
||||
"title": "SearchHit v1",
|
||||
"description": "Stub schema — declares the schema_version label and the required top-level fields per design §2.2.",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"schema_version",
|
||||
"rank",
|
||||
"score",
|
||||
"chunk_id",
|
||||
"doc_id",
|
||||
"doc_path",
|
||||
"heading_path",
|
||||
"snippet",
|
||||
"citation",
|
||||
"retrieval",
|
||||
"index_version",
|
||||
"chunker_version"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": { "const": "search_hit.v1" },
|
||||
"rank": { "type": "integer", "minimum": 1 },
|
||||
"score": { "type": "number" },
|
||||
"chunk_id": { "type": "string" },
|
||||
"doc_id": { "type": "string" },
|
||||
"doc_path": { "type": "string" },
|
||||
"heading_path": { "type": "array", "items": { "type": "string" } },
|
||||
"section_label": { "type": ["string", "null"] },
|
||||
"snippet": { "type": "string" },
|
||||
"snippet_full_text": { "type": "boolean" },
|
||||
"citation": { "type": "object" },
|
||||
"retrieval": { "type": "object" },
|
||||
"index_version": { "type": "string" },
|
||||
"embedding_model": { "type": ["string", "null"] },
|
||||
"chunker_version": { "type": "string" }
|
||||
}
|
||||
}
|
||||
0
fixtures/audio/.gitkeep
Normal file
0
fixtures/embed/.gitkeep
Normal file
0
fixtures/eval/.gitkeep
Normal file
0
fixtures/image/.gitkeep
Normal file
12
fixtures/markdown/code-and-table.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Code And Table
|
||||
|
||||
```rust
|
||||
fn main() {
|
||||
println!("hi");
|
||||
}
|
||||
```
|
||||
|
||||
| col a | col b |
|
||||
|-------|-------|
|
||||
| 1 | 2 |
|
||||
| 3 | 4 |
|
||||
15
fixtures/markdown/nested-headings.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# Top
|
||||
|
||||
intro
|
||||
|
||||
## Section A
|
||||
|
||||
body of A
|
||||
|
||||
### Sub A.1
|
||||
|
||||
deeper
|
||||
|
||||
## Section B
|
||||
|
||||
body of B
|
||||
3
fixtures/markdown/simple-note.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Simple Note
|
||||
|
||||
A short paragraph.
|
||||
0
fixtures/pdf/.gitkeep
Normal file
0
fixtures/rag/.gitkeep
Normal file
0
fixtures/search/hybrid/.gitkeep
Normal file
0
fixtures/search/lexical/.gitkeep
Normal file
0
fixtures/source-fs/.gitkeep
Normal file
0
fixtures/vector/.gitkeep
Normal file
15
migrations/V001__init.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
-- V001__init.sql — schema bootstrap.
-- Per design §5.1 + §5.9. Only the meta + migrations tables land here;
-- data tables (assets, documents, blocks, chunks, fts5, …) ship in later
-- phase-specific migrations (P1-6 / P2-1 / P3-3).

-- String key/value pairs.
-- `key` is declared NOT NULL explicitly: in SQLite a non-INTEGER PRIMARY KEY
-- column on an ordinary rowid table otherwise accepts NULL values.
CREATE TABLE schema_meta (
    key   TEXT PRIMARY KEY NOT NULL,
    value TEXT NOT NULL
);

-- NOTE(review): presumably one row per applied migration — confirm against
-- the migration runner. `id` is an INTEGER PRIMARY KEY, i.e. the rowid.
CREATE TABLE migrations (
    id          INTEGER PRIMARY KEY,
    applied_at  TEXT NOT NULL,
    description TEXT NOT NULL
);
|
||||
🟡 minor — `Result<Option<WorkerGuard>>`이지만 항상 `Ok(Some(_))`
현재 함수는 `Err`도 `Ok(None)`도 반환하지 않음 (`fs::create_dir_all` 실패 시만 `Err`). `Option`은 죽은 surface. caller (`kb-cli/src/main.rs:159`)가 `.ok().flatten()`으로 두 layer를 한꺼번에 떠는 형태도 같은 noise.
옵션: `Result<WorkerGuard>`로 단순화 (caller도 `.ok()` 한 번만).
현 phase 작동에는 영향 없으므로 후속 cleanup으로 OK.