p0-1: workspace + kb-core domain types, traits, and ID recipe
Stand up the Cargo workspace (Rust 2024 / resolver=3) with the kb-core
crate per the frozen design (§3, §4, §7, §10). kb-core has zero
deps on other kb-* crates and exposes:
- Newtype IDs (AssetId / DocumentId / BlockId / ChunkId / EmbeddingId /
IndexId) with Display + FromStr; FromStr rejects anything but 32 lowercase hex digits.
- id_from + id_for_{asset,doc,block,chunk,embedding,index} per §4.2;
pinned hex test values computed via an independent JCS+blake3 tool.
- CanonicalDocument, Block (8 variants), SourceSpan, Inline (§3.4).
- Citation (5 variants) with W3C Media Fragments to_uri / parse;
round-trip property holds for every variant.
- Metadata + Provenance (§3.6); SearchQuery / SearchHit / RetrievalDetail
(§3.7); DocFilter / DocSummary mirrors of wire §2.5.
- Answer / AnswerCitation / RefusalReason / ModelRef (§3.8).
- IngestReport, JobRepo support types, VectorRecord / VectorHit.
- Component traits (SourceConnector / Extractor / Chunker / Embedder /
Retriever / LanguageModel / DocumentStore / VectorStore / JobRepo)
plus their input helpers (SourceScope / ExtractContext / ChunkPolicy
/ EmbeddingInput / GenerateRequest / TokenChunk / FinishReason).
- CoreError (§10).
- nfc + to_posix helpers (§4.1, §6.6).
20 unit tests cover ID determinism (1000-run regression), key-order
invariance, two pinned hex values, newtype rejection of bad input,
Citation round-trip for all 5 variants, and to_posix collapsing +
Korean NFC.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1 +1,4 @@
|
||||
.superpowers/
|
||||
/target/
|
||||
**/*.rs.bk
|
||||
Cargo.lock.bak
|
||||
|
||||
937
Cargo.lock
generated
Normal file
937
Cargo.lock
generated
Normal file
@@ -0,0 +1,937 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
"anstyle-query",
|
||||
"anstyle-wincon",
|
||||
"colorchoice",
|
||||
"is_terminal_polyfill",
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
|
||||
dependencies = [
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"once_cell_polyfill",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.102"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
|
||||
|
||||
[[package]]
|
||||
name = "arrayref"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||
|
||||
[[package]]
|
||||
name = "blake3"
|
||||
version = "1.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"arrayvec",
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"constant_time_eq",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.61"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
||||
|
||||
[[package]]
|
||||
name = "constant_time_eq"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs"
|
||||
version = "5.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
|
||||
dependencies = [
|
||||
"dirs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"option-ext",
|
||||
"redox_users",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
||||
|
||||
[[package]]
|
||||
name = "kb-app"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-cli"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"kb-app",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-config"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"dirs",
|
||||
"kb-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-core"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_json_canonicalizer",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-types"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"kb-core",
|
||||
"serde",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.186"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||
|
||||
[[package]]
|
||||
name = "libredox"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell_polyfill"
|
||||
version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"libredox",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||
|
||||
[[package]]
|
||||
name = "ryu-js"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.149"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"serde",
|
||||
"serde_core",
|
||||
"zmij",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json_canonicalizer"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe52319a927259afbfa5180c5157cd8167edfd3e8c254f9558c7fef44c5649f2"
|
||||
dependencies = [
|
||||
"ryu-js",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.15.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "symlink"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.117"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||
dependencies = [
|
||||
"thiserror-impl 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
|
||||
dependencies = [
|
||||
"thiserror-impl 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde_core",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
|
||||
dependencies = [
|
||||
"tinyvec_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec_macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_write",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_write"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-appender"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"symlink",
|
||||
"thiserror 2.0.18",
|
||||
"time",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
||||
dependencies = [
|
||||
"log",
|
||||
"once_cell",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
version = "0.1.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
|
||||
dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.1+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.61.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.48.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zmij"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||
26
Cargo.toml
Normal file
26
Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[workspace]
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/kb-core",
|
||||
"crates/kb-parse-types",
|
||||
"crates/kb-config",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
edition = "2024"
|
||||
rust-version = "1.85"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/altair823/kb"
|
||||
version = "0.1.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1"
|
||||
thiserror = "2"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
|
||||
uuid = { version = "1", features = ["v7", "serde"] }
|
||||
blake3 = "1"
|
||||
tracing = "0.1"
|
||||
18
crates/kb-core/Cargo.toml
Normal file
18
crates/kb-core/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "kb-core"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "kb domain types, traits, and ID recipe (no other kb-* deps)"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
blake3 = { workspace = true }
|
||||
serde_json_canonicalizer = "0.3"
|
||||
unicode-normalization = "0.1"
|
||||
66
crates/kb-core/src/answer.rs
Normal file
66
crates/kb-core/src/answer.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
//! Answer + RAG types (§3.8).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::citation::Citation;
|
||||
use crate::search::SearchMode;
|
||||
use crate::versions::PromptTemplateVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Answer {
|
||||
pub answer: String,
|
||||
pub citations: Vec<AnswerCitation>,
|
||||
pub grounded: bool,
|
||||
pub refusal_reason: Option<RefusalReason>,
|
||||
pub model: ModelRef,
|
||||
pub embedding: Option<ModelRef>,
|
||||
pub prompt_template_version: PromptTemplateVersion,
|
||||
pub retrieval: AnswerRetrievalSummary,
|
||||
pub usage: TokenUsage,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub created_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AnswerCitation {
|
||||
pub marker: Option<String>,
|
||||
pub citation: Citation,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum RefusalReason {
|
||||
ScoreGate,
|
||||
LlmSelfJudge,
|
||||
NoIndex,
|
||||
NoChunks,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelRef {
|
||||
pub id: String,
|
||||
pub provider: String,
|
||||
pub dimensions: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AnswerRetrievalSummary {
|
||||
pub trace_id: TraceId,
|
||||
pub mode: SearchMode,
|
||||
pub k: usize,
|
||||
pub score_gate: f32,
|
||||
pub top_score: f32,
|
||||
pub chunks_returned: u32,
|
||||
pub chunks_used: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TokenUsage {
|
||||
pub prompt_tokens: u32,
|
||||
pub completion_tokens: u32,
|
||||
pub latency_ms: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TraceId(pub String);
|
||||
42
crates/kb-core/src/asset.rs
Normal file
42
crates/kb-core/src/asset.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
//! Raw asset, source URI, workspace path (§3.3).
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::ids::AssetId;
|
||||
use crate::media::{Checksum, MediaType};
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind", content = "value")]
|
||||
pub enum SourceUri {
|
||||
File(PathBuf),
|
||||
/// `kb://` virtual reference.
|
||||
Kb(String),
|
||||
}
|
||||
|
||||
/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
|
||||
/// produced via `crate::normalize::to_posix`.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspacePath(pub String);
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum AssetStorage {
|
||||
Copied { path: PathBuf },
|
||||
Reference { path: PathBuf, sha: Checksum },
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RawAsset {
|
||||
pub asset_id: AssetId,
|
||||
pub source_uri: SourceUri,
|
||||
pub workspace_path: WorkspacePath,
|
||||
pub media_type: MediaType,
|
||||
pub byte_len: u64,
|
||||
pub checksum: Checksum,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub discovered_at: OffsetDateTime,
|
||||
pub stored: AssetStorage,
|
||||
}
|
||||
19
crates/kb-core/src/chunk.rs
Normal file
19
crates/kb-core/src/chunk.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! Chunk (§3.5).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::document::SourceSpan;
|
||||
use crate::ids::{BlockId, ChunkId, DocumentId};
|
||||
use crate::versions::ChunkerVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Chunk {
|
||||
pub chunk_id: ChunkId,
|
||||
pub doc_id: DocumentId,
|
||||
pub block_ids: Vec<BlockId>,
|
||||
pub text: String,
|
||||
pub heading_path: Vec<String>,
|
||||
pub source_spans: Vec<SourceSpan>,
|
||||
pub token_estimate: usize,
|
||||
pub chunker_version: ChunkerVersion,
|
||||
}
|
||||
316
crates/kb-core/src/citation.rs
Normal file
316
crates/kb-core/src/citation.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
//! Citation (§3.5) — discriminated 5-variant. Each variant has a canonical
|
||||
//! W3C Media Fragments URI per design §0 Q3.
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Citation {
|
||||
Line {
|
||||
path: WorkspacePath,
|
||||
start: u32,
|
||||
end: u32,
|
||||
section: Option<String>,
|
||||
},
|
||||
Page {
|
||||
path: WorkspacePath,
|
||||
page: u32,
|
||||
section: Option<String>,
|
||||
},
|
||||
Region {
|
||||
path: WorkspacePath,
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
},
|
||||
Caption {
|
||||
path: WorkspacePath,
|
||||
model: String,
|
||||
},
|
||||
Time {
|
||||
path: WorkspacePath,
|
||||
start_ms: u64,
|
||||
end_ms: u64,
|
||||
speaker: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
impl Citation {
|
||||
pub fn path(&self) -> &WorkspacePath {
|
||||
match self {
|
||||
Citation::Line { path, .. }
|
||||
| Citation::Page { path, .. }
|
||||
| Citation::Region { path, .. }
|
||||
| Citation::Caption { path, .. }
|
||||
| Citation::Time { path, .. } => path,
|
||||
}
|
||||
}
|
||||
|
||||
/// Emit a W3C Media Fragments URI per design §0 Q3.
|
||||
/// `section` and `speaker` and `caption.model` are NOT part of the URI
|
||||
/// fragment; they live in the structured wire object.
|
||||
pub fn to_uri(&self) -> String {
|
||||
match self {
|
||||
Citation::Line { path, start, end, .. } => {
|
||||
if start == end {
|
||||
format!("{}#L{}", path.0, start)
|
||||
} else {
|
||||
format!("{}#L{}-L{}", path.0, start, end)
|
||||
}
|
||||
}
|
||||
Citation::Page { path, page, .. } => format!("{}#p={}", path.0, page),
|
||||
Citation::Region {
|
||||
path, x, y, w, h, ..
|
||||
} => format!("{}#xywh={},{},{},{}", path.0, x, y, w, h),
|
||||
Citation::Caption { path, .. } => format!("{}#caption", path.0),
|
||||
Citation::Time {
|
||||
path,
|
||||
start_ms,
|
||||
end_ms,
|
||||
speaker,
|
||||
} => {
|
||||
let s = format_hms_ms(*start_ms);
|
||||
let e = format_hms_ms(*end_ms);
|
||||
match speaker {
|
||||
Some(sp) => format!("{}#t={},{}&speaker={}", path.0, s, e, sp),
|
||||
None => format!("{}#t={},{}", path.0, s, e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Strict inverse of `to_uri`. The `section` / `caption.model` fields
|
||||
/// are not part of the URI grammar, so a parsed Citation will have
|
||||
/// `section = None` and `model = ""` for the relevant variants.
|
||||
/// Round-trip property holds for citations whose non-URI fields are at
|
||||
/// their default values (see test).
|
||||
pub fn parse(s: &str) -> Result<Self> {
|
||||
let (path_str, frag) = match s.rsplit_once('#') {
|
||||
Some(t) => t,
|
||||
None => bail!("citation has no '#' fragment: {s:?}"),
|
||||
};
|
||||
let path = WorkspacePath(path_str.to_owned());
|
||||
|
||||
if let Some(rest) = frag.strip_prefix("L") {
|
||||
// line range: `L<a>` or `L<a>-L<b>`
|
||||
if let Some((a, b)) = rest.split_once("-L") {
|
||||
let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
|
||||
let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start,
|
||||
end,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start: n,
|
||||
end: n,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("p=") {
|
||||
let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
|
||||
return Ok(Citation::Page {
|
||||
path,
|
||||
page,
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("xywh=") {
|
||||
let parts: Vec<&str> = rest.split(',').collect();
|
||||
if parts.len() != 4 {
|
||||
bail!("xywh= expects 4 comma-separated values: {rest:?}");
|
||||
}
|
||||
let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
|
||||
let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
|
||||
let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
|
||||
let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
|
||||
return Ok(Citation::Region { path, x, y, w, h });
|
||||
}
|
||||
if frag == "caption" {
|
||||
return Ok(Citation::Caption {
|
||||
path,
|
||||
model: String::new(),
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("t=") {
|
||||
// `t=<start>,<end>` optionally followed by `&speaker=<id>`
|
||||
let (range, speaker) = match rest.split_once('&') {
|
||||
Some((r, kv)) => match kv.strip_prefix("speaker=") {
|
||||
Some(sp) => (r, Some(sp.to_owned())),
|
||||
None => bail!("unknown time-fragment param: {kv:?}"),
|
||||
},
|
||||
None => (rest, None),
|
||||
};
|
||||
let (s_str, e_str) = match range.split_once(',') {
|
||||
Some(t) => t,
|
||||
None => bail!("time fragment expects '<start>,<end>': {range:?}"),
|
||||
};
|
||||
let start_ms = parse_hms_ms(s_str)?;
|
||||
let end_ms = parse_hms_ms(e_str)?;
|
||||
return Ok(Citation::Time {
|
||||
path,
|
||||
start_ms,
|
||||
end_ms,
|
||||
speaker,
|
||||
});
|
||||
}
|
||||
bail!("unrecognised citation fragment: {frag:?}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Render a millisecond count as `hh:mm:ss.mmm` (W3C Media Fragments
/// NPT-with-ms).
///
/// Hours are zero-padded to at least two digits and grow as needed;
/// minutes, seconds, and milliseconds are fixed-width.
fn format_hms_ms(ms: u64) -> String {
    let total_seconds = ms / 1000;
    let millis = ms % 1000;
    let seconds = total_seconds % 60;
    let minutes = (total_seconds / 60) % 60;
    let hours = total_seconds / 3600;
    format!("{hours:02}:{minutes:02}:{seconds:02}.{millis:03}")
}
|
||||
|
||||
fn parse_hms_ms(s: &str) -> Result<u64> {
|
||||
// Accept `hh:mm:ss.mmm` (the form we emit). Reject malformed input.
|
||||
let parts: Vec<&str> = s.split(':').collect();
|
||||
if parts.len() != 3 {
|
||||
bail!("time component expects hh:mm:ss.mmm, got {s:?}");
|
||||
}
|
||||
let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?;
|
||||
let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?;
|
||||
let (sec, ms) = match parts[2].split_once('.') {
|
||||
Some((s_part, ms_part)) => {
|
||||
let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
|
||||
// Pad/truncate to exactly 3 digits.
|
||||
let mut ms_str = ms_part.to_owned();
|
||||
while ms_str.len() < 3 {
|
||||
ms_str.push('0');
|
||||
}
|
||||
ms_str.truncate(3);
|
||||
let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?;
|
||||
(sec, ms)
|
||||
}
|
||||
None => {
|
||||
let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
|
||||
(sec, 0)
|
||||
}
|
||||
};
|
||||
Ok(h * 3_600_000 + m * 60_000 + sec * 1000 + ms)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `WorkspacePath` from a literal.
    fn ws(path: &str) -> WorkspacePath {
        WorkspacePath(path.to_owned())
    }

    /// Checks that `citation` renders to `expected_uri` and that parsing
    /// the rendered URI yields an equal citation.
    fn assert_uri_roundtrip(citation: &Citation, expected_uri: &str) {
        assert_eq!(citation.to_uri(), expected_uri);
        let parsed = Citation::parse(&citation.to_uri()).unwrap();
        assert_eq!(&parsed, citation);
    }

    #[test]
    fn line_range_uri_and_roundtrip() {
        let c = Citation::Line {
            path: ws("notes/rust/kb.md"),
            start: 12,
            end: 34,
            section: None,
        };
        assert_uri_roundtrip(&c, "notes/rust/kb.md#L12-L34");
    }

    #[test]
    fn line_single_uri_and_roundtrip() {
        let c = Citation::Line {
            path: ws("a/b.md"),
            start: 7,
            end: 7,
            section: None,
        };
        assert_uri_roundtrip(&c, "a/b.md#L7");
    }

    #[test]
    fn page_uri_and_roundtrip() {
        let c = Citation::Page {
            path: ws("papers/book.pdf"),
            page: 23,
            section: None,
        };
        assert_uri_roundtrip(&c, "papers/book.pdf#p=23");
    }

    #[test]
    fn region_uri_and_roundtrip() {
        let c = Citation::Region {
            path: ws("photos/x.png"),
            x: 120,
            y: 40,
            w: 520,
            h: 180,
        };
        assert_uri_roundtrip(&c, "photos/x.png#xywh=120,40,520,180");
    }

    #[test]
    fn caption_uri_and_roundtrip() {
        let c = Citation::Caption {
            path: ws("photos/x.png"),
            // `model` is not in the URI grammar; round-trip fills it with "".
            model: String::new(),
        };
        assert_uri_roundtrip(&c, "photos/x.png#caption");
    }

    #[test]
    fn time_uri_and_roundtrip_with_speaker() {
        let c = Citation::Time {
            path: ws("recordings/r.m4a"),
            start_ms: 822_000,
            end_ms: 850_000,
            speaker: Some("S1".to_string()),
        };
        assert_uri_roundtrip(
            &c,
            "recordings/r.m4a#t=00:13:42.000,00:14:10.000&speaker=S1",
        );
    }

    #[test]
    fn time_uri_and_roundtrip_without_speaker() {
        let c = Citation::Time {
            path: ws("recordings/r.m4a"),
            start_ms: 1_500,
            end_ms: 2_750,
            speaker: None,
        };
        assert_uri_roundtrip(&c, "recordings/r.m4a#t=00:00:01.500,00:00:02.750");
    }

    #[test]
    fn parse_rejects_no_fragment() {
        assert!(Citation::parse("just/path.md").is_err());
    }

    #[test]
    fn parse_rejects_unknown_fragment() {
        assert!(Citation::parse("a.md#mystery=1").is_err());
    }
}
|
||||
177
crates/kb-core/src/document.rs
Normal file
177
crates/kb-core/src/document.rs
Normal file
@@ -0,0 +1,177 @@
|
||||
//! CanonicalDocument, Block, SourceSpan, Inline, plus the forward-declared
|
||||
//! OCR / caption / transcript stubs (§3.4 + §3.7a).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::ids::{AssetId, BlockId, DocumentId};
|
||||
use crate::media::Lang;
|
||||
use crate::metadata::{Metadata, Provenance};
|
||||
use crate::versions::ParserVersion;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CanonicalDocument {
|
||||
pub doc_id: DocumentId,
|
||||
pub source_asset_id: AssetId,
|
||||
pub workspace_path: WorkspacePath,
|
||||
pub title: String,
|
||||
pub lang: Lang,
|
||||
pub blocks: Vec<Block>,
|
||||
pub metadata: Metadata,
|
||||
pub provenance: Provenance,
|
||||
pub parser_version: ParserVersion,
|
||||
pub schema_version: u32,
|
||||
pub doc_version: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Block {
|
||||
Heading(HeadingBlock),
|
||||
Paragraph(TextBlock),
|
||||
List(ListBlock),
|
||||
Code(CodeBlock),
|
||||
Table(TableBlock),
|
||||
Quote(TextBlock),
|
||||
ImageRef(ImageRefBlock),
|
||||
AudioRef(AudioRefBlock),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CommonBlock {
|
||||
pub block_id: BlockId,
|
||||
pub heading_path: Vec<String>,
|
||||
pub source_span: SourceSpan,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct HeadingBlock {
|
||||
pub common: CommonBlock,
|
||||
pub level: u8,
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TextBlock {
|
||||
pub common: CommonBlock,
|
||||
pub text: String,
|
||||
pub inlines: Vec<Inline>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ListBlock {
|
||||
pub common: CommonBlock,
|
||||
pub ordered: bool,
|
||||
pub items: Vec<TextBlock>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct CodeBlock {
|
||||
pub common: CommonBlock,
|
||||
pub lang: Option<String>,
|
||||
pub code: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TableBlock {
|
||||
pub common: CommonBlock,
|
||||
pub headers: Vec<String>,
|
||||
pub rows: Vec<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ImageRefBlock {
|
||||
pub common: CommonBlock,
|
||||
pub asset_id: Option<AssetId>,
|
||||
pub src: String,
|
||||
pub alt: String,
|
||||
pub ocr: Option<OcrText>,
|
||||
pub caption: Option<ModelCaption>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct AudioRefBlock {
|
||||
pub common: CommonBlock,
|
||||
pub asset_id: AssetId,
|
||||
pub duration_ms: u64,
|
||||
pub transcript: Option<Transcript>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Inline {
|
||||
Text(String),
|
||||
Code(String),
|
||||
Link { text: String, href: String },
|
||||
Strong(Vec<Inline>),
|
||||
Emph(Vec<Inline>),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum SourceSpan {
|
||||
Line {
|
||||
start: u32,
|
||||
end: u32,
|
||||
},
|
||||
Byte {
|
||||
start: u64,
|
||||
end: u64,
|
||||
},
|
||||
Page {
|
||||
page: u32,
|
||||
char_start: Option<u32>,
|
||||
char_end: Option<u32>,
|
||||
},
|
||||
Region {
|
||||
x: u32,
|
||||
y: u32,
|
||||
w: u32,
|
||||
h: u32,
|
||||
},
|
||||
Time {
|
||||
start_ms: u64,
|
||||
end_ms: u64,
|
||||
},
|
||||
}
|
||||
|
||||
// ── Forward-declared stubs (§3.7a). Bodies are final per design. ────────
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct OcrText {
|
||||
pub joined: String,
|
||||
pub regions: Vec<OcrRegion>,
|
||||
pub engine: String,
|
||||
pub engine_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct OcrRegion {
|
||||
pub bbox: (u32, u32, u32, u32),
|
||||
pub text: String,
|
||||
pub confidence: f32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ModelCaption {
|
||||
pub text: String,
|
||||
pub model: String,
|
||||
pub model_version: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Transcript {
|
||||
pub segments: Vec<TranscriptSegment>,
|
||||
pub engine: String,
|
||||
pub engine_version: String,
|
||||
pub language: Lang,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct TranscriptSegment {
|
||||
pub start_ms: u64,
|
||||
pub end_ms: u64,
|
||||
pub text: String,
|
||||
pub speaker: Option<String>,
|
||||
pub confidence: Option<f32>,
|
||||
}
|
||||
15
crates/kb-core/src/errors.rs
Normal file
15
crates/kb-core/src/errors.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
//! `CoreError` (§10).
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum CoreError {
|
||||
#[error("invalid id: {0}")]
|
||||
InvalidId(String),
|
||||
#[error("invalid citation: {0}")]
|
||||
InvalidCitation(String),
|
||||
#[error("invalid source span: {0}")]
|
||||
InvalidSpan(String),
|
||||
#[error("malformed input: {0}")]
|
||||
Malformed(String),
|
||||
}
|
||||
303
crates/kb-core/src/ids.rs
Normal file
303
crates/kb-core/src/ids.rs
Normal file
@@ -0,0 +1,303 @@
|
||||
//! Newtype IDs (§3.1) + ID generation recipe (§4.2).
|
||||
//!
|
||||
//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the
|
||||
//! inner hex string; `FromStr` rejects strings that are not exactly 32
|
||||
//! lowercase hex characters.
|
||||
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::document::SourceSpan;
|
||||
use crate::errors::CoreError;
|
||||
use crate::versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
|
||||
ParserVersion,
|
||||
};
|
||||
|
||||
/// Declares a string-backed ID newtype with `Display` and a validating
/// `FromStr`.
///
/// Only `FromStr` enforces the 32-lowercase-hex rule. The public tuple
/// field and the derived `Deserialize` both accept arbitrary strings.
macro_rules! newtype_id {
    ($name:ident) => {
        #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
        pub struct $name(pub String);

        impl fmt::Display for $name {
            /// Writes the inner hex string unchanged.
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                f.write_str(self.0.as_str())
            }
        }

        impl FromStr for $name {
            type Err = CoreError;

            /// Accepts exactly 32 lowercase hex characters.
            fn from_str(s: &str) -> Result<Self, Self::Err> {
                validate_hex32(s)?;
                Ok(Self(s.to_owned()))
            }
        }
    };
}
|
||||
|
||||
newtype_id!(AssetId);
|
||||
newtype_id!(DocumentId);
|
||||
newtype_id!(BlockId);
|
||||
newtype_id!(ChunkId);
|
||||
newtype_id!(EmbeddingId);
|
||||
newtype_id!(IndexId);
|
||||
|
||||
fn validate_hex32(s: &str) -> Result<(), CoreError> {
|
||||
if s.len() != 32 {
|
||||
return Err(CoreError::InvalidId(format!(
|
||||
"expected 32 hex chars, got {}",
|
||||
s.len()
|
||||
)));
|
||||
}
|
||||
if !s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f')) {
|
||||
return Err(CoreError::InvalidId(format!(
|
||||
"non-lowercase-hex character in {s:?}"
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Canonical-JSON + blake3 + hex prefix 32. Per design §4.2.
|
||||
pub fn id_from<T: Serialize>(tuple: T) -> String {
|
||||
let bytes = serde_json_canonicalizer::to_vec(&tuple)
|
||||
.expect("canonical JSON serialization must not fail for kb-core inputs");
|
||||
// The crate exposes `to_vec` for `T: Serialize` returning `Vec<u8>`.
|
||||
let hex = blake3::hash(&bytes).to_hex().to_string();
|
||||
hex[..32].to_string()
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct AssetTuple<'a> {
|
||||
kind: &'static str,
|
||||
asset_blake3: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct DocTuple<'a> {
|
||||
kind: &'static str,
|
||||
workspace_path: &'a str,
|
||||
asset_id: &'a str,
|
||||
parser_version: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct BlockTuple<'a> {
|
||||
kind: &'static str,
|
||||
doc_id: &'a str,
|
||||
block_kind: &'a str,
|
||||
heading_path: &'a [String],
|
||||
ordinal: u32,
|
||||
source_span: &'a SourceSpan,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct ChunkTuple<'a> {
|
||||
kind: &'static str,
|
||||
doc_id: &'a str,
|
||||
chunker_version: &'a str,
|
||||
block_ids: Vec<&'a str>,
|
||||
policy_hash: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct EmbeddingTuple<'a> {
|
||||
kind: &'static str,
|
||||
chunk_id: &'a str,
|
||||
model_id: &'a str,
|
||||
model_version: &'a str,
|
||||
dimensions: usize,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IndexTuple<'a> {
|
||||
kind: &'static str,
|
||||
collection: &'a str,
|
||||
embedding_model: &'a str,
|
||||
dimensions: usize,
|
||||
index_version: &'a str,
|
||||
index_kind: &'a str,
|
||||
index_params_hash: &'a str,
|
||||
}
|
||||
|
||||
pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId {
|
||||
AssetId(id_from(AssetTuple {
|
||||
kind: "asset",
|
||||
asset_blake3: asset_blake3_full_hex,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_doc(
|
||||
workspace_path: &WorkspacePath,
|
||||
asset: &AssetId,
|
||||
parser_version: &ParserVersion,
|
||||
) -> DocumentId {
|
||||
DocumentId(id_from(DocTuple {
|
||||
kind: "doc",
|
||||
workspace_path: &workspace_path.0,
|
||||
asset_id: &asset.0,
|
||||
parser_version: &parser_version.0,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_block(
|
||||
doc: &DocumentId,
|
||||
block_kind: &str,
|
||||
heading_path: &[String],
|
||||
ordinal: u32,
|
||||
span: &SourceSpan,
|
||||
) -> BlockId {
|
||||
BlockId(id_from(BlockTuple {
|
||||
kind: "block",
|
||||
doc_id: &doc.0,
|
||||
block_kind,
|
||||
heading_path,
|
||||
ordinal,
|
||||
source_span: span,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_chunk(
|
||||
doc: &DocumentId,
|
||||
chunker_version: &ChunkerVersion,
|
||||
block_ids: &[BlockId],
|
||||
policy_hash: &str,
|
||||
) -> ChunkId {
|
||||
ChunkId(id_from(ChunkTuple {
|
||||
kind: "chunk",
|
||||
doc_id: &doc.0,
|
||||
chunker_version: &chunker_version.0,
|
||||
block_ids: block_ids.iter().map(|b| b.0.as_str()).collect(),
|
||||
policy_hash,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_embedding(
|
||||
chunk: &ChunkId,
|
||||
model: &EmbeddingModelId,
|
||||
version: &EmbeddingVersion,
|
||||
dims: usize,
|
||||
) -> EmbeddingId {
|
||||
EmbeddingId(id_from(EmbeddingTuple {
|
||||
kind: "embedding",
|
||||
chunk_id: &chunk.0,
|
||||
model_id: &model.0,
|
||||
model_version: &version.0,
|
||||
dimensions: dims,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn id_for_index(
|
||||
collection: &str,
|
||||
model: &EmbeddingModelId,
|
||||
dims: usize,
|
||||
version: &IndexVersion,
|
||||
kind: &str,
|
||||
params_hash: &str,
|
||||
) -> IndexId {
|
||||
IndexId(id_from(IndexTuple {
|
||||
kind: "index",
|
||||
collection,
|
||||
embedding_model: &model.0,
|
||||
dimensions: dims,
|
||||
index_version: &version.0,
|
||||
index_kind: kind,
|
||||
index_params_hash: params_hash,
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn newtype_display_roundtrip() {
        let hex = "0123456789abcdef0123456789abcdef";
        let id = AssetId::from_str(hex).unwrap();
        assert_eq!(id.to_string(), hex);
    }

    #[test]
    fn newtype_rejects_short() {
        assert!("abc".parse::<AssetId>().is_err());
    }

    #[test]
    fn newtype_rejects_non_hex() {
        assert!("ZZZ456789abcdef0123456789abcdef0".parse::<AssetId>().is_err());
    }

    #[test]
    fn newtype_rejects_uppercase() {
        assert!("0123456789ABCDEF0123456789ABCDEF".parse::<AssetId>().is_err());
    }

    /// Determinism: 1000 runs of `id_from` over the same input yield the same
    /// hex.
    #[test]
    fn id_from_deterministic_1000() {
        #[derive(Serialize)]
        struct T<'a> {
            a: u32,
            b: &'a str,
        }
        let input = T { a: 7, b: "hello" };
        let first = id_from(&input);
        for _ in 0..1000 {
            let again = id_from(&input);
            assert_eq!(again, first);
        }
        assert_eq!(first.len(), 32);
    }

    /// Key order in the source struct does not affect hash (canonical JSON
    /// sorts keys alphabetically).
    #[test]
    fn id_from_key_order_invariant() {
        #[derive(Serialize)]
        struct A {
            a: u32,
            b: u32,
        }
        #[derive(Serialize)]
        struct B {
            b: u32,
            a: u32,
        }
        let from_a = id_from(A { a: 1, b: 2 });
        let from_b = id_from(B { b: 2, a: 1 });
        assert_eq!(from_a, from_b);
    }

    /// Pinned value for `id_for_asset`, per design §4.2:
    ///   tuple = { "kind": "asset", "asset_blake3": "deadbeef" }
    ///   canonical JSON (key-sorted, no whitespace, both keys pure ASCII):
    ///     {"asset_blake3":"deadbeef","kind":"asset"}
    ///   blake3 of those bytes → hex → first 32 chars.
    /// The expected value was computed once with an independent tool
    /// (b3sum), outside the code under test, so a regression in the JCS or
    /// hash pipeline is caught.
    #[test]
    fn id_for_asset_pinned() {
        // printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum
        //   → cec9353553efb238a7919d38d3e148f1...
        let id = id_for_asset("deadbeef");
        assert_eq!(id.0, "cec9353553efb238a7919d38d3e148f1");
    }

    /// Independent pin for id_for_doc.
    ///   canonical JSON:
    ///     {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d",
    ///      "kind":"doc",
    ///      "parser_version":"pulldown-cmark-0.x",
    ///      "workspace_path":"notes/test.md"}
    ///   (concatenated, no whitespace).
    #[test]
    fn id_for_doc_pinned() {
        let path = WorkspacePath("notes/test.md".to_string());
        let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
        let pv = ParserVersion("pulldown-cmark-0.x".to_string());
        let id = id_for_doc(&path, &asset, &pv);
        assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");
    }
}
|
||||
45
crates/kb-core/src/ingest.rs
Normal file
45
crates/kb-core/src/ingest.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
//! IngestReport + IngestItem (mirrored from wire §2.4).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::ids::{AssetId, DocumentId};
|
||||
use crate::traits::SourceScope;
|
||||
use crate::versions::{ChunkerVersion, ParserVersion};
|
||||
|
||||
/// Result summary of one ingest run (mirrored from wire §2.4).
///
/// Holds the scope that was ingested, a set of `u32` counters and,
/// optionally, one [`IngestItem`] per processed entry.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestReport {
    /// Scope (root + include/exclude patterns) this report refers to.
    pub scope: SourceScope,
    // Counters. `new` / `updated` / `skipped` / `errors` line up with the
    // `IngestItemKind` variants; presumably they partition `scanned` —
    // TODO confirm against the producer.
    pub scanned: u32,
    pub new: u32,
    pub updated: u32,
    pub skipped: u32,
    pub errors: u32,
    /// Run duration in milliseconds (unit taken from the field name).
    pub duration_ms: u32,
    /// `None` ↔ wire `items: null` (`--summary-only`).
    pub items: Option<Vec<IngestItem>>,
}
|
||||
|
||||
/// One per-document entry of an [`IngestReport`].
///
/// Only `kind`, `doc_path` and `warnings` are always present; every other
/// field is optional. Presumably the optional fields are unset for outcomes
/// that never got far enough to produce them — confirm against the producer.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestItem {
    /// Outcome recorded for this entry.
    pub kind: IngestItemKind,
    pub doc_id: Option<DocumentId>,
    /// Workspace-relative path of the entry.
    pub doc_path: WorkspacePath,
    pub asset_id: Option<AssetId>,
    pub byte_len: Option<u64>,
    pub block_count: Option<u32>,
    pub chunk_count: Option<u32>,
    pub parser_version: Option<ParserVersion>,
    pub chunker_version: Option<ChunkerVersion>,
    /// Free-form warning messages; may be empty.
    pub warnings: Vec<String>,
    /// Free-form error message, if any.
    pub error: Option<String>,
}
|
||||
|
||||
/// Outcome tag of an [`IngestItem`].
///
/// Serialized as the lowercase variant name (`"new"`, `"updated"`,
/// `"skipped"`, `"error"`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum IngestItemKind {
    New,
    Updated,
    Skipped,
    Error,
}
|
||||
52
crates/kb-core/src/jobs.rs
Normal file
52
crates/kb-core/src/jobs.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
//! Job repo support types (§3.7a forward-decl, §7.2 JobRepo).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Category of a job tracked through `JobRepo`.
///
/// Serialized as the lowercase variant name (e.g. `"ingest"`, `"ocr"`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum JobKind {
    Ingest,
    Chunk,
    Embed,
    Ocr,
    Transcribe,
    Reindex,
    Doctor,
}
|
||||
|
||||
/// Lifecycle state of a job.
///
/// Serialized as the lowercase variant name (e.g. `"pending"`,
/// `"canceled"`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum JobStatus {
    Pending,
    Running,
    Succeeded,
    Failed,
    Canceled,
}
|
||||
|
||||
/// Opaque job identifier, stored as a plain `String`.
///
/// No format is enforced by this type.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct JobId(pub String);
|
||||
|
||||
/// Optional criteria passed to `JobRepo::list`.
///
/// `Default` leaves both fields `None`; presumably a `None` field places no
/// restriction on that attribute — confirm against the `JobRepo`
/// implementation.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct JobFilter {
    pub status: Option<JobStatus>,
    pub kind: Option<JobKind>,
}
|
||||
|
||||
/// One job record as returned by `JobRepo::list`.
///
/// Timestamps are (de)serialized as RFC 3339 strings.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct JobRow {
    pub job_id: JobId,
    pub kind: JobKind,
    pub status: JobStatus,
    /// Arbitrary JSON supplied when the job was created.
    pub payload: Value,
    /// Arbitrary JSON progress blob, if one has been recorded.
    pub progress: Option<Value>,
    /// Free-form error message, if any.
    pub error: Option<String>,
    #[serde(with = "time::serde::rfc3339")]
    pub created_at: OffsetDateTime,
    #[serde(with = "time::serde::rfc3339")]
    pub updated_at: OffsetDateTime,
    // `default` lets a missing `finished_at` key deserialize as `None`.
    #[serde(default, with = "time::serde::rfc3339::option")]
    pub finished_at: Option<OffsetDateTime>,
}
|
||||
70
crates/kb-core/src/lib.rs
Normal file
70
crates/kb-core/src/lib.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
//! `kb-core` — frozen domain types, traits, and ID recipe.
|
||||
//!
|
||||
//! Per design §3, §4, §7. This crate has zero dependencies on any other
|
||||
//! `kb-*` crate, so every other crate in the workspace can depend on it
|
||||
//! freely.
|
||||
//!
|
||||
//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` for
|
||||
//! the canonical type bodies — this crate is the byte-for-byte mirror.
|
||||
|
||||
pub mod ids;
|
||||
pub mod versions;
|
||||
pub mod media;
|
||||
pub mod asset;
|
||||
pub mod document;
|
||||
pub mod chunk;
|
||||
pub mod citation;
|
||||
pub mod metadata;
|
||||
pub mod search;
|
||||
pub mod answer;
|
||||
pub mod ingest;
|
||||
pub mod jobs;
|
||||
pub mod vector;
|
||||
pub mod errors;
|
||||
pub mod traits;
|
||||
pub mod normalize;
|
||||
|
||||
// Re-export the most commonly used items at the crate root, mirroring the
|
||||
// public surface listed in the task spec.
|
||||
|
||||
pub use ids::{
|
||||
AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId,
|
||||
id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding,
|
||||
id_for_index, id_from,
|
||||
};
|
||||
pub use versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
|
||||
ParserVersion, PromptTemplateVersion, SchemaVersion,
|
||||
};
|
||||
pub use media::{AudioType, Checksum, ImageType, Lang, MediaType};
|
||||
pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath};
|
||||
pub use document::{
|
||||
AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock,
|
||||
HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion,
|
||||
OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment,
|
||||
};
|
||||
pub use chunk::Chunk;
|
||||
pub use citation::Citation;
|
||||
pub use metadata::{
|
||||
Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType,
|
||||
TrustLevel,
|
||||
};
|
||||
pub use search::{
|
||||
DocFilter, DocSummary, RetrievalDetail, SearchFilters, SearchHit,
|
||||
SearchMode, SearchQuery,
|
||||
};
|
||||
pub use answer::{
|
||||
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason,
|
||||
TokenUsage, TraceId,
|
||||
};
|
||||
pub use ingest::{IngestItem, IngestReport};
|
||||
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
pub use vector::{VectorHit, VectorRecord};
|
||||
pub use errors::CoreError;
|
||||
pub use traits::{
|
||||
ChunkPolicy, Chunker, DocumentStore, Embedder, EmbeddingInput,
|
||||
EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason,
|
||||
GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector,
|
||||
SourceScope, TokenChunk, VectorStore,
|
||||
};
|
||||
pub use normalize::{nfc, to_posix};
|
||||
44
crates/kb-core/src/media.rs
Normal file
44
crates/kb-core/src/media.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
//! Media / file-type primitives (§3.3 + §3.7a).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Full blake3 hex (64 chars) per §3.7a. Stored as `String` for serde
/// simplicity.
///
/// The length and hex alphabet are not validated by this type.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct Checksum(pub String);
|
||||
|
||||
/// BCP-47 / ISO-639 language tag (e.g. "ko", "en"). §3.7a.
///
/// The tag is stored verbatim; this type performs no validation.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct Lang(pub String);
|
||||
|
||||
/// Image format of an asset.
///
/// Unit variants serialize as their lowercase name (e.g. `"png"`);
/// `Other` carries a free-form label for formats not listed here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageType {
    Png,
    Jpeg,
    Webp,
    Gif,
    Tiff,
    Other(String),
}
|
||||
|
||||
/// Audio format of an asset.
///
/// Unit variants serialize as their lowercase name (e.g. `"m4a"`);
/// `Other` carries a free-form label for formats not listed here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AudioType {
    M4a,
    Mp3,
    Wav,
    Flac,
    Ogg,
    Other(String),
}
|
||||
|
||||
/// Top-level media classification of an asset.
///
/// `Image` and `Audio` nest the concrete format; `Other` carries a
/// free-form label. Variant names serialize in lowercase.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum MediaType {
    Markdown,
    Pdf,
    Image(ImageType),
    Audio(AudioType),
    Other(String),
}
|
||||
68
crates/kb-core/src/metadata.rs
Normal file
68
crates/kb-core/src/metadata.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
//! Metadata + Provenance (§3.6).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Document-level metadata (§3.6).
///
/// Timestamps are (de)serialized as RFC 3339 strings.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
    pub aliases: Vec<String>,
    pub tags: Vec<String>,
    #[serde(with = "time::serde::rfc3339")]
    pub created_at: OffsetDateTime,
    #[serde(with = "time::serde::rfc3339")]
    pub updated_at: OffsetDateTime,
    pub source_type: SourceType,
    pub trust_level: TrustLevel,
    pub user_id_alias: Option<String>,
    /// Frontmatter keys we don't recognise are preserved here per §0 Q9.
    pub user: Map<String, Value>,
}
|
||||
|
||||
/// Source classification recorded in [`Metadata`] and `DocSummary`.
///
/// Serialized as the lowercase variant name (e.g. `"markdown"`, `"inbox"`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SourceType {
    Markdown,
    Note,
    Paper,
    Reference,
    Inbox,
}
|
||||
|
||||
/// Trust classification of a document.
///
/// Serialized as the lowercase variant name (`"primary"`, `"secondary"`,
/// `"generated"`).
///
/// NOTE(review): the search and document filters expose a `trust_min`
/// field, which implies an ordering between levels, but this enum derives
/// neither `PartialOrd` nor `Ord` — confirm where and how that comparison
/// is defined.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum TrustLevel {
    Primary,
    Secondary,
    Generated,
}
|
||||
|
||||
/// Provenance record (§3.6): a list of [`ProvenanceEvent`]s.
///
/// Presumably kept in chronological order — this type does not enforce it;
/// verify against the code that appends events.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Provenance {
    pub events: Vec<ProvenanceEvent>,
}
|
||||
|
||||
/// A single entry in a [`Provenance`] log.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ProvenanceEvent {
    /// Event timestamp, (de)serialized as an RFC 3339 string.
    #[serde(with = "time::serde::rfc3339")]
    pub at: OffsetDateTime,
    /// Free-form label of whatever produced the event.
    pub agent: String,
    pub kind: ProvenanceKind,
    /// Optional free-form note.
    pub note: Option<String>,
}
|
||||
|
||||
/// Kind tag of a [`ProvenanceEvent`].
///
/// Serialized in `snake_case`, so multi-word variants become e.g.
/// `"ocr_applied"` and `"caption_applied"`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProvenanceKind {
    Discovered,
    Parsed,
    Normalized,
    Chunked,
    OcrApplied,
    CaptionApplied,
    Transcribed,
    Embedded,
    Indexed,
    Warning,
    Error,
}
|
||||
86
crates/kb-core/src/normalize.rs
Normal file
86
crates/kb-core/src/normalize.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
//! Path / string normalization helpers (§4.1, §6.6).
|
||||
|
||||
use std::path::{Component, Path};
|
||||
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
|
||||
/// NFC-normalize a UTF-8 string (§4.1).
|
||||
pub fn nfc(input: &str) -> String {
|
||||
input.nfc().collect()
|
||||
}
|
||||
|
||||
/// Collapse a path to a POSIX-relative `WorkspacePath` per §6.6:
|
||||
/// - convert all separators to `/`
|
||||
/// - strip a leading `./`
|
||||
/// - collapse repeated slashes
|
||||
/// - NFC-normalize
|
||||
pub fn to_posix(path: &Path) -> WorkspacePath {
|
||||
let mut out = String::new();
|
||||
let mut first = true;
|
||||
for comp in path.components() {
|
||||
match comp {
|
||||
Component::CurDir => continue,
|
||||
Component::Normal(s) => {
|
||||
if !first {
|
||||
out.push('/');
|
||||
}
|
||||
out.push_str(&s.to_string_lossy());
|
||||
first = false;
|
||||
}
|
||||
Component::ParentDir => {
|
||||
if !first {
|
||||
out.push('/');
|
||||
}
|
||||
out.push_str("..");
|
||||
first = false;
|
||||
}
|
||||
Component::RootDir => {
|
||||
if first {
|
||||
out.push('/');
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
Component::Prefix(_) => {
|
||||
// Windows drive prefixes — `to_string_lossy` keeps form.
|
||||
out.push_str(&comp.as_os_str().to_string_lossy());
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if out.is_empty() {
|
||||
out.push_str(".");
|
||||
}
|
||||
WorkspacePath(nfc(&out))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A leading `./` and a doubled separator must both disappear.
    #[test]
    fn collapses_curdir_and_redundant_slashes() {
        // On POSIX `Path::components` already skips the empty component
        // behind `//`; the documented example must come out as `a/b.md`.
        let collapsed = to_posix(Path::new("./a//b.md"));
        assert_eq!(collapsed.0, "a/b.md");
    }

    /// Decomposed and precomposed Hangul yield the same `WorkspacePath`.
    #[test]
    fn nfc_normalizes_korean() {
        // U+1100 + U+1161 (conjoining jamo, the decomposed spelling) and
        // U+AC00 (가, the precomposed syllable) are canonically equivalent.
        // `to_posix` applies NFC after joining the components, so both
        // spellings must produce the precomposed form.
        let decomposed = Path::new("\u{1100}\u{1161}.md");
        let precomposed = Path::new("\u{AC00}.md");
        assert_eq!(to_posix(decomposed).0, to_posix(precomposed).0);
        assert_eq!(to_posix(decomposed).0, "\u{AC00}.md");
    }

    /// Input that is already NFC passes through `nfc` unchanged.
    #[test]
    fn nfc_function_idempotent() {
        let precomposed = "\u{AC00}";
        assert_eq!(nfc(precomposed), precomposed);
    }
}
|
||||
90
crates/kb-core/src/search.rs
Normal file
90
crates/kb-core/src/search.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::citation::Citation;
|
||||
use crate::ids::{ChunkId, DocumentId};
|
||||
use crate::media::Lang;
|
||||
use crate::metadata::{SourceType, TrustLevel};
|
||||
use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion};
|
||||
|
||||
/// Retrieval strategy selector, used both on a query ([`SearchQuery`]) and
/// on a result ([`RetrievalDetail`]).
///
/// Serialized as the lowercase variant name (`"lexical"`, `"vector"`,
/// `"hybrid"`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SearchMode {
    Lexical,
    Vector,
    Hybrid,
}
|
||||
|
||||
/// Input to `Retriever::search` (§3.7).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchQuery {
    /// Query text.
    pub text: String,
    pub mode: SearchMode,
    /// Presumably the maximum number of hits to return — TODO confirm.
    pub k: usize,
    pub filters: SearchFilters,
}
|
||||
|
||||
/// Optional restrictions attached to a [`SearchQuery`] and passed to
/// `VectorStore::search`.
///
/// `Default` yields an empty tag list and `None` everywhere else. The exact
/// matching semantics of each field are defined by the retriever / store
/// implementations, not by this type.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchFilters {
    // Presumably "match if the document has any of these tags" — confirm.
    pub tags_any: Vec<String>,
    pub lang: Option<Lang>,
    // Glob pattern kept as an uncompiled string.
    pub path_glob: Option<String>,
    pub trust_min: Option<TrustLevel>,
}
|
||||
|
||||
/// One search result (§3.7): the matched chunk, where it lives, how it was
/// retrieved, and the versions that produced it.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchHit {
    pub rank: u32,
    pub chunk_id: ChunkId,
    pub doc_id: DocumentId,
    pub doc_path: WorkspacePath,
    pub heading_path: Vec<String>,
    pub section_label: Option<String>,
    pub snippet: String,
    pub citation: Citation,
    /// Score / rank breakdown for this hit.
    pub retrieval: RetrievalDetail,
    pub index_version: IndexVersion,
    // Optional; presumably absent for purely lexical hits — TODO confirm.
    pub embedding_model: Option<EmbeddingModelId>,
    pub chunker_version: ChunkerVersion,
}
|
||||
|
||||
/// Score and rank breakdown attached to a [`SearchHit`].
///
/// `fusion_score` is always present; the lexical / vector scores and ranks
/// are optional. Presumably each is populated only when that retrieval leg
/// contributed to the hit — confirm against the retriever.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RetrievalDetail {
    pub method: SearchMode,
    pub fusion_score: f32,
    pub lexical_score: Option<f32>,
    pub vector_score: Option<f32>,
    pub lexical_rank: Option<u32>,
    pub vector_rank: Option<u32>,
}
|
||||
|
||||
/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents).
///
/// Has the same four fields as [`SearchFilters`]. `Default` yields an empty
/// tag list and `None` everywhere else.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct DocFilter {
    pub tags_any: Vec<String>,
    pub lang: Option<Lang>,
    // Glob pattern kept as an uncompiled string.
    pub path_glob: Option<String>,
    pub trust_min: Option<TrustLevel>,
}
|
||||
|
||||
/// Internal mirror of wire `doc_summary.v1` (§2.5).
///
/// Returned by `DocumentStore::list_documents`. Timestamps are
/// (de)serialized as RFC 3339 strings.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DocSummary {
    pub doc_id: DocumentId,
    pub doc_path: WorkspacePath,
    pub title: String,
    pub lang: Lang,
    pub tags: Vec<String>,
    pub trust_level: TrustLevel,
    pub source_type: SourceType,
    pub byte_len: u64,
    pub chunk_count: u32,
    #[serde(with = "time::serde::rfc3339")]
    pub created_at: OffsetDateTime,
    #[serde(with = "time::serde::rfc3339")]
    pub updated_at: OffsetDateTime,
    pub parser_version: ParserVersion,
    pub chunker_version: ChunkerVersion,
}
|
||||
175
crates/kb-core/src/traits.rs
Normal file
175
crates/kb-core/src/traits.rs
Normal file
@@ -0,0 +1,175 @@
|
||||
//! Component traits (§7) and their input helper types (§7.1).
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::asset::RawAsset;
|
||||
use crate::chunk::Chunk;
|
||||
use crate::document::{Block, CanonicalDocument};
|
||||
use crate::ids::{ChunkId, DocumentId};
|
||||
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
||||
use crate::media::MediaType;
|
||||
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
||||
use crate::vector::{VectorHit, VectorRecord};
|
||||
use crate::versions::{
|
||||
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
|
||||
};
|
||||
use crate::answer::{ModelRef, TokenUsage};
|
||||
|
||||
// ── Helper input types (§7.1) ─────────────────────────────────────────────
|
||||
|
||||
/// What a `SourceConnector::scan` call should cover: a root directory plus
/// include / exclude pattern lists.
///
/// The patterns are stored as plain strings; their syntax is defined by the
/// connector. `Default` is an empty root with no patterns.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SourceScope {
    pub root: PathBuf,
    pub include: Vec<String>,
    pub exclude: Vec<String>,
}
|
||||
|
||||
/// Forward-declared (§3.7a) — concrete shape decided by extractors. P0
/// keeps the option-of-config-file slot only.
///
/// `Default` leaves `config_path` as `None`.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct ExtractConfig {
    /// Optional path to an extractor configuration file.
    pub config_path: Option<PathBuf>,
}
|
||||
|
||||
/// Borrowed context handed to `Extractor::extract` alongside the raw bytes:
/// the asset being extracted, the workspace root and the extractor
/// configuration.
///
/// All fields are references, so the context cannot outlive the values it
/// points at.
pub struct ExtractContext<'a> {
    pub asset: &'a RawAsset,
    pub workspace_root: &'a Path,
    pub config: &'a ExtractConfig,
}
|
||||
|
||||
/// Parameters passed to `Chunker::chunk` and `Chunker::policy_hash`.
///
/// Sizes are expressed in tokens (per the field names); which tokenizer
/// defines a "token" is up to the chunker.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ChunkPolicy {
    pub target_tokens: usize,
    pub overlap_tokens: usize,
    pub respect_markdown_headings: bool,
    pub chunker_version: ChunkerVersion,
}
|
||||
|
||||
/// Tells an embedder whether an [`EmbeddingInput`] is document text or a
/// query.
///
/// Serialized as `"document"` / `"query"`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingKind {
    Document,
    Query,
}
|
||||
|
||||
/// One borrowed text to embed, tagged with its [`EmbeddingKind`].
///
/// Passed in slices to `Embedder::embed`.
pub struct EmbeddingInput<'a> {
    pub text: &'a str,
    pub kind: EmbeddingKind,
}
|
||||
|
||||
/// Request passed by value to `LanguageModel::generate_stream`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct GenerateRequest {
    /// System prompt text.
    pub system: String,
    /// User prompt text.
    pub user: String,
    /// Stop sequences; may be empty.
    pub stop: Vec<String>,
    pub max_tokens: usize,
    pub temperature: f32,
    /// Optional sampling seed.
    pub seed: Option<u64>,
}
|
||||
|
||||
/// Item yielded by the iterator that `LanguageModel::generate_stream`
/// returns: either a piece of generated text or the terminal `Done` record.
///
/// NOTE(review): the enum is internally tagged (`tag = "kind"`). serde's
/// internally tagged representation cannot carry a newtype variant that
/// wraps a bare string, so (de)serializing `Token(String)` is expected to
/// fail at runtime even though the derive compiles. Confirm with a
/// round-trip test before relying on the serde impls; a fix would change
/// either the wire shape or the variant shape.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "kind")]
pub enum TokenChunk {
    /// A piece of generated text.
    Token(String),
    /// End-of-stream record carrying the finish reason and token usage.
    Done {
        finish_reason: FinishReason,
        usage: TokenUsage,
    },
}
|
||||
|
||||
/// Why a generation stream ended; carried by `TokenChunk::Done`.
///
/// Variant names serialize in `snake_case`; `Error` carries a free-form
/// message.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
    Stop,
    Length,
    Aborted,
    Error(String),
}
|
||||
|
||||
// ── Traits (§7.2) ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Discovers raw assets within a [`SourceScope`].
///
/// NOTE(review): unlike `Extractor` / `Chunker` / `Embedder` / `Retriever`
/// / `LanguageModel`, this trait has no `Send + Sync` supertrait — confirm
/// that is intentional.
pub trait SourceConnector {
    /// Scan `scope` and return the assets found, or an error.
    fn scan(&self, scope: &SourceScope) -> anyhow::Result<Vec<RawAsset>>;
}
|
||||
|
||||
/// Turns the raw bytes of an asset into a [`CanonicalDocument`].
pub trait Extractor: Send + Sync {
    /// Whether this extractor handles `media_type`.
    fn supports(&self, media_type: &MediaType) -> bool;
    /// Version label of the parser behind this extractor.
    fn parser_version(&self) -> ParserVersion;
    /// Extract a document from `bytes`, using `ctx` for the asset, the
    /// workspace root and the extractor configuration.
    fn extract(
        &self,
        ctx: &ExtractContext<'_>,
        bytes: &[u8],
    ) -> anyhow::Result<CanonicalDocument>;
}
|
||||
|
||||
/// Splits a [`CanonicalDocument`] into [`Chunk`]s under a [`ChunkPolicy`].
pub trait Chunker: Send + Sync {
    /// Version label of this chunker.
    fn chunker_version(&self) -> ChunkerVersion;
    /// String digest of `policy`; the hashing scheme is left to the
    /// implementation.
    fn policy_hash(&self, policy: &ChunkPolicy) -> String;
    /// Chunk `doc` according to `policy`.
    fn chunk(
        &self,
        doc: &CanonicalDocument,
        policy: &ChunkPolicy,
    ) -> anyhow::Result<Vec<Chunk>>;
}
|
||||
|
||||
/// Produces `f32` embedding vectors for text inputs.
pub trait Embedder: Send + Sync {
    /// Identifier of the embedding model.
    fn model_id(&self) -> EmbeddingModelId;
    /// Version label of the embedding model.
    fn model_version(&self) -> EmbeddingVersion;
    /// Dimensionality reported by the model.
    fn dimensions(&self) -> usize;
    /// Embed a batch of inputs. Presumably returns one vector per input, in
    /// input order, each `dimensions()` long — TODO confirm; the signature
    /// does not enforce it.
    fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>>;
}
|
||||
|
||||
/// Answers a [`SearchQuery`] with a list of [`SearchHit`]s.
pub trait Retriever: Send + Sync {
    /// Run `query` and return the hits, or an error.
    fn search(&self, query: &SearchQuery) -> anyhow::Result<Vec<SearchHit>>;
    /// Version label of the index this retriever reads.
    fn index_version(&self) -> IndexVersion;
}
|
||||
|
||||
/// Text-generation backend exposing a pull-based token stream.
pub trait LanguageModel: Send + Sync {
    /// Reference describing the underlying model.
    fn model_ref(&self) -> ModelRef;
    /// Context size in tokens, as reported by the model.
    fn context_tokens(&self) -> usize;
    /// Start generating for `req`.
    ///
    /// Returns a boxed, `Send` iterator; every item is itself a `Result`,
    /// so a failure can surface either when the stream is created or while
    /// it is being consumed.
    fn generate_stream(
        &self,
        req: GenerateRequest,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>>;
}
|
||||
|
||||
/// Persistence interface for assets, documents, blocks and chunks.
///
/// Every method takes `&self`, so implementations that mutate state need
/// interior mutability.
///
/// NOTE(review): no `Send + Sync` supertrait here, unlike the
/// extractor / chunker / embedder / retriever / language-model traits —
/// confirm that is intentional.
pub trait DocumentStore {
    fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>;
    fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>;
    fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>;
    fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>;
    /// `Ok(None)` is part of the success type, distinct from `Err`;
    /// presumably it means "no such document" — confirm.
    fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
    /// `Ok(None)` is part of the success type, distinct from `Err`;
    /// presumably it means "no such chunk" — confirm.
    fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
    /// List document summaries selected by `filter`.
    fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
}
|
||||
|
||||
/// Storage and nearest-neighbour lookup for embedding vectors.
///
/// NOTE(review): no `Send + Sync` supertrait here, unlike the
/// extractor / chunker / embedder / retriever / language-model traits —
/// confirm that is intentional.
pub trait VectorStore {
    /// Make sure storage exists for `model` at dimensionality `dim` and
    /// return its index id.
    fn ensure_table(
        &self,
        model: &EmbeddingModelId,
        dim: usize,
    ) -> anyhow::Result<crate::ids::IndexId>;
    /// Insert or update the given records.
    fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>;
    /// Search with `query_vec`, restricted by `filters`. Presumably returns
    /// at most `k` hits — TODO confirm.
    fn search(
        &self,
        query_vec: &[f32],
        k: usize,
        filters: &SearchFilters,
    ) -> anyhow::Result<Vec<VectorHit>>;
}
|
||||
|
||||
/// Repository for job records (§7.2).
///
/// NOTE(review): no `Send + Sync` supertrait here, unlike the
/// extractor / chunker / embedder / retriever / language-model traits —
/// confirm that is intentional.
pub trait JobRepo {
    /// Create a job of `kind` with an arbitrary JSON `payload` and return
    /// its id.
    fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result<JobId>;
    /// Record an arbitrary JSON progress blob for job `id`.
    fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>;
    /// Mark job `id` as finished with `status` and an optional error
    /// message. The signature accepts any `JobStatus`; whether non-terminal
    /// values are rejected is up to the implementation.
    fn finish(
        &self,
        id: &JobId,
        status: JobStatus,
        error: Option<&str>,
    ) -> anyhow::Result<()>;
    /// List the job rows selected by `filter`.
    fn list(&self, filter: &JobFilter) -> anyhow::Result<Vec<JobRow>>;
}
|
||||
27
crates/kb-core/src/vector.rs
Normal file
27
crates/kb-core/src/vector.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
//! Vector store records (§7.2 VectorStore).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::ids::{ChunkId, DocumentId, EmbeddingId};
|
||||
use crate::versions::{EmbeddingModelId, EmbeddingVersion};
|
||||
|
||||
/// One row written through `VectorStore::upsert`: the embedding vector plus
/// the chunk, document and model it belongs to.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct VectorRecord {
    pub chunk_id: ChunkId,
    pub embedding_id: EmbeddingId,
    pub vector: Vec<f32>,
    pub doc_id: DocumentId,
    pub text: String,
    pub heading_path: Vec<String>,
    pub model_id: EmbeddingModelId,
    pub model_version: EmbeddingVersion,
    // Presumably equal to `vector.len()`; this type does not enforce it —
    // verify at the call sites.
    pub dimensions: usize,
}
|
||||
|
||||
/// One result of `VectorStore::search`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct VectorHit {
    pub chunk_id: ChunkId,
    // Whether a higher or a lower score is better depends on the store's
    // metric — confirm with the implementation.
    pub score: f32,
    /// Arbitrary JSON returned by the store alongside the hit.
    pub payload: Value,
}
|
||||
27
crates/kb-core/src/versions.rs
Normal file
27
crates/kb-core/src/versions.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
//! Version / label newtypes (§3.2).
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Parser version label, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct ParserVersion(pub String);
|
||||
|
||||
/// Chunker version label, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct ChunkerVersion(pub String);
|
||||
|
||||
/// Embedding model identifier, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct EmbeddingModelId(pub String);
|
||||
|
||||
/// Embedding model version label, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct EmbeddingVersion(pub String);
|
||||
|
||||
/// Index version label, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct IndexVersion(pub String);
|
||||
|
||||
/// Prompt template version label, stored verbatim as a `String`.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct PromptTemplateVersion(pub String);
|
||||
|
||||
/// Wire schema version label (`"answer.v1"`, `"search_hit.v1"`, …).
/// Carried as a `&'static str` because every wire type pins its label at
/// compile time.
///
/// NOTE(review): deriving `Deserialize` for a `&'static str` field makes
/// serde borrow from the input, which presumably restricts deserialization
/// to input that itself lives for `'static` — confirm this impl is usable
/// (or needed) where wire payloads are parsed from short-lived buffers.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct SchemaVersion(pub &'static str);
|
||||
Reference in New Issue
Block a user