p0-1: workspace + kb-core domain types, traits, and ID recipe

Stand up the Cargo workspace (Rust 2024 / resolver=3) with the kb-core
crate per the frozen design (§3, §4, §7, §10). kb-core has zero
deps on other kb-* crates and exposes:

- Newtype IDs (AssetId / DocumentId / BlockId / ChunkId / EmbeddingId /
  IndexId) with Display + FromStr; FromStr rejects anything but 32
  lowercase hex characters.
- id_from + id_for_{asset,doc,block,chunk,embedding,index} per §4.2;
  pinned hex test values computed via an independent JCS+blake3 tool.
- CanonicalDocument, Block (8 variants), SourceSpan, Inline (§3.4).
- Citation (5 variants) with W3C Media Fragments to_uri / parse;
  round-trip property holds for every variant.
- Metadata + Provenance (§3.6); SearchQuery / SearchHit / RetrievalDetail
  (§3.7); DocFilter / DocSummary mirrors of wire §2.5.
- Answer / AnswerCitation / RefusalReason / ModelRef (§3.8).
- IngestReport, JobRepo support types, VectorRecord / VectorHit.
- Component traits (SourceConnector / Extractor / Chunker / Embedder /
  Retriever / LanguageModel / DocumentStore / VectorStore / JobRepo)
  plus their input helpers (SourceScope / ExtractContext / ChunkPolicy
  / EmbeddingInput / GenerateRequest / TokenChunk / FinishReason).
- CoreError (§10).
- nfc + to_posix helpers (§4.1, §6.6).

20 unit tests cover ID determinism (1000-run regression), key-order
invariance, two pinned hex values, newtype rejection of bad input,
Citation round-trip for all 5 variants, and to_posix collapsing +
Korean NFC.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 05:16:37 +00:00
parent d3cb06f60d
commit f86df99fe9
21 changed files with 2606 additions and 0 deletions

3
.gitignore vendored
View File

@@ -1 +1,4 @@
.superpowers/
/target/
**/*.rs.bk
Cargo.lock.bak

937
Cargo.lock generated Normal file
View File

@@ -0,0 +1,937 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
[[package]]
name = "anstyle-parse"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys 0.61.2",
]
[[package]]
name = "anyhow"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arrayref"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "blake3"
version = "1.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"constant_time_eq",
"cpufeatures",
]
[[package]]
name = "cc"
version = "1.2.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
dependencies = [
"find-msvc-tools",
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "clap"
version = "4.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
[[package]]
name = "colorchoice"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "constant_time_eq"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
[[package]]
name = "cpufeatures"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
dependencies = [
"libc",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "deranged"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
dependencies = [
"powerfmt",
"serde_core",
]
[[package]]
name = "dirs"
version = "5.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
dependencies = [
"dirs-sys",
]
[[package]]
name = "dirs-sys"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
dependencies = [
"libc",
"option-ext",
"redox_users",
"windows-sys 0.48.0",
]
[[package]]
name = "equivalent"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "getrandom"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "indexmap"
version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "kb-app"
version = "0.1.0"
dependencies = [
"anyhow",
"dirs",
"kb-config",
"kb-core",
"serde",
"serde_json",
"thiserror 2.0.18",
"toml",
"tracing",
"tracing-appender",
"tracing-subscriber",
]
[[package]]
name = "kb-cli"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"kb-app",
"kb-config",
"kb-core",
"serde_json",
]
[[package]]
name = "kb-config"
version = "0.1.0"
dependencies = [
"anyhow",
"dirs",
"kb-core",
"serde",
"serde_json",
"thiserror 2.0.18",
"toml",
]
[[package]]
name = "kb-core"
version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"serde",
"serde_json",
"serde_json_canonicalizer",
"thiserror 2.0.18",
"time",
"unicode-normalization",
]
[[package]]
name = "kb-parse-types"
version = "0.1.0"
dependencies = [
"kb-core",
"serde",
"thiserror 2.0.18",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libredox"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
dependencies = [
"libc",
]
[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "matchers"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
dependencies = [
"regex-automata",
]
[[package]]
name = "memchr"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "nu-ansi-term"
version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "once_cell"
version = "1.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
[[package]]
name = "once_cell_polyfill"
version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "option-ext"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "pin-project-lite"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "proc-macro2"
version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_users"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [
"getrandom",
"libredox",
"thiserror 1.0.69",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "ryu-js"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "serde_json_canonicalizer"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe52319a927259afbfa5180c5157cd8167edfd3e8c254f9558c7fef44c5649f2"
dependencies = [
"ryu-js",
"serde",
"serde_json",
]
[[package]]
name = "serde_spanned"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
dependencies = [
"serde",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
dependencies = [
"lazy_static",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "smallvec"
version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "symlink"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a"
[[package]]
name = "syn"
version = "2.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl 1.0.69",
]
[[package]]
name = "thiserror"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
dependencies = [
"thiserror-impl 2.0.18",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thiserror-impl"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
dependencies = [
"cfg-if",
]
[[package]]
name = "time"
version = "0.3.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde_core",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca"
[[package]]
name = "time-macros"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "toml"
version = "0.8.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
dependencies = [
"indexmap",
"serde",
"serde_spanned",
"toml_datetime",
"toml_write",
"winnow",
]
[[package]]
name = "toml_write"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
[[package]]
name = "tracing"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
dependencies = [
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-appender"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c"
dependencies = [
"crossbeam-channel",
"symlink",
"thiserror 2.0.18",
"time",
"tracing-subscriber",
]
[[package]]
name = "tracing-attributes"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
dependencies = [
"once_cell",
"valuable",
]
[[package]]
name = "tracing-log"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
dependencies = [
"log",
"once_cell",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex-automata",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
name = "unicode-ident"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "unicode-normalization"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
dependencies = [
"tinyvec",
]
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "valuable"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-targets"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "winnow"
version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
dependencies = [
"memchr",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

26
Cargo.toml Normal file
View File

@@ -0,0 +1,26 @@
[workspace]
resolver = "3"
members = [
"crates/kb-core",
"crates/kb-parse-types",
"crates/kb-config",
"crates/kb-app",
"crates/kb-cli",
]
[workspace.package]
edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kb"
version = "0.1.0"
[workspace.dependencies]
anyhow = "1"
thiserror = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
uuid = { version = "1", features = ["v7", "serde"] }
blake3 = "1"
tracing = "0.1"

18
crates/kb-core/Cargo.toml Normal file
View File

@@ -0,0 +1,18 @@
[package]
name = "kb-core"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "kb domain types, traits, and ID recipe (no other kb-* deps)"
[dependencies]
anyhow = { workspace = true }
thiserror = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
blake3 = { workspace = true }
serde_json_canonicalizer = "0.3"
unicode-normalization = "0.1"

View File

@@ -0,0 +1,66 @@
//! Answer + RAG types (§3.8).
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::citation::Citation;
use crate::search::SearchMode;
use crate::versions::PromptTemplateVersion;
/// Answer record for the RAG types of design §3.8 (per this module's header).
///
/// Bundles the answer text with its citations, refusal state, model
/// references, prompt-template version, retrieval summary, and token usage.
/// Serializable with serde; `created_at` uses the RFC 3339 representation.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Answer {
/// Answer text.
pub answer: String,
/// Citations attached to the answer; each pairs an optional marker with a `Citation`.
pub citations: Vec<AnswerCitation>,
/// NOTE(review): presumably `true` when the answer is backed by retrieved
/// sources — confirm against design §3.8.
pub grounded: bool,
/// Refusal reason, when one applies.
/// NOTE(review): presumably `None` for a normally answered request — confirm.
pub refusal_reason: Option<RefusalReason>,
/// Reference to the model associated with this answer.
/// NOTE(review): presumably the generating language model — confirm.
pub model: ModelRef,
/// Optional second model reference.
/// NOTE(review): presumably the embedding model used for retrieval — confirm.
pub embedding: Option<ModelRef>,
/// Version of the prompt template, as a `PromptTemplateVersion` from `crate::versions`.
pub prompt_template_version: PromptTemplateVersion,
/// Summary of the retrieval step associated with this answer.
pub retrieval: AnswerRetrievalSummary,
/// Token counts and latency recorded for this answer.
pub usage: TokenUsage,
/// Creation timestamp; (de)serialized via `time::serde::rfc3339`.
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
}
/// One citation entry of an `Answer`: a `Citation` plus an optional marker string.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct AnswerCitation {
/// Optional marker string.
/// NOTE(review): presumably the inline reference label used in the answer
/// text (e.g. a footnote tag) — confirm against design §3.8.
pub marker: Option<String>,
/// The structured citation itself.
pub citation: Citation,
}
/// Reason attached to an `Answer` via `Answer::refusal_reason`.
///
/// Serialized in snake_case (`score_gate`, `llm_self_judge`, `no_index`,
/// `no_chunks`) because of `rename_all = "snake_case"`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RefusalReason {
/// NOTE(review): presumably the top retrieval score fell below the
/// configured score gate — confirm.
ScoreGate,
/// NOTE(review): presumably the language model itself judged that it
/// could not answer — confirm.
LlmSelfJudge,
/// NOTE(review): presumably no index was available to search — confirm.
NoIndex,
/// NOTE(review): presumably retrieval returned no chunks — confirm.
NoChunks,
}
/// Reference to a model by identifier and provider, with an optional
/// dimension count. Used by `Answer` for its `model` and `embedding` fields.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ModelRef {
/// Model identifier string.
pub id: String,
/// Provider name string.
pub provider: String,
/// Optional dimension count.
/// NOTE(review): presumably the embedding vector size, set only for
/// embedding models — confirm.
pub dimensions: Option<usize>,
}
/// Retrieval summary embedded in an `Answer` (`Answer::retrieval`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct AnswerRetrievalSummary {
/// Trace identifier for this retrieval.
pub trace_id: TraceId,
/// Search mode, as a `SearchMode` from `crate::search`.
pub mode: SearchMode,
/// NOTE(review): presumably the requested top-k result count — confirm.
pub k: usize,
/// NOTE(review): presumably the minimum score threshold applied; see
/// `RefusalReason::ScoreGate` — confirm.
pub score_gate: f32,
/// NOTE(review): presumably the best score among returned chunks — confirm.
pub top_score: f32,
/// NOTE(review): presumably the number of chunks retrieval returned — confirm.
pub chunks_returned: u32,
/// NOTE(review): presumably the number of chunks actually passed to the
/// model — confirm.
pub chunks_used: u32,
}
/// Token counts and latency embedded in an `Answer` (`Answer::usage`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TokenUsage {
/// Number of prompt tokens.
pub prompt_tokens: u32,
/// Number of completion tokens.
pub completion_tokens: u32,
/// Latency in milliseconds (per the field name).
/// NOTE(review): which span is measured is not stated here — confirm.
pub latency_ms: u32,
}
/// Newtype over `String` identifying a trace; used by
/// `AnswerRetrievalSummary::trace_id`. The inner string is public and is not
/// validated by this type.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct TraceId(pub String);

View File

@@ -0,0 +1,42 @@
//! Raw asset, source URI, workspace path (§3.3).
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::ids::AssetId;
use crate::media::{Checksum, MediaType};
/// Where a raw asset came from.
///
/// Serialized adjacently tagged: the lowercase variant name goes under
/// `"kind"` (`file` / `kb`) and the payload under `"value"`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "value")]
pub enum SourceUri {
/// Filesystem path.
File(PathBuf),
/// `kb://` virtual reference.
Kb(String),
}
/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
/// produced via `crate::normalize::to_posix`.
///
/// Newtype over `String`; the inner value is public, so the type itself does
/// not enforce that invariant.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct WorkspacePath(pub String);
/// How a raw asset is stored (`RawAsset::stored`).
///
/// Serialized internally tagged: the lowercase variant name goes under
/// `"kind"` (`copied` / `reference`) alongside the variant's own fields.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum AssetStorage {
/// NOTE(review): presumably the asset bytes were copied and `path` is the
/// location of that copy — confirm against design §3.3.
Copied { path: PathBuf },
/// NOTE(review): presumably the asset is left in place at `path` and
/// `sha` records its checksum for later verification — confirm.
Reference { path: PathBuf, sha: Checksum },
}
/// Raw asset record (design §3.3 per this module's header): identity, origin,
/// workspace location, media type, size, checksum, discovery time, and
/// storage mode.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RawAsset {
/// Identifier of this asset.
pub asset_id: AssetId,
/// Where the asset came from (file path or `kb://` reference).
pub source_uri: SourceUri,
/// POSIX-relative path of the asset inside the workspace root.
pub workspace_path: WorkspacePath,
/// Media type of the asset, as a `MediaType` from `crate::media`.
pub media_type: MediaType,
/// Length in bytes (per the field name).
pub byte_len: u64,
/// Checksum of the asset, as a `Checksum` from `crate::media`.
pub checksum: Checksum,
/// Discovery timestamp; (de)serialized via `time::serde::rfc3339`.
#[serde(with = "time::serde::rfc3339")]
pub discovered_at: OffsetDateTime,
/// How the asset is stored (copied vs. referenced).
pub stored: AssetStorage,
}

View File

@@ -0,0 +1,19 @@
//! Chunk (§3.5).
use serde::{Deserialize, Serialize};
use crate::document::SourceSpan;
use crate::ids::{BlockId, ChunkId, DocumentId};
use crate::versions::ChunkerVersion;
/// Chunk record (design §3.5 per this module's header): a piece of text tied
/// to one document and a list of its blocks, with heading context and source
/// spans.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Chunk {
/// Identifier of this chunk.
pub chunk_id: ChunkId,
/// Identifier of the document this chunk belongs to.
pub doc_id: DocumentId,
/// Identifiers of the blocks associated with this chunk.
/// NOTE(review): presumably in document order — confirm.
pub block_ids: Vec<BlockId>,
/// Chunk text.
pub text: String,
/// NOTE(review): presumably the enclosing headings, outermost first —
/// confirm.
pub heading_path: Vec<String>,
/// Source spans associated with this chunk (`SourceSpan` from `crate::document`).
pub source_spans: Vec<SourceSpan>,
/// Estimated token count (per the field name).
/// NOTE(review): the estimator is not visible here.
pub token_estimate: usize,
/// Chunker version, as a `ChunkerVersion` from `crate::versions`.
pub chunker_version: ChunkerVersion,
}

View File

@@ -0,0 +1,316 @@
//! Citation (§3.5) — discriminated 5-variant. Each variant has a canonical
//! W3C Media Fragments URI per design §0 Q3.
use anyhow::{Result, bail};
use serde::{Deserialize, Serialize};
use crate::asset::WorkspacePath;
/// Pointer into a workspace file, in one of five shapes.
///
/// Serialized internally tagged: the lowercase variant name goes under
/// `"kind"` (`line` / `page` / `region` / `caption` / `time`) alongside the
/// variant's own fields. Every variant carries a `path`. `to_uri` renders a
/// citation as `<path>#<fragment>`; the fragment shapes are listed per variant.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum Citation {
/// Line or line range. `to_uri` emits `#L<start>` when `start == end`,
/// otherwise `#L<start>-L<end>`; `section` is not emitted.
/// NOTE(review): whether lines are 1-based and whether `end` is inclusive
/// is not stated here — confirm.
Line {
path: WorkspacePath,
start: u32,
end: u32,
section: Option<String>,
},
/// Single page. `to_uri` emits `#p=<page>`; `section` is not emitted.
Page {
path: WorkspacePath,
page: u32,
section: Option<String>,
},
/// Rectangular region. `to_uri` emits `#xywh=<x>,<y>,<w>,<h>`.
/// NOTE(review): units (pixels vs. percent) are not stated here — confirm.
Region {
path: WorkspacePath,
x: u32,
y: u32,
w: u32,
h: u32,
},
/// Caption of the file. `to_uri` emits `#caption`; `model` is not emitted.
/// NOTE(review): `model` presumably names the captioning model — confirm.
Caption {
path: WorkspacePath,
model: String,
},
/// Time range in milliseconds (per the field names). `to_uri` emits
/// `#t=<start>,<end>` with both bounds rendered by `format_hms_ms`, and
/// appends `&speaker=<speaker>` when `speaker` is set.
Time {
path: WorkspacePath,
start_ms: u64,
end_ms: u64,
speaker: Option<String>,
},
}
impl Citation {
pub fn path(&self) -> &WorkspacePath {
match self {
Citation::Line { path, .. }
| Citation::Page { path, .. }
| Citation::Region { path, .. }
| Citation::Caption { path, .. }
| Citation::Time { path, .. } => path,
}
}
/// Render this citation as `<path>#<fragment>` (W3C Media Fragments style,
/// design §0 Q3).
///
/// `section` and `caption.model` never appear in the URI; they live only in
/// the structured wire object. A `Time` citation's `speaker`, when present,
/// is appended verbatim as `&speaker=<speaker>`.
pub fn to_uri(&self) -> String {
    let fragment = match self {
        // A single line collapses to `L<n>`; a range keeps both ends.
        Citation::Line { start, end, .. } if start == end => format!("L{start}"),
        Citation::Line { start, end, .. } => format!("L{start}-L{end}"),
        Citation::Page { page, .. } => format!("p={page}"),
        Citation::Region { x, y, w, h, .. } => format!("xywh={x},{y},{w},{h}"),
        Citation::Caption { .. } => "caption".to_owned(),
        Citation::Time {
            start_ms,
            end_ms,
            speaker,
            ..
        } => {
            let range = format!(
                "t={},{}",
                format_hms_ms(*start_ms),
                format_hms_ms(*end_ms)
            );
            match speaker {
                Some(id) => format!("{range}&speaker={id}"),
                None => range,
            }
        }
    };
    format!("{}#{}", self.path().0, fragment)
}
/// Strict inverse of `to_uri`. The `section` / `caption.model` fields
/// are not part of the URI grammar, so a parsed Citation will have
/// `section = None` and `model = ""` for the relevant variants.
/// Round-trip property holds for citations whose non-URI fields are at
/// their default values (see test).
pub fn parse(s: &str) -> Result<Self> {
let (path_str, frag) = match s.rsplit_once('#') {
Some(t) => t,
None => bail!("citation has no '#' fragment: {s:?}"),
};
let path = WorkspacePath(path_str.to_owned());
if let Some(rest) = frag.strip_prefix("L") {
// line range: `L<a>` or `L<a>-L<b>`
if let Some((a, b)) = rest.split_once("-L") {
let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
return Ok(Citation::Line {
path,
start,
end,
section: None,
});
}
let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
return Ok(Citation::Line {
path,
start: n,
end: n,
section: None,
});
}
if let Some(rest) = frag.strip_prefix("p=") {
let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
return Ok(Citation::Page {
path,
page,
section: None,
});
}
if let Some(rest) = frag.strip_prefix("xywh=") {
let parts: Vec<&str> = rest.split(',').collect();
if parts.len() != 4 {
bail!("xywh= expects 4 comma-separated values: {rest:?}");
}
let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
return Ok(Citation::Region { path, x, y, w, h });
}
if frag == "caption" {
return Ok(Citation::Caption {
path,
model: String::new(),
});
}
if let Some(rest) = frag.strip_prefix("t=") {
// `t=<start>,<end>` optionally followed by `&speaker=<id>`
let (range, speaker) = match rest.split_once('&') {
Some((r, kv)) => match kv.strip_prefix("speaker=") {
Some(sp) => (r, Some(sp.to_owned())),
None => bail!("unknown time-fragment param: {kv:?}"),
},
None => (rest, None),
};
let (s_str, e_str) = match range.split_once(',') {
Some(t) => t,
None => bail!("time fragment expects '<start>,<end>': {range:?}"),
};
let start_ms = parse_hms_ms(s_str)?;
let end_ms = parse_hms_ms(e_str)?;
return Ok(Citation::Time {
path,
start_ms,
end_ms,
speaker,
});
}
bail!("unrecognised citation fragment: {frag:?}")
}
}
/// Render a millisecond count as `hh:mm:ss.mmm`.
///
/// Hours are zero-padded to at least two digits and grow wider for values
/// of 100 hours or more; minutes and seconds use two digits, milliseconds
/// three.
fn format_hms_ms(ms: u64) -> String {
    // Peel units off from the smallest upwards.
    let (total_secs, millis) = (ms / 1000, ms % 1000);
    let (total_mins, seconds) = (total_secs / 60, total_secs % 60);
    let (hours, minutes) = (total_mins / 60, total_mins % 60);
    format!("{hours:02}:{minutes:02}:{seconds:02}.{millis:03}")
}
fn parse_hms_ms(s: &str) -> Result<u64> {
// Accept `hh:mm:ss.mmm` (the form we emit). Reject malformed input.
let parts: Vec<&str> = s.split(':').collect();
if parts.len() != 3 {
bail!("time component expects hh:mm:ss.mmm, got {s:?}");
}
let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?;
let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?;
let (sec, ms) = match parts[2].split_once('.') {
Some((s_part, ms_part)) => {
let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
// Pad/truncate to exactly 3 digits.
let mut ms_str = ms_part.to_owned();
while ms_str.len() < 3 {
ms_str.push('0');
}
ms_str.truncate(3);
let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?;
(sec, ms)
}
None => {
let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
(sec, 0)
}
};
Ok(h * 3_600_000 + m * 60_000 + sec * 1000 + ms)
}
// Unit tests for `Citation`: each variant's exact URI string and its
// `to_uri` → `parse` round trip, plus two rejection cases.
#[cfg(test)]
mod tests {
use super::*;
// Shorthand for building a `WorkspacePath` from a string literal.
fn p(s: &str) -> WorkspacePath {
WorkspacePath(s.to_owned())
}
// A line range with distinct ends renders as `#L<start>-L<end>`.
#[test]
fn line_range_uri_and_roundtrip() {
let c = Citation::Line {
path: p("notes/rust/kb.md"),
start: 12,
end: 34,
section: None,
};
assert_eq!(c.to_uri(), "notes/rust/kb.md#L12-L34");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
// `start == end` collapses to the single-line form `#L<n>`.
#[test]
fn line_single_uri_and_roundtrip() {
let c = Citation::Line {
path: p("a/b.md"),
start: 7,
end: 7,
section: None,
};
assert_eq!(c.to_uri(), "a/b.md#L7");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
#[test]
fn page_uri_and_roundtrip() {
let c = Citation::Page {
path: p("papers/book.pdf"),
page: 23,
section: None,
};
assert_eq!(c.to_uri(), "papers/book.pdf#p=23");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
#[test]
fn region_uri_and_roundtrip() {
let c = Citation::Region {
path: p("photos/x.png"),
x: 120,
y: 40,
w: 520,
h: 180,
};
assert_eq!(c.to_uri(), "photos/x.png#xywh=120,40,520,180");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
#[test]
fn caption_uri_and_roundtrip() {
let c = Citation::Caption {
path: p("photos/x.png"),
// `model` is not in the URI grammar; round-trip fills it with "".
model: String::new(),
};
assert_eq!(c.to_uri(), "photos/x.png#caption");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
// The speaker is carried in the URI as a trailing `&speaker=<id>`.
#[test]
fn time_uri_and_roundtrip_with_speaker() {
let c = Citation::Time {
path: p("recordings/r.m4a"),
start_ms: 822_000,
end_ms: 850_000,
speaker: Some("S1".to_string()),
};
assert_eq!(
c.to_uri(),
"recordings/r.m4a#t=00:13:42.000,00:14:10.000&speaker=S1"
);
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
#[test]
fn time_uri_and_roundtrip_without_speaker() {
let c = Citation::Time {
path: p("recordings/r.m4a"),
start_ms: 1_500,
end_ms: 2_750,
speaker: None,
};
assert_eq!(c.to_uri(), "recordings/r.m4a#t=00:00:01.500,00:00:02.750");
let parsed = Citation::parse(&c.to_uri()).unwrap();
assert_eq!(parsed, c);
}
// Input without any `#` is rejected.
#[test]
fn parse_rejects_no_fragment() {
assert!(Citation::parse("just/path.md").is_err());
}
// A fragment that matches none of the five known forms is rejected.
#[test]
fn parse_rejects_unknown_fragment() {
assert!(Citation::parse("a.md#mystery=1").is_err());
}
}

View File

@@ -0,0 +1,177 @@
//! CanonicalDocument, Block, SourceSpan, Inline, plus the forward-declared
//! OCR / caption / transcript stubs (§3.4 + §3.7a).
use serde::{Deserialize, Serialize};
use crate::asset::WorkspacePath;
use crate::ids::{AssetId, BlockId, DocumentId};
use crate::media::Lang;
use crate::metadata::{Metadata, Provenance};
use crate::versions::ParserVersion;
/// A parsed document: identifiers, workspace path, title, language, its
/// list of blocks, metadata, provenance, and version fields.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CanonicalDocument {
pub doc_id: DocumentId,
/// ID of the asset this document refers to as its source.
pub source_asset_id: AssetId,
pub workspace_path: WorkspacePath,
pub title: String,
pub lang: Lang,
pub blocks: Vec<Block>,
pub metadata: Metadata,
pub provenance: Provenance,
pub parser_version: ParserVersion,
// NOTE(review): the difference between `schema_version` and `doc_version`
// is not visible in this file — see design §3.4.
pub schema_version: u32,
pub doc_version: u32,
}
/// One block of a `CanonicalDocument`, in eight variants.
///
/// Serialized internally tagged: a `"kind"` field holding the lower-cased
/// variant name (so `ImageRef` → `"imageref"`, `AudioRef` → `"audioref"`)
/// sits next to the payload struct's own fields. `Paragraph` and `Quote`
/// share the `TextBlock` payload and differ only in the tag.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum Block {
Heading(HeadingBlock),
Paragraph(TextBlock),
List(ListBlock),
Code(CodeBlock),
Table(TableBlock),
Quote(TextBlock),
ImageRef(ImageRefBlock),
AudioRef(AudioRefBlock),
}
/// Fields embedded as `common` in every block payload struct: the block's
/// ID, its heading path, and its source span.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CommonBlock {
pub block_id: BlockId,
pub heading_path: Vec<String>,
pub source_span: SourceSpan,
}
/// Payload of `Block::Heading`: a level and the heading text.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct HeadingBlock {
pub common: CommonBlock,
/// Heading level; the valid range is not enforced here.
pub level: u8,
pub text: String,
}
/// Text payload used by `Block::Paragraph`, `Block::Quote`, and as the item
/// type of `ListBlock`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TextBlock {
pub common: CommonBlock,
pub text: String,
/// Inline items for this block; how they relate to `text` is not defined
/// in this file.
pub inlines: Vec<Inline>,
}
/// Payload of `Block::List`: an ordered/unordered flag and the list items,
/// each a full `TextBlock`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ListBlock {
pub common: CommonBlock,
pub ordered: bool,
pub items: Vec<TextBlock>,
}
/// Payload of `Block::Code`: the code text and an optional language label.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CodeBlock {
pub common: CommonBlock,
/// Optional language label (a free-form `String`, not the `Lang` type).
pub lang: Option<String>,
pub code: String,
}
/// Payload of `Block::Table`: header cells and rows of cell strings.
/// Nothing here enforces that each row has as many cells as `headers`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TableBlock {
pub common: CommonBlock,
pub headers: Vec<String>,
pub rows: Vec<Vec<String>>,
}
/// Payload of `Block::ImageRef`: the image source string and alt text, plus
/// optional asset ID, OCR text, and model caption.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ImageRefBlock {
pub common: CommonBlock,
/// Optional here, unlike `AudioRefBlock::asset_id`, which is required.
pub asset_id: Option<AssetId>,
pub src: String,
pub alt: String,
pub ocr: Option<OcrText>,
pub caption: Option<ModelCaption>,
}
/// Payload of `Block::AudioRef`: the audio asset's ID, a duration in
/// milliseconds, and an optional transcript.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct AudioRefBlock {
pub common: CommonBlock,
pub asset_id: AssetId,
pub duration_ms: u64,
pub transcript: Option<Transcript>,
}
/// Inline content inside a `TextBlock`.
///
/// Serialized *adjacently* tagged: `{"kind": "<variant>", "value": <payload>}`
/// with the variant name lower-cased. Internal tagging (`tag` alone) cannot
/// be used for this enum: serde rejects newtype variants whose payload is a
/// string or a sequence at runtime, which would make `Text`, `Code`,
/// `Strong` and `Emph` impossible to serialize or deserialize.
///
/// NOTE(review): this moves `Link`'s fields under `"value"`; confirm the
/// wire shape against design §3.4.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "value")]
pub enum Inline {
    /// Plain text run.
    Text(String),
    /// Inline code span.
    Code(String),
    /// Hyperlink with display text and target.
    Link { text: String, href: String },
    /// Strongly emphasised run wrapping nested inlines.
    Strong(Vec<Inline>),
    /// Emphasised run wrapping nested inlines.
    Emph(Vec<Inline>),
}
/// Location of a block within its source, in one of five coordinate
/// systems.
///
/// Serialized internally tagged with a lower-cased `"kind"` field. This
/// type is also serialized into the tuple hashed by `id_for_block`, so any
/// change to its serialized shape changes block IDs.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum SourceSpan {
/// Line range. Whether the bounds are 0- or 1-based and inclusive is not
/// defined in this file.
Line {
start: u32,
end: u32,
},
/// Byte offset range.
Byte {
start: u64,
end: u64,
},
/// A page, optionally narrowed to a character range.
Page {
page: u32,
char_start: Option<u32>,
char_end: Option<u32>,
},
/// Rectangle given as origin (`x`, `y`) and size (`w`, `h`).
Region {
x: u32,
y: u32,
w: u32,
h: u32,
},
/// Time range in milliseconds.
Time {
start_ms: u64,
end_ms: u64,
},
}
// ── Forward-declared stubs (§3.7a). Bodies are final per design. ────────
/// OCR output attached to an `ImageRefBlock`: a joined text string, the
/// individual regions, and the engine name and version that produced them.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct OcrText {
/// Presumably the concatenation of all region texts — confirm with the
/// OCR producer.
pub joined: String,
pub regions: Vec<OcrRegion>,
pub engine: String,
pub engine_version: String,
}
/// One OCR region: a bounding box, its text, and a confidence value.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct OcrRegion {
/// Four-integer bounding box. NOTE(review): component order (x, y, w, h
/// vs. corner pairs) is not stated here — confirm against design §3.7a.
pub bbox: (u32, u32, u32, u32),
pub text: String,
/// Confidence score; scale and range are not defined in this file.
pub confidence: f32,
}
/// A caption attached to an `ImageRefBlock`, with the name and version of
/// the model recorded alongside the text.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ModelCaption {
pub text: String,
pub model: String,
pub model_version: String,
}
/// A transcript attached to an `AudioRefBlock`: its segments, the engine
/// name and version, and a language tag.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Transcript {
pub segments: Vec<TranscriptSegment>,
pub engine: String,
pub engine_version: String,
pub language: Lang,
}
/// One transcript segment: a millisecond time range, its text, and optional
/// speaker label and confidence.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TranscriptSegment {
pub start_ms: u64,
pub end_ms: u64,
pub text: String,
pub speaker: Option<String>,
/// Optional confidence score; scale is not defined in this file.
pub confidence: Option<f32>,
}

View File

@@ -0,0 +1,15 @@
//! `CoreError` (§10).
use thiserror::Error;
/// Error type for kb-core. Every variant carries a free-form detail string
/// that is appended to a fixed prefix in the `Display` output.
#[derive(Debug, Error)]
pub enum CoreError {
/// Returned by the ID newtypes' `FromStr` when the input is not exactly
/// 32 lowercase hex characters.
#[error("invalid id: {0}")]
InvalidId(String),
/// Citation error. Not constructed anywhere in the code visible here
/// (`Citation::parse` returns `anyhow::Result`).
#[error("invalid citation: {0}")]
InvalidCitation(String),
/// Source-span error. Not constructed anywhere in the code visible here.
#[error("invalid source span: {0}")]
InvalidSpan(String),
/// Generic malformed-input error. Not constructed anywhere in the code
/// visible here.
#[error("malformed input: {0}")]
Malformed(String),
}

303
crates/kb-core/src/ids.rs Normal file
View File

@@ -0,0 +1,303 @@
//! Newtype IDs (§3.1) + ID generation recipe (§4.2).
//!
//! Every ID is `blake3(canonical_json(tuple))[..32]`. `Display` returns the
//! inner hex string; `FromStr` rejects strings that are not exactly 32
//! lowercase hex characters.
use std::fmt;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
use crate::asset::WorkspacePath;
use crate::document::SourceSpan;
use crate::errors::CoreError;
use crate::versions::{
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
ParserVersion,
};
/// Declare a public ID newtype wrapping a `String`.
///
/// The generated type prints its inner string via `Display` and parses via
/// `FromStr`, which accepts only strings that pass `validate_hex32`. Note
/// that the inner field is public and the derived `Deserialize` does not go
/// through `validate_hex32`, so only `FromStr` guarantees a well-formed ID.
macro_rules! newtype_id {
    ($name:ident) => {
        #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
        pub struct $name(pub String);

        impl fmt::Display for $name {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                let Self(hex) = self;
                f.write_str(hex)
            }
        }

        impl FromStr for $name {
            type Err = CoreError;

            fn from_str(s: &str) -> Result<Self, Self::Err> {
                validate_hex32(s)?;
                Ok(Self(s.to_owned()))
            }
        }
    };
}

// The six ID newtypes of §3.1.
newtype_id!(AssetId);
newtype_id!(DocumentId);
newtype_id!(BlockId);
newtype_id!(ChunkId);
newtype_id!(EmbeddingId);
newtype_id!(IndexId);
/// Check that `s` is exactly 32 characters of lowercase hex (`0-9`, `a-f`).
///
/// # Errors
/// Returns `CoreError::InvalidId` when the byte length is not 32, or when
/// any byte falls outside the lowercase hex alphabet. The length check runs
/// first, so a wrong-length input always reports its length.
fn validate_hex32(s: &str) -> Result<(), CoreError> {
    let len = s.len();
    if len != 32 {
        return Err(CoreError::InvalidId(format!(
            "expected 32 hex chars, got {}",
            len
        )));
    }

    let is_lower_hex = |b: u8| b.is_ascii_digit() || (b'a'..=b'f').contains(&b);
    if s.bytes().all(is_lower_hex) {
        Ok(())
    } else {
        Err(CoreError::InvalidId(format!(
            "non-lowercase-hex character in {s:?}"
        )))
    }
}
/// Hash any serializable value into a 32-character hex ID (design §4.2):
/// canonical JSON bytes → blake3 → first 32 hex characters of the digest.
///
/// # Panics
/// Panics if canonical JSON serialization of `tuple` fails.
pub fn id_from<T: Serialize>(tuple: T) -> String {
    let canonical = serde_json_canonicalizer::to_vec(&tuple)
        .expect("canonical JSON serialization must not fail for kb-core inputs");
    let digest = blake3::hash(&canonical).to_hex();
    // The full digest is 64 hex characters; keep the leading half.
    digest.as_str()[..32].to_owned()
}
/// Serialized shape hashed by `id_for_asset`. Field names are the JSON keys
/// that feed the hash, so renaming a field changes every asset ID.
#[derive(Serialize)]
struct AssetTuple<'a> {
kind: &'static str,
asset_blake3: &'a str,
}
/// Serialized shape hashed by `id_for_doc`. Field names are the JSON keys
/// that feed the hash, so renaming a field changes every document ID.
#[derive(Serialize)]
struct DocTuple<'a> {
kind: &'static str,
workspace_path: &'a str,
asset_id: &'a str,
parser_version: &'a str,
}
/// Serialized shape hashed by `id_for_block`. Field names are the JSON keys
/// that feed the hash; `source_span` is serialized through `SourceSpan`'s
/// own serde representation.
#[derive(Serialize)]
struct BlockTuple<'a> {
kind: &'static str,
doc_id: &'a str,
block_kind: &'a str,
heading_path: &'a [String],
ordinal: u32,
source_span: &'a SourceSpan,
}
/// Serialized shape hashed by `id_for_chunk`. Field names are the JSON keys
/// that feed the hash; `block_ids` is a sequence, so its order matters.
#[derive(Serialize)]
struct ChunkTuple<'a> {
kind: &'static str,
doc_id: &'a str,
chunker_version: &'a str,
block_ids: Vec<&'a str>,
policy_hash: &'a str,
}
/// Serialized shape hashed by `id_for_embedding`. Field names are the JSON
/// keys that feed the hash.
#[derive(Serialize)]
struct EmbeddingTuple<'a> {
kind: &'static str,
chunk_id: &'a str,
model_id: &'a str,
model_version: &'a str,
dimensions: usize,
}
/// Serialized shape hashed by `id_for_index`. Field names are the JSON keys
/// that feed the hash. `kind` is the fixed discriminator `"index"`, while
/// `index_kind` carries the caller-supplied kind string.
#[derive(Serialize)]
struct IndexTuple<'a> {
kind: &'static str,
collection: &'a str,
embedding_model: &'a str,
dimensions: usize,
index_version: &'a str,
index_kind: &'a str,
index_params_hash: &'a str,
}
/// Derive an `AssetId` from an asset's content hash string.
///
/// The input is hashed as given (together with the `"asset"` discriminator);
/// it is not validated as hex here.
pub fn id_for_asset(asset_blake3_full_hex: &str) -> AssetId {
    let tuple = AssetTuple {
        asset_blake3: asset_blake3_full_hex,
        kind: "asset",
    };
    AssetId(id_from(tuple))
}
/// Derive a `DocumentId` from the workspace path, the source asset's ID,
/// and the parser version. Changing any of the three yields a different ID.
pub fn id_for_doc(
    workspace_path: &WorkspacePath,
    asset: &AssetId,
    parser_version: &ParserVersion,
) -> DocumentId {
    let tuple = DocTuple {
        asset_id: asset.0.as_str(),
        parser_version: parser_version.0.as_str(),
        workspace_path: workspace_path.0.as_str(),
        kind: "doc",
    };
    DocumentId(id_from(tuple))
}
/// Derive a `BlockId` from the owning document, the block kind string, the
/// heading path, an ordinal, and the block's source span.
pub fn id_for_block(
    doc: &DocumentId,
    block_kind: &str,
    heading_path: &[String],
    ordinal: u32,
    span: &SourceSpan,
) -> BlockId {
    let tuple = BlockTuple {
        doc_id: doc.0.as_str(),
        source_span: span,
        ordinal,
        heading_path,
        block_kind,
        kind: "block",
    };
    BlockId(id_from(tuple))
}
/// Derive a `ChunkId` from the owning document, the chunker version, the
/// block IDs in the order given, and a policy hash string.
pub fn id_for_chunk(
    doc: &DocumentId,
    chunker_version: &ChunkerVersion,
    block_ids: &[BlockId],
    policy_hash: &str,
) -> ChunkId {
    // Borrow the inner hex strings; slice order is preserved in the hash.
    let ids: Vec<&str> = block_ids.iter().map(|id| id.0.as_str()).collect();
    let tuple = ChunkTuple {
        doc_id: doc.0.as_str(),
        chunker_version: chunker_version.0.as_str(),
        block_ids: ids,
        policy_hash,
        kind: "chunk",
    };
    ChunkId(id_from(tuple))
}
/// Derive an `EmbeddingId` from the chunk, the embedding model ID and
/// version, and the vector dimensionality.
pub fn id_for_embedding(
    chunk: &ChunkId,
    model: &EmbeddingModelId,
    version: &EmbeddingVersion,
    dims: usize,
) -> EmbeddingId {
    let tuple = EmbeddingTuple {
        chunk_id: chunk.0.as_str(),
        model_id: model.0.as_str(),
        model_version: version.0.as_str(),
        dimensions: dims,
        kind: "embedding",
    };
    EmbeddingId(id_from(tuple))
}
/// Derive an `IndexId` from the collection name, embedding model,
/// dimensionality, index version, index kind string, and params hash.
///
/// The `kind` argument is hashed under the `index_kind` key; the tuple's own
/// `kind` key is always the fixed discriminator `"index"`.
pub fn id_for_index(
    collection: &str,
    model: &EmbeddingModelId,
    dims: usize,
    version: &IndexVersion,
    kind: &str,
    params_hash: &str,
) -> IndexId {
    let tuple = IndexTuple {
        collection,
        embedding_model: model.0.as_str(),
        dimensions: dims,
        index_version: version.0.as_str(),
        index_kind: kind,
        index_params_hash: params_hash,
        kind: "index",
    };
    IndexId(id_from(tuple))
}
// Unit tests for the ID newtypes (`FromStr` validation) and the hashing
// recipe (determinism, key-order invariance, two pinned digests).
#[cfg(test)]
mod tests {
use super::*;
// A valid 32-char lowercase hex string parses and prints back unchanged.
#[test]
fn newtype_display_roundtrip() {
let s = "0123456789abcdef0123456789abcdef";
let id: AssetId = s.parse().unwrap();
assert_eq!(id.to_string(), s);
}
#[test]
fn newtype_rejects_short() {
let r: Result<AssetId, _> = "abc".parse();
assert!(r.is_err());
}
// Correct length (32) but contains characters outside the hex alphabet.
#[test]
fn newtype_rejects_non_hex() {
let r: Result<AssetId, _> = "ZZZ456789abcdef0123456789abcdef0".parse();
assert!(r.is_err());
}
// Uppercase hex digits are rejected: only lowercase is accepted.
#[test]
fn newtype_rejects_uppercase() {
let r: Result<AssetId, _> = "0123456789ABCDEF0123456789ABCDEF".parse();
assert!(r.is_err());
}
/// Determinism: 1000 runs of `id_from` over the same input yield the same
/// hex.
#[test]
fn id_from_deterministic_1000() {
#[derive(Serialize)]
struct T<'a> {
a: u32,
b: &'a str,
}
let input = T { a: 7, b: "hello" };
let first = id_from(&input);
for _ in 0..1000 {
assert_eq!(id_from(&input), first);
}
assert_eq!(first.len(), 32);
}
/// Key order in the source struct does not affect hash (canonical JSON
/// sorts keys alphabetically).
#[test]
fn id_from_key_order_invariant() {
#[derive(Serialize)]
struct A {
a: u32,
b: u32,
}
#[derive(Serialize)]
struct B {
b: u32,
a: u32,
}
assert_eq!(id_from(A { a: 1, b: 2 }), id_from(B { b: 2, a: 1 }));
}
/// The expected hex below is hand-computed via design §4.2:
/// tuple = { "kind": "asset", "asset_blake3": "deadbeef" }
/// canonical JSON (key-sorted, no whitespace, both keys are pure ASCII):
/// {"asset_blake3":"deadbeef","kind":"asset"}
/// blake3 of those bytes → hex → first 32 chars.
/// Pinned via an independent tool (b3sum, computed once outside the code
/// under test) so a regression in our JCS or hash pipeline is caught.
#[test]
fn id_for_asset_pinned() {
// printf '{"asset_blake3":"deadbeef","kind":"asset"}' | b3sum
// → cec9353553efb238a7919d38d3e148f1...
let id = id_for_asset("deadbeef");
assert_eq!(id.0, "cec9353553efb238a7919d38d3e148f1");
}
/// Independent pin for id_for_doc.
/// canonical JSON:
/// {"asset_id":"6cb0ef0eb89c63b8b6e76ec53dca6e7d",
/// "kind":"doc",
/// "parser_version":"pulldown-cmark-0.x",
/// "workspace_path":"notes/test.md"}
/// (concatenated, no whitespace).
#[test]
fn id_for_doc_pinned() {
// The ID is built directly through the public tuple field, bypassing
// `FromStr`; the value happens to be valid 32-char lowercase hex.
let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
let path = WorkspacePath("notes/test.md".to_string());
let pv = ParserVersion("pulldown-cmark-0.x".to_string());
let id = id_for_doc(&path, &asset, &pv);
assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");
}
}

View File

@@ -0,0 +1,45 @@
//! IngestReport + IngestItem (mirrored from wire §2.4).
use serde::{Deserialize, Serialize};
use crate::asset::WorkspacePath;
use crate::ids::{AssetId, DocumentId};
use crate::traits::SourceScope;
use crate::versions::{ChunkerVersion, ParserVersion};
/// Summary of one ingest run: the scope it covered, per-outcome counters,
/// the elapsed time, and optionally the per-item detail list.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestReport {
pub scope: SourceScope,
pub scanned: u32,
pub new: u32,
pub updated: u32,
pub skipped: u32,
pub errors: u32,
/// Elapsed time in milliseconds. Stored as `u32`, so it tops out at
/// roughly 49.7 days.
pub duration_ms: u32,
/// `None` ↔ wire `items: null` (`--summary-only`).
pub items: Option<Vec<IngestItem>>,
}
/// Per-file entry of an `IngestReport`. Only `kind`, `doc_path` and
/// `warnings` are always present; every other field is optional.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IngestItem {
pub kind: IngestItemKind,
pub doc_id: Option<DocumentId>,
pub doc_path: WorkspacePath,
pub asset_id: Option<AssetId>,
pub byte_len: Option<u64>,
pub block_count: Option<u32>,
pub chunk_count: Option<u32>,
pub parser_version: Option<ParserVersion>,
pub chunker_version: Option<ChunkerVersion>,
pub warnings: Vec<String>,
/// Error text; presumably set when `kind` is `Error` — confirm with the
/// ingest pipeline.
pub error: Option<String>,
}
/// Outcome category of an `IngestItem`; serialized as the lower-cased
/// variant name. The variants mirror the `new` / `updated` / `skipped` /
/// `errors` counters on `IngestReport`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum IngestItemKind {
New,
Updated,
Skipped,
Error,
}

View File

@@ -0,0 +1,52 @@
//! Job repo support types (§3.7a forward-decl, §7.2 JobRepo).
use serde::{Deserialize, Serialize};
use serde_json::Value;
use time::OffsetDateTime;
/// The kind of work a job performs; serialized as the lower-cased variant
/// name.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum JobKind {
Ingest,
Chunk,
Embed,
Ocr,
Transcribe,
Reindex,
Doctor,
}
/// Lifecycle state of a job; serialized as the lower-cased variant name.
/// Allowed transitions between states are not defined in this file.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum JobStatus {
Pending,
Running,
Succeeded,
Failed,
Canceled,
}
/// Job identifier wrapping a `String`. Unlike the IDs in `ids.rs`, no format
/// validation is attached to this type here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct JobId(pub String);
/// Filter passed to `JobRepo::list`. The derived `Default` leaves both
/// fields `None`; presumably `None` means "do not filter on this field" —
/// confirm with the `JobRepo` implementation.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct JobFilter {
pub status: Option<JobStatus>,
pub kind: Option<JobKind>,
}
/// One job record as returned by `JobRepo::list`.
///
/// Timestamps are (de)serialized through `time::serde::rfc3339`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct JobRow {
pub job_id: JobId,
pub kind: JobKind,
pub status: JobStatus,
/// Arbitrary JSON payload; its schema is not defined in this file.
pub payload: Value,
/// Arbitrary JSON progress value, if any has been recorded.
pub progress: Option<Value>,
pub error: Option<String>,
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
#[serde(with = "time::serde::rfc3339")]
pub updated_at: OffsetDateTime,
/// `default` lets a missing field deserialize as `None`.
#[serde(default, with = "time::serde::rfc3339::option")]
pub finished_at: Option<OffsetDateTime>,
}

70
crates/kb-core/src/lib.rs Normal file
View File

@@ -0,0 +1,70 @@
//! `kb-core` — frozen domain types, traits, and ID recipe.
//!
//! Per design §3, §4, §7. This crate has zero dependencies on any other
//! `kb-*` crate, so every other crate in the workspace can depend on it
//! freely.
//!
//! See `docs/superpowers/specs/2026-04-27-kb-final-form-design.md` for
//! the canonical type bodies — this crate is the byte-for-byte mirror.
pub mod ids;
pub mod versions;
pub mod media;
pub mod asset;
pub mod document;
pub mod chunk;
pub mod citation;
pub mod metadata;
pub mod search;
pub mod answer;
pub mod ingest;
pub mod jobs;
pub mod vector;
pub mod errors;
pub mod traits;
pub mod normalize;
// Re-export the most commonly used items at the crate root, mirroring the
// public surface listed in the task spec.
pub use ids::{
AssetId, BlockId, ChunkId, DocumentId, EmbeddingId, IndexId,
id_for_asset, id_for_block, id_for_chunk, id_for_doc, id_for_embedding,
id_for_index, id_from,
};
pub use versions::{
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion,
ParserVersion, PromptTemplateVersion, SchemaVersion,
};
pub use media::{AudioType, Checksum, ImageType, Lang, MediaType};
pub use asset::{AssetStorage, RawAsset, SourceUri, WorkspacePath};
pub use document::{
AudioRefBlock, Block, CanonicalDocument, CodeBlock, CommonBlock,
HeadingBlock, ImageRefBlock, Inline, ListBlock, ModelCaption, OcrRegion,
OcrText, SourceSpan, TableBlock, TextBlock, Transcript, TranscriptSegment,
};
pub use chunk::Chunk;
pub use citation::Citation;
pub use metadata::{
Metadata, Provenance, ProvenanceEvent, ProvenanceKind, SourceType,
TrustLevel,
};
pub use search::{
DocFilter, DocSummary, RetrievalDetail, SearchFilters, SearchHit,
SearchMode, SearchQuery,
};
pub use answer::{
Answer, AnswerCitation, AnswerRetrievalSummary, ModelRef, RefusalReason,
TokenUsage, TraceId,
};
// `IngestItemKind` is the type of the public field `IngestItem::kind`, so it
// is re-exported alongside the other ingest types.
pub use ingest::{IngestItem, IngestItemKind, IngestReport};
pub use jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
pub use vector::{VectorHit, VectorRecord};
pub use errors::CoreError;
pub use traits::{
ChunkPolicy, Chunker, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, ExtractConfig, ExtractContext, Extractor, FinishReason,
GenerateRequest, JobRepo, LanguageModel, Retriever, SourceConnector,
SourceScope, TokenChunk, VectorStore,
};
pub use normalize::{nfc, to_posix};

View File

@@ -0,0 +1,44 @@
//! Media / file-type primitives (§3.3 + §3.7a).
use serde::{Deserialize, Serialize};
/// Full blake3 hex (64 chars) per §3.7a. Stored as `String` for serde
/// simplicity. The length and alphabet are not validated by this type.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct Checksum(pub String);
/// BCP-47 / ISO-639 language tag (e.g. "ko", "en"). §3.7a.
/// The tag is stored as given; this type performs no validation.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct Lang(pub String);
/// Image format. Variant names are serialized lower-cased; `Other` carries
/// a free-form string for formats not listed here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ImageType {
Png,
Jpeg,
Webp,
Gif,
Tiff,
Other(String),
}
/// Audio format. Variant names are serialized lower-cased; `Other` carries
/// a free-form string for formats not listed here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AudioType {
M4a,
Mp3,
Wav,
Flac,
Ogg,
Other(String),
}
/// Top-level media type of an asset. Variant names are serialized
/// lower-cased; `Image` and `Audio` nest the specific format, and `Other`
/// carries a free-form string.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum MediaType {
Markdown,
Pdf,
Image(ImageType),
Audio(AudioType),
Other(String),
}

View File

@@ -0,0 +1,68 @@
//! Metadata + Provenance (§3.6).
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use time::OffsetDateTime;
/// Document metadata: aliases, tags, timestamps, source type, trust level,
/// an optional user alias, and a map of extra frontmatter keys.
///
/// Timestamps are (de)serialized through `time::serde::rfc3339`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Metadata {
pub aliases: Vec<String>,
pub tags: Vec<String>,
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
#[serde(with = "time::serde::rfc3339")]
pub updated_at: OffsetDateTime,
pub source_type: SourceType,
pub trust_level: TrustLevel,
pub user_id_alias: Option<String>,
/// Frontmatter keys we don't recognise are preserved here per §0 Q9.
pub user: Map<String, Value>,
}
/// Category of a document's source; serialized as the lower-cased variant
/// name.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SourceType {
Markdown,
Note,
Paper,
Reference,
Inbox,
}
/// Trust level of a document; serialized as the lower-cased variant name.
///
/// NOTE(review): `Ord` is not derived, yet `SearchFilters::trust_min` and
/// `DocFilter::trust_min` imply an ordering — the comparison rule must live
/// with whoever applies those filters.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum TrustLevel {
Primary,
Secondary,
Generated,
}
/// Provenance of a document, held as a list of events.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Provenance {
pub events: Vec<ProvenanceEvent>,
}
/// One provenance entry: when it happened, which agent recorded it, what
/// kind of step it was, and an optional free-form note.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ProvenanceEvent {
/// Event timestamp, (de)serialized as RFC 3339.
#[serde(with = "time::serde::rfc3339")]
pub at: OffsetDateTime,
pub agent: String,
pub kind: ProvenanceKind,
pub note: Option<String>,
}
/// Kind of step recorded by a `ProvenanceEvent`. Serialized in snake_case,
/// so e.g. `OcrApplied` becomes `"ocr_applied"`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProvenanceKind {
Discovered,
Parsed,
Normalized,
Chunked,
OcrApplied,
CaptionApplied,
Transcribed,
Embedded,
Indexed,
Warning,
Error,
}

View File

@@ -0,0 +1,86 @@
//! Path / string normalization helpers (§4.1, §6.6).
use std::path::{Component, Path};
use unicode_normalization::UnicodeNormalization;
use crate::asset::WorkspacePath;
/// Return `input` in Unicode Normalization Form C (§4.1) as a new `String`.
pub fn nfc(input: &str) -> String {
    let composed = input.nfc();
    composed.collect::<String>()
}
/// Collapse a path to a POSIX-style `WorkspacePath` per §6.6:
/// - join components with `/` (components are split using the host
///   platform's separator rules, as implemented by `Path::components`)
/// - drop `.` components, including a leading `./`
/// - collapse repeated slashes
/// - NFC-normalize the result
///
/// `..` components are kept as-is, and an empty path becomes `"."`. A
/// leading root is emitted as exactly one `/`, so `/a/b` stays `/a/b`
/// rather than turning into `//a/b`.
pub fn to_posix(path: &Path) -> WorkspacePath {
    // Append one segment, inserting a separator only when the output is
    // non-empty and does not already end in one (e.g. right after the root).
    fn push_segment(out: &mut String, segment: &str) {
        if !out.is_empty() && !out.ends_with('/') {
            out.push('/');
        }
        out.push_str(segment);
    }

    let mut out = String::new();
    for comp in path.components() {
        match comp {
            Component::CurDir => {}
            Component::Normal(name) => push_segment(&mut out, &name.to_string_lossy()),
            Component::ParentDir => push_segment(&mut out, ".."),
            Component::RootDir => {
                // Only a root at the very start produces a slash; after a
                // Windows prefix the next segment supplies the separator.
                if out.is_empty() {
                    out.push('/');
                }
            }
            Component::Prefix(_) => {
                // Windows drive prefixes — `to_string_lossy` keeps form.
                out.push_str(&comp.as_os_str().to_string_lossy());
            }
        }
    }
    if out.is_empty() {
        out.push('.');
    }
    WorkspacePath(nfc(&out))
}
// Unit tests for `to_posix` (path collapsing, NFC output) and `nfc`.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn collapses_curdir_and_redundant_slashes() {
let p = Path::new("./a//b.md");
// `Path::components` already collapses `//` on POSIX; the test
// doc-fixed example asserts the final string is `a/b.md`.
assert_eq!(to_posix(p).0, "a/b.md");
}
#[test]
fn nfc_normalizes_korean() {
// U+1100 ㄱ + U+1161 ㅏ (NFD) vs U+AC00 가 (NFC). After NFC they
// collapse to the same string; `to_posix` runs NFC after path
// collapse, so the WorkspacePath comes out NFC regardless of input.
let nfd = "\u{1100}\u{1161}.md";
let nfc_str = "\u{AC00}.md";
assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0);
assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md");
}
// Input that is already NFC comes back unchanged.
#[test]
fn nfc_function_idempotent() {
let s = "\u{AC00}";
assert_eq!(nfc(s), s);
}
}

View File

@@ -0,0 +1,90 @@
//! Search query / filters / hit (§3.7) + DocFilter / DocSummary (§2.5).
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::asset::WorkspacePath;
use crate::citation::Citation;
use crate::ids::{ChunkId, DocumentId};
use crate::media::Lang;
use crate::metadata::{SourceType, TrustLevel};
use crate::versions::{ChunkerVersion, EmbeddingModelId, IndexVersion, ParserVersion};
/// Retrieval mode; serialized as the lower-cased variant name. Used both as
/// the requested mode (`SearchQuery::mode`) and as the reported method
/// (`RetrievalDetail::method`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SearchMode {
Lexical,
Vector,
Hybrid,
}
/// A search request: query text, mode, result count `k`, and filters.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchQuery {
pub text: String,
pub mode: SearchMode,
/// Requested number of results (presumably an upper bound — confirm with
/// the retriever).
pub k: usize,
pub filters: SearchFilters,
}
/// Optional restrictions on a search. The derived `Default` is an empty tag
/// list with every other field `None`. How each field is matched is decided
/// by the consumer; it is not defined in this file.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SearchFilters {
pub tags_any: Vec<String>,
pub lang: Option<Lang>,
pub path_glob: Option<String>,
pub trust_min: Option<TrustLevel>,
}
/// One search result: rank, chunk and document identity, display fields
/// (heading path, section label, snippet), a citation, scoring detail, and
/// the versions involved.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SearchHit {
/// Position in the result list; whether it starts at 0 or 1 is not
/// defined in this file.
pub rank: u32,
pub chunk_id: ChunkId,
pub doc_id: DocumentId,
pub doc_path: WorkspacePath,
pub heading_path: Vec<String>,
pub section_label: Option<String>,
pub snippet: String,
pub citation: Citation,
pub retrieval: RetrievalDetail,
pub index_version: IndexVersion,
/// Optional; presumably absent for purely lexical hits — confirm with the
/// retriever.
pub embedding_model: Option<EmbeddingModelId>,
pub chunker_version: ChunkerVersion,
}
/// Scoring breakdown for a `SearchHit`: the method used, a fusion score,
/// and optional per-method scores and ranks. Score scales and the fusion
/// formula are not defined in this file.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RetrievalDetail {
pub method: SearchMode,
pub fusion_score: f32,
pub lexical_score: Option<f32>,
pub vector_score: Option<f32>,
pub lexical_rank: Option<u32>,
pub vector_rank: Option<u32>,
}
/// Filter for `kb-app::list_docs` (§7.2 DocumentStore::list_documents).
/// Has the same four fields as `SearchFilters` but is a distinct type.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct DocFilter {
pub tags_any: Vec<String>,
pub lang: Option<Lang>,
pub path_glob: Option<String>,
pub trust_min: Option<TrustLevel>,
}
/// Internal mirror of wire `doc_summary.v1` (§2.5).
///
/// Returned by `DocumentStore::list_documents`. Timestamps are
/// (de)serialized through `time::serde::rfc3339`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct DocSummary {
pub doc_id: DocumentId,
pub doc_path: WorkspacePath,
pub title: String,
pub lang: Lang,
pub tags: Vec<String>,
pub trust_level: TrustLevel,
pub source_type: SourceType,
pub byte_len: u64,
pub chunk_count: u32,
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
#[serde(with = "time::serde::rfc3339")]
pub updated_at: OffsetDateTime,
pub parser_version: ParserVersion,
pub chunker_version: ChunkerVersion,
}

View File

@@ -0,0 +1,175 @@
//! Component traits (§7) and their input helper types (§7.1).
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::asset::RawAsset;
use crate::chunk::Chunk;
use crate::document::{Block, CanonicalDocument};
use crate::ids::{ChunkId, DocumentId};
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
use crate::media::MediaType;
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
use crate::vector::{VectorHit, VectorRecord};
use crate::versions::{
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
};
use crate::answer::{ModelRef, TokenUsage};
// ── Helper input types (§7.1) ─────────────────────────────────────────────
/// Input to `SourceConnector::scan`: a root directory plus include and
/// exclude pattern lists. The pattern syntax is not defined in this file.
/// The derived `Default` is an empty root path with empty lists.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct SourceScope {
pub root: PathBuf,
pub include: Vec<String>,
pub exclude: Vec<String>,
}
/// Forward-declared (§3.7a) — concrete shape decided by extractors. P0
/// keeps the option-of-config-file slot only.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct ExtractConfig {
/// Optional path to a configuration file; `None` by default.
pub config_path: Option<PathBuf>,
}
/// Carries the raw asset bytes context to an `Extractor::extract` call.
///
/// All three fields are borrowed for `'a`; the asset bytes themselves are
/// passed to `extract` as a separate argument.
pub struct ExtractContext<'a> {
pub asset: &'a RawAsset,
pub workspace_root: &'a Path,
pub config: &'a ExtractConfig,
}
/// Chunking parameters passed to `Chunker::chunk` and hashed by
/// `Chunker::policy_hash`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ChunkPolicy {
pub target_tokens: usize,
pub overlap_tokens: usize,
pub respect_markdown_headings: bool,
pub chunker_version: ChunkerVersion,
}
/// Whether a text being embedded is a document or a query; serialized as
/// the lower-cased variant name.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingKind {
Document,
Query,
}
/// One item passed to `Embedder::embed`: borrowed text plus its kind.
/// No traits are derived on this type.
pub struct EmbeddingInput<'a> {
pub text: &'a str,
pub kind: EmbeddingKind,
}
/// Request passed to `LanguageModel::generate_stream`: system and user
/// text, stop strings, and sampling limits.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct GenerateRequest {
pub system: String,
pub user: String,
pub stop: Vec<String>,
pub max_tokens: usize,
pub temperature: f32,
/// Optional seed; how (or whether) a model honours it is up to the
/// implementation.
pub seed: Option<u64>,
}
/// One item yielded by `LanguageModel::generate_stream`.
///
/// Serialized *adjacently* tagged: `{"kind": "<variant>", "value": <payload>}`
/// with the variant name in snake_case. Internal tagging (`tag` alone)
/// cannot be used here: serde rejects a newtype variant whose payload is a
/// string at runtime, so `Token` could not be serialized or deserialized.
///
/// NOTE(review): this moves `Done`'s fields under `"value"`; confirm the
/// wire shape against design §7.1.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "kind", content = "value")]
pub enum TokenChunk {
    /// A piece of generated text.
    Token(String),
    /// Terminal item carrying the finish reason and token usage.
    Done {
        finish_reason: FinishReason,
        usage: TokenUsage,
    },
}
/// Why generation ended, as reported in `TokenChunk::Done`. Serialized in
/// snake_case with serde's default external tagging; `Error` carries a
/// message string.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
Stop,
Length,
Aborted,
Error(String),
}
// ── Traits (§7.2) ─────────────────────────────────────────────────────────
/// Discovers raw assets within a `SourceScope`.
pub trait SourceConnector {
/// Scan `scope` and return the assets found, or an error.
fn scan(&self, scope: &SourceScope) -> anyhow::Result<Vec<RawAsset>>;
}
/// Turns the bytes of a raw asset into a `CanonicalDocument`.
/// Implementations must be `Send + Sync`.
pub trait Extractor: Send + Sync {
/// Whether this extractor handles the given media type.
fn supports(&self, media_type: &MediaType) -> bool;
/// The parser version this extractor reports.
fn parser_version(&self) -> ParserVersion;
/// Extract a document from `bytes`, using `ctx` for the asset record,
/// workspace root, and configuration.
fn extract(
&self,
ctx: &ExtractContext<'_>,
bytes: &[u8],
) -> anyhow::Result<CanonicalDocument>;
}
/// Splits a `CanonicalDocument` into `Chunk`s under a `ChunkPolicy`.
/// Implementations must be `Send + Sync`.
pub trait Chunker: Send + Sync {
/// The chunker version this implementation reports.
fn chunker_version(&self) -> ChunkerVersion;
/// A hash string for `policy`; `id_for_chunk` takes a `policy_hash`
/// argument of this shape.
fn policy_hash(&self, policy: &ChunkPolicy) -> String;
/// Produce the chunks of `doc` under `policy`.
fn chunk(
&self,
doc: &CanonicalDocument,
policy: &ChunkPolicy,
) -> anyhow::Result<Vec<Chunk>>;
}
/// Maps texts to embedding vectors.
///
/// Implementors must be `Send + Sync`.
pub trait Embedder: Send + Sync {
    /// Identifier of the embedding model.
    fn model_id(&self) -> EmbeddingModelId;

    /// Version label of the embedding model.
    fn model_version(&self) -> EmbeddingVersion;

    /// Dimensionality reported for the produced vectors.
    fn dimensions(&self) -> usize;

    /// Embeds a batch of inputs, returning one `Vec<f32>` per result.
    // NOTE(review): presumably one vector per input, in input order, each of length `dimensions()` — confirm.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for a failed batch.
    fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>>;
}
/// Answers search queries with ranked hits.
///
/// Implementors must be `Send + Sync`.
pub trait Retriever: Send + Sync {
    /// Runs `query` and returns the matching hits.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for a failed search.
    fn search(&self, query: &SearchQuery) -> anyhow::Result<Vec<SearchHit>>;

    /// Version label of the index this retriever searches.
    fn index_version(&self) -> IndexVersion;
}
/// A text-generation backend that streams its output.
///
/// Implementors must be `Send + Sync`.
pub trait LanguageModel: Send + Sync {
    /// Reference describing the underlying model.
    fn model_ref(&self) -> ModelRef;

    /// Context size of the model, in tokens.
    fn context_tokens(&self) -> usize;

    /// Starts a generation for `req` (taken by value) and returns a boxed,
    /// `Send` iterator over the stream.
    ///
    /// Each item is itself a `Result`, so a failure can surface either from
    /// this call or from any later item of the iterator.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports when the stream
    /// cannot be started.
    fn generate_stream(
        &self,
        req: GenerateRequest,
    ) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>>;
}
/// Persistence for assets, documents, blocks and chunks.
///
/// All methods take `&self`, including the writers. The trait does not
/// require `Send + Sync`.
pub trait DocumentStore {
    /// Stores a raw asset.
    fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>;

    /// Stores a canonical document.
    fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>;

    /// Stores `blocks` under the document `doc`.
    fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>;

    /// Stores `chunks` under the document `doc`.
    fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>;

    /// Looks up a document by id.
    // NOTE(review): `Ok(None)` presumably means "no such document" — confirm with implementations.
    fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;

    /// Looks up a chunk by id.
    // NOTE(review): `Ok(None)` presumably means "no such chunk" — confirm with implementations.
    fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;

    /// Lists summaries of the documents selected by `filter`.
    fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
}
/// Storage and nearest-neighbour search over embedding vectors.
///
/// All methods take `&self`. The trait does not require `Send + Sync`.
pub trait VectorStore {
    /// Makes sure a table exists for `model` with vectors of length `dim`
    /// and returns the id of the corresponding index.
    // NOTE(review): behaviour when the table exists with a different `dim` is not visible here — confirm.
    fn ensure_table(
        &self,
        model: &EmbeddingModelId,
        dim: usize,
    ) -> anyhow::Result<crate::ids::IndexId>;

    /// Inserts or updates the given records.
    fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>;

    /// Searches for up to `k` hits for `query_vec`, restricted by `filters`.
    // NOTE(review): "up to `k`" is inferred from the parameter name — confirm with implementations.
    fn search(
        &self,
        query_vec: &[f32],
        k: usize,
        filters: &SearchFilters,
    ) -> anyhow::Result<Vec<VectorHit>>;
}
/// Bookkeeping for background jobs.
///
/// All methods take `&self`. The trait does not require `Send + Sync`.
pub trait JobRepo {
    /// Records a new job of the given `kind` with a JSON `payload` and
    /// returns its id.
    fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result<JobId>;

    /// Replaces or records the progress value of job `id`.
    // NOTE(review): whether progress is merged or overwritten is implementation-defined — confirm.
    fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>;

    /// Marks job `id` as finished with `status`, optionally attaching an
    /// error message.
    fn finish(
        &self,
        id: &JobId,
        status: JobStatus,
        error: Option<&str>,
    ) -> anyhow::Result<()>;

    /// Lists the job rows selected by `filter`.
    fn list(&self, filter: &JobFilter) -> anyhow::Result<Vec<JobRow>>;
}

View File

@@ -0,0 +1,27 @@
//! Vector store records (§7.2 VectorStore).
use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::ids::{ChunkId, DocumentId, EmbeddingId};
use crate::versions::{EmbeddingModelId, EmbeddingVersion};
/// One row written to a vector store through `VectorStore::upsert`.
///
/// Only `PartialEq` is derived (no `Eq`) because `vector` holds `f32`s.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct VectorRecord {
    /// Id of the chunk the vector was computed from.
    pub chunk_id: ChunkId,
    /// Id of this embedding.
    pub embedding_id: EmbeddingId,
    /// The embedding vector itself.
    pub vector: Vec<f32>,
    /// Id of the document the chunk belongs to.
    pub doc_id: DocumentId,
    /// Text stored alongside the vector (presumably the chunk text — confirm).
    pub text: String,
    /// Heading path stored alongside the vector (presumably outermost heading first — confirm).
    pub heading_path: Vec<String>,
    /// Embedding model that produced `vector`.
    pub model_id: EmbeddingModelId,
    /// Version label of that embedding model.
    pub model_version: EmbeddingVersion,
    /// Declared dimensionality; this type does not check it against `vector.len()`.
    pub dimensions: usize,
}
/// One result returned by `VectorStore::search`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct VectorHit {
    /// Id of the matching chunk.
    pub chunk_id: ChunkId,
    /// Score assigned by the store.
    // NOTE(review): scale and direction (similarity vs. distance) are backend-defined — confirm.
    pub score: f32,
    /// Arbitrary JSON payload returned with the hit.
    pub payload: Value,
}

View File

@@ -0,0 +1,27 @@
//! Version / label newtypes (§3.2).
use serde::{Deserialize, Serialize};
/// Version label of a parser (returned by `Extractor::parser_version`).
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct ParserVersion(pub String);
/// Version label of a chunker (returned by `Chunker::chunker_version` and
/// carried in `ChunkPolicy`).
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct ChunkerVersion(pub String);
/// Identifier of an embedding model (returned by `Embedder::model_id` and
/// stored in `VectorRecord`).
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct EmbeddingModelId(pub String);
/// Version label of an embedding model (returned by
/// `Embedder::model_version` and stored in `VectorRecord`).
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct EmbeddingVersion(pub String);
/// Version label of a search index (returned by `Retriever::index_version`).
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct IndexVersion(pub String);
/// Version label of a prompt template.
///
/// Plain `String` newtype; the inner value is public and not validated here.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct PromptTemplateVersion(pub String);
/// Wire schema version label (`"answer.v1"`, `"search_hit.v1"`, …).
/// Carried as a `&'static str` because every wire type pins its label at
/// compile time.
///
/// The `&'static str` payload is what makes this type `Copy`.
// NOTE(review): with a `&'static str` field the derived `Deserialize` can
// presumably only borrow from `'static` input, so this type would not be
// `DeserializeOwned` and structs embedding it may need `#[serde(borrow)]` or
// a skip/default — confirm whether deserializing the label is actually needed.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct SchemaVersion(pub &'static str);