p1-2: scaffold kb-parse-md crate

Add the workspace member with the dep allow-list pinned by design §0 Q9
and the task spec. P1-2 will land the frontmatter submodule in the next
commit; P1-3 will add the block parser as a sibling.

Notable choice: serde_yaml (dtolnay) was archived as unmaintained in 2024
so we use serde_yaml_ng, the maintained fork. lingua's per-language
features are explicitly enabled (default-features=false) to keep build
time + binary size sane — only the languages we need at parse time.
This commit is contained in:
2026-04-30 12:55:20 +00:00
parent 69a0dbb79d
commit a86b463fc4
4 changed files with 455 additions and 1 deletions

414
Cargo.lock generated
View File

@@ -79,6 +79,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bitflags"
version = "2.11.1"
@@ -109,6 +115,12 @@ dependencies = [
"serde",
]
[[package]]
name = "bumpalo"
version = "3.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
[[package]]
name = "cc"
version = "1.2.61"
@@ -177,6 +189,15 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
[[package]]
name = "counter"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66e8e052be91f1c8aae2c1d81307d9f6e67f5f37001e3ddee419e971e73f03bc"
dependencies = [
"num-traits",
]
[[package]]
name = "cpufeatures"
version = "0.3.0"
@@ -220,6 +241,20 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "dashmap"
version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
dependencies = [
"cfg-if",
"crossbeam-utils",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core",
]
[[package]]
name = "deranged"
version = "0.5.8"
@@ -251,6 +286,12 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -272,6 +313,9 @@ name = "fastrand"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
dependencies = [
"getrandom 0.3.4",
]
[[package]]
name = "find-msvc-tools"
@@ -285,6 +329,36 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "fst"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
[[package]]
name = "futures-core"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
[[package]]
name = "futures-task"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
[[package]]
name = "futures-util"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
dependencies = [
"futures-core",
"futures-task",
"pin-project-lite",
"slab",
]
[[package]]
name = "getrandom"
version = "0.2.17"
@@ -296,6 +370,20 @@ dependencies = [
"wasi",
]
[[package]]
name = "getrandom"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi 5.3.0",
"wasip2",
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.4.2"
@@ -304,7 +392,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"r-efi 6.0.0",
"wasip2",
"wasip3",
]
@@ -322,6 +410,12 @@ dependencies = [
"regex-syntax",
]
[[package]]
name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
[[package]]
name = "hashbrown"
version = "0.15.5"
@@ -365,6 +459,25 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "include_dir"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd"
dependencies = [
"include_dir_macros",
]
[[package]]
name = "include_dir_macros"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75"
dependencies = [
"proc-macro2",
"quote",
]
[[package]]
name = "indexmap"
version = "2.14.0"
@@ -383,12 +496,33 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "js-sys"
version = "0.3.97"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf"
dependencies = [
"cfg-if",
"futures-util",
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "kb-app"
version = "0.1.0"
@@ -443,6 +577,21 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "kb-parse-md"
version = "0.1.0"
dependencies = [
"anyhow",
"kb-core",
"kb-parse-types",
"lingua",
"serde",
"serde_json",
"serde_yaml_ng",
"time",
"toml",
]
[[package]]
name = "kb-parse-types"
version = "0.1.0"
@@ -495,18 +644,95 @@ dependencies = [
"libc",
]
[[package]]
name = "lingua"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f40d9129bb9fe42c95d1bd420d6891607eaff17df16ee15674aed2d05b0ec8f4"
dependencies = [
"counter",
"dashmap",
"fastrand",
"fst",
"include_dir",
"itertools",
"lingua-chinese-language-model",
"lingua-english-language-model",
"lingua-japanese-language-model",
"lingua-korean-language-model",
"maplit",
"rayon",
"regex",
"serde",
"serde-wasm-bindgen",
"strum",
"strum_macros",
"wasm-bindgen",
]
[[package]]
name = "lingua-chinese-language-model"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21ca7fa9f7671d684c82c168725f380fc873f14d6f4e8c82f0da681bcc0048d1"
dependencies = [
"include_dir",
]
[[package]]
name = "lingua-english-language-model"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97102de08b134a49f1cce05a1b6f5bf08ef21fe858074ae2b794e7892c43dd4b"
dependencies = [
"include_dir",
]
[[package]]
name = "lingua-japanese-language-model"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df0938f75de3ae5dcdc925d823ed409854ca14f6a653782b9a1ad5d899462fbe"
dependencies = [
"include_dir",
]
[[package]]
name = "lingua-korean-language-model"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa87f6c43ff894fc75159c021480d2fdf96882bf5bd235f8916ceb6b7caae561"
dependencies = [
"include_dir",
]
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "lock_api"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
dependencies = [
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "matchers"
version = "0.2.0"
@@ -537,6 +763,15 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.4"
@@ -555,6 +790,19 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "parking_lot_core"
version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-link",
]
[[package]]
name = "pin-project-lite"
version = "0.2.17"
@@ -595,12 +843,47 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "r-efi"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
[[package]]
name = "rayon"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "redox_syscall"
version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
"bitflags",
]
[[package]]
name = "redox_users"
version = "0.4.6"
@@ -612,6 +895,18 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "regex"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
@@ -642,6 +937,18 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "ryu"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
[[package]]
name = "ryu-js"
version = "1.0.2"
@@ -657,6 +964,12 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "semver"
version = "1.0.28"
@@ -673,6 +986,17 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-wasm-bindgen"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b"
dependencies = [
"js-sys",
"serde",
"wasm-bindgen",
]
[[package]]
name = "serde_core"
version = "1.0.228"
@@ -726,6 +1050,19 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_yaml_ng"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
dependencies = [
"indexmap",
"itoa",
"ryu",
"serde",
"unsafe-libyaml",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@@ -741,6 +1078,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "slab"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
[[package]]
name = "smallvec"
version = "1.15.1"
@@ -753,6 +1096,24 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
[[package]]
name = "strum_macros"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "symlink"
version = "0.1.0"
@@ -1027,6 +1388,12 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
[[package]]
name = "utf8parse"
version = "0.2.2"
@@ -1073,6 +1440,51 @@ dependencies = [
"wit-bindgen 0.51.0",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41"
dependencies = [
"bumpalo",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea"
dependencies = [
"unicode-ident",
]
[[package]]
name = "wasm-encoder"
version = "0.244.0"

View File

@@ -5,6 +5,7 @@ members = [
"crates/kb-parse-types",
"crates/kb-config",
"crates/kb-source-fs",
"crates/kb-parse-md",
"crates/kb-app",
"crates/kb-cli",
]

View File

@@ -0,0 +1,34 @@
[package]
name = "kb-parse-md"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
[dependencies]
kb-core = { path = "../kb-core" }
kb-parse-types = { path = "../kb-parse-types" }
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
# API surface lets us swap if a different fork wins long term.
serde_yaml_ng = "0.10"
toml = "0.8"
# `lingua` ships every supported language as a feature flag; the `default`
# feature pulls all 75+ language models (huge build time + binary size).
# For p1-2 we only need a small subset for autodetect + tests. Add more
# languages here as future tasks call for them.
lingua = { version = "1.8", default-features = false, features = [
"korean",
"english",
"japanese",
"chinese",
] }
[dev-dependencies]
serde_json = { workspace = true }

View File

@@ -0,0 +1,7 @@
//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
//!
//! P1-2 will implement the **frontmatter** submodule. P1-3 will add a
//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
//!
//! This commit only establishes the crate scaffold so subsequent
//! commits can land the parser in a reviewable shape.