feat(p1-2): kb-parse-md frontmatter (YAML/TOML → Metadata) #7
414
Cargo.lock
generated
414
Cargo.lock
generated
@@ -79,6 +79,12 @@ version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.1"
|
||||
@@ -109,6 +115,12 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.61"
|
||||
@@ -177,6 +189,15 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
|
||||
|
||||
[[package]]
|
||||
name = "counter"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "66e8e052be91f1c8aae2c1d81307d9f6e67f5f37001e3ddee419e971e73f03bc"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.3.0"
|
||||
@@ -220,6 +241,20 @@ version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "dashmap"
|
||||
version = "6.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.5",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
@@ -251,6 +286,12 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
@@ -272,6 +313,9 @@ name = "fastrand"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||
dependencies = [
|
||||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
@@ -285,6 +329,36 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "fst"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"pin-project-lite",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
@@ -296,6 +370,20 @@ dependencies = [
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi 5.3.0",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.4.2"
|
||||
@@ -304,7 +392,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"r-efi 6.0.0",
|
||||
"wasip2",
|
||||
"wasip3",
|
||||
]
|
||||
@@ -322,6 +410,12 @@ dependencies = [
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
@@ -365,6 +459,25 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include_dir"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd"
|
||||
dependencies = [
|
||||
"include_dir_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "include_dir_macros"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.14.0"
|
||||
@@ -383,12 +496,33 @@ version = "1.70.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.97"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"futures-util",
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-app"
|
||||
version = "0.1.0"
|
||||
@@ -443,6 +577,21 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-md"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kb-core",
|
||||
"kb-parse-types",
|
||||
"lingua",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml_ng",
|
||||
"time",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-types"
|
||||
version = "0.1.0"
|
||||
@@ -495,18 +644,95 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lingua"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f40d9129bb9fe42c95d1bd420d6891607eaff17df16ee15674aed2d05b0ec8f4"
|
||||
dependencies = [
|
||||
"counter",
|
||||
"dashmap",
|
||||
"fastrand",
|
||||
"fst",
|
||||
"include_dir",
|
||||
"itertools",
|
||||
"lingua-chinese-language-model",
|
||||
"lingua-english-language-model",
|
||||
"lingua-japanese-language-model",
|
||||
"lingua-korean-language-model",
|
||||
"maplit",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde-wasm-bindgen",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lingua-chinese-language-model"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21ca7fa9f7671d684c82c168725f380fc873f14d6f4e8c82f0da681bcc0048d1"
|
||||
dependencies = [
|
||||
"include_dir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lingua-english-language-model"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97102de08b134a49f1cce05a1b6f5bf08ef21fe858074ae2b794e7892c43dd4b"
|
||||
dependencies = [
|
||||
"include_dir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lingua-japanese-language-model"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df0938f75de3ae5dcdc925d823ed409854ca14f6a653782b9a1ad5d899462fbe"
|
||||
dependencies = [
|
||||
"include_dir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lingua-korean-language-model"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa87f6c43ff894fc75159c021480d2fdf96882bf5bd235f8916ceb6b7caae561"
|
||||
dependencies = [
|
||||
"include_dir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
|
||||
dependencies = [
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "maplit"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
@@ -537,6 +763,15 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.4"
|
||||
@@ -555,6 +790,19 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.9.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.17"
|
||||
@@ -595,12 +843,47 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.6"
|
||||
@@ -612,6 +895,18 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.14"
|
||||
@@ -642,6 +937,18 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
|
||||
|
||||
[[package]]
|
||||
name = "ryu-js"
|
||||
version = "1.0.2"
|
||||
@@ -657,6 +964,12 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.28"
|
||||
@@ -673,6 +986,17 @@ dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde-wasm-bindgen"
|
||||
version = "0.6.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
@@ -726,6 +1050,19 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_yaml_ng"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
"unsafe-libyaml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
@@ -741,6 +1078,12 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.15.1"
|
||||
@@ -753,6 +1096,24 @@ version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "symlink"
|
||||
version = "0.1.0"
|
||||
@@ -1027,6 +1388,12 @@ version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "unsafe-libyaml"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
@@ -1073,6 +1440,51 @@ dependencies = [
|
||||
"wit-bindgen 0.51.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.120"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"rustversion",
|
||||
"wasm-bindgen-macro",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.120"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.120"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.120"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-encoder"
|
||||
version = "0.244.0"
|
||||
|
||||
@@ -5,6 +5,7 @@ members = [
|
||||
"crates/kb-parse-types",
|
||||
"crates/kb-config",
|
||||
"crates/kb-source-fs",
|
||||
"crates/kb-parse-md",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
34
crates/kb-parse-md/Cargo.toml
Normal file
34
crates/kb-parse-md/Cargo.toml
Normal file
@@ -0,0 +1,34 @@
|
||||
[package]
|
||||
name = "kb-parse-md"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-parse-types = { path = "../kb-parse-types" }
|
||||
anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
|
||||
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
|
||||
# API surface lets us swap if a different fork wins long term.
|
||||
serde_yaml_ng = "0.10"
|
||||
toml = "0.8"
|
||||
# `lingua` ships every supported language as a feature flag; the `default`
|
||||
# feature pulls all 75+ language models (huge build time + binary size).
|
||||
# For p1-2 we only need a small subset for autodetect + tests. Add more
|
||||
# languages here as future tasks call for them.
|
||||
lingua = { version = "1.8", default-features = false, features = [
|
||||
"korean",
|
||||
"english",
|
||||
"japanese",
|
||||
"chinese",
|
||||
] }
|
||||
|
||||
[dev-dependencies]
|
||||
serde_json = { workspace = true }
|
||||
981
crates/kb-parse-md/src/frontmatter.rs
Normal file
981
crates/kb-parse-md/src/frontmatter.rs
Normal file
@@ -0,0 +1,981 @@
|
||||
//! Markdown frontmatter parsing → `kb_core::Metadata`.
|
||||
//!
|
||||
//! Implements the contract pinned in design §0 Q9 (frontmatter derive table)
|
||||
//! and §3.6 (Metadata shape). Produces structured warnings via
|
||||
//! `kb-parse-types`.
|
||||
//!
|
||||
//! # YAML library
|
||||
//!
|
||||
//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. The
|
||||
//! two viable maintained forks at the time of this writing are `serde_yaml_ng`
|
||||
//! and `serde_yml`. We picked [`serde_yaml_ng`] because it advertises stricter
|
||||
//! adherence to the original `serde_yaml` semantics (notably around `null`
|
||||
//! handling and tagged enums) while `serde_yml` has taken some liberties
|
||||
//! around YAML 1.1 vs 1.2 booleans. Both are actively released; either would
|
||||
//! work and the swap is a one-line dep change should the ecosystem
|
||||
//! consolidate (incl. a future move to `yaml-rust2` directly).
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use kb_core::{Metadata, SourceType, TrustLevel};
|
||||
use kb_parse_types::{Warning, WarningKind};
|
||||
use lingua::{IsoCode639_1, Language, LanguageDetector, LanguageDetectorBuilder};
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Map, Value};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
/// Caller-supplied fallback values used when frontmatter is missing or partial.
|
||||
///
|
||||
/// `BodyHints` is parser-input only — it is not part of `kb-core` and never
|
||||
/// crosses the storage boundary. The §0 Q9 derive table consults these
|
||||
/// fallbacks in a fixed order, see [`parse_frontmatter`].
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BodyHints {
|
||||
/// First H1 of the body, if any. Used as `title` fallback when the
|
||||
/// frontmatter does not specify one.
|
||||
pub first_h1: Option<String>,
|
||||
/// Filesystem creation time. Used as `created_at` fallback.
|
||||
pub fs_ctime: OffsetDateTime,
|
||||
/// Filesystem modification time. Used as `updated_at` fallback.
|
||||
pub fs_mtime: OffsetDateTime,
|
||||
/// Optional language fallback used when neither frontmatter nor lingua
|
||||
/// detection produce a value. If `None` the final fallback is `"und"`.
|
||||
pub fallback_lang: Option<String>,
|
||||
}
|
||||
|
||||
/// Location of the whole frontmatter region, in bytes, within the input.
///
/// The span is returned as part of the [`parse_frontmatter`] tuple so that
/// later stages (P1-3 / P1-4) can obtain the body with `bytes[span.end..]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FrontmatterSpan {
    /// Offset of the opening delimiter (`---` or `+++`).
    pub start: usize,
    /// Offset one past the newline that terminates the closing delimiter
    /// line, i.e. the first byte of the body.
    pub end: usize,
}
|
||||
|
||||
/// Parse the frontmatter (if any) from a Markdown byte slice into a
|
||||
/// `kb_core::Metadata`, applying the §0 Q9 derive table for missing fields.
|
||||
///
|
||||
/// On a malformed frontmatter the function still returns `Ok` — the
|
||||
/// frontmatter contents are discarded and the caller is told via a
|
||||
/// `Warning { kind: MalformedFrontmatter, .. }`. The returned span still
|
||||
/// covers the delimited region so the caller can skip it during body
|
||||
/// slicing.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// `Err` is reserved for genuinely fatal conditions (e.g. non-UTF-8 input
|
||||
/// that can't even be lossy-decoded). The current implementation has no
|
||||
/// such path — every recoverable problem (missing/garbled frontmatter,
|
||||
/// malformed timestamps, unknown enum values) is downgraded to a warning
|
||||
/// and the function returns `Ok`. The `Result` is kept on the signature so
|
||||
/// future hard-fail conditions (e.g. an I/O-backed input) can be added
|
||||
/// without breaking callers.
|
||||
pub fn parse_frontmatter(
|
||||
bytes: &[u8],
|
||||
hints: &BodyHints,
|
||||
) -> anyhow::Result<(Metadata, Option<FrontmatterSpan>, Vec<Warning>)> {
|
||||
let mut warnings = Vec::new();
|
||||
|
||||
let detected = detect_delimiters(bytes);
|
||||
|
||||
let (raw_opt, span_opt) = match detected {
|
||||
None => (None, None),
|
||||
Some((delim, span, inner)) => {
|
||||
let inner_bytes = &bytes[inner.clone()];
|
||||
match std::str::from_utf8(inner_bytes) {
|
||||
Ok(s) => match parse_raw(delim, s) {
|
||||
Ok(raw) => (Some(raw), Some(span)),
|
||||
Err(e) => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: e,
|
||||
});
|
||||
(None, Some(span))
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("frontmatter not valid utf-8: {e}"),
|
||||
});
|
||||
(None, Some(span))
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let body_start = span_opt.map(|s| s.end).unwrap_or(0);
|
||||
let body = &bytes[body_start..];
|
||||
|
||||
let metadata = derive_metadata(raw_opt, hints, body, &mut warnings);
|
||||
|
||||
Ok((metadata, span_opt, warnings))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Delimiter detection
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Which of the two supported frontmatter fences opened the block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum DelimKind {
    /// Block fenced by `---` lines.
    Yaml,
    /// Block fenced by `+++` lines.
    Toml,
}

impl DelimKind {
    /// The three-byte fence that opens and closes a block of this kind.
    fn marker(self) -> &'static [u8] {
        const YAML_FENCE: &[u8] = b"---";
        const TOML_FENCE: &[u8] = b"+++";

        match self {
            Self::Yaml => YAML_FENCE,
            Self::Toml => TOML_FENCE,
        }
    }
}
|
||||
|
||||
/// The UTF-8 byte-order mark (`EF BB BF`). It is skipped only when it is
/// the very first thing in the input; it is never removed anywhere else.
const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
|
||||
|
||||
/// Look for a YAML or TOML frontmatter block at the very start of `bytes`.
|
||||
///
|
||||
/// Returns `(kind, span, inner_range)` where:
|
||||
/// * `span.start` is the offset of the leading delimiter line (after BOM if
|
||||
/// any — i.e. `0` on BOM-less input, `3` on BOM-prefixed input). `span.end`
|
||||
/// points just past the closing delimiter line's trailing newline (or EOF).
|
||||
/// This is the "outer" range callers use for body slicing.
|
||||
/// * `inner_range` is the byte range of the YAML/TOML payload between the
|
||||
/// delimiter lines, not including either delimiter line nor their EOLs.
|
||||
/// This is what gets fed to the YAML/TOML parser.
|
||||
///
|
||||
/// All offsets are relative to the ORIGINAL `bytes` slice — callers that
|
||||
/// hold the original input can use both the span and the inner range
|
||||
/// directly without further bookkeeping.
|
||||
///
|
||||
/// A leading UTF-8 BOM (`EF BB BF`, exactly at byte 0) is tolerated and
|
||||
/// skipped; the returned `span.start` accounts for it. Subsequent
|
||||
/// BOM-shaped sequences are NOT stripped.
|
||||
///
|
||||
/// Trailing horizontal whitespace (ASCII spaces / tabs) is permitted on
|
||||
/// both the opening and closing delimiter lines: `--- \n` and `---\t\n`
|
||||
/// both count as a delimiter. This keeps editors that automatically trim
|
||||
/// trailing whitespace from silently breaking otherwise-valid frontmatter,
|
||||
/// and matches Hugo / Jekyll behaviour.
|
||||
///
|
||||
/// Anything else that isn't a delimiter at the very start (leading
|
||||
/// whitespace, indentation, prose) is treated as "no frontmatter" per
|
||||
/// design §0 Q9.
|
||||
pub(crate) fn detect_delimiters(
|
||||
bytes: &[u8],
|
||||
) -> Option<(DelimKind, FrontmatterSpan, Range<usize>)> {
|
||||
// Skip a leading UTF-8 BOM, but only at byte 0. The returned offsets
|
||||
// remain relative to the original `bytes`, so we record `bom_offset`
|
||||
// and add it to every position we compute below.
|
||||
let bom_offset = if bytes.starts_with(UTF8_BOM) {
|
||||
UTF8_BOM.len()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let scan = &bytes[bom_offset..];
|
||||
|
||||
let kind = match scan.first()? {
|
||||
b'-' if scan.starts_with(b"---") => DelimKind::Yaml,
|
||||
b'+' if scan.starts_with(b"+++") => DelimKind::Toml,
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
let marker = kind.marker();
|
||||
|
||||
// Opening line: marker, then optional horizontal whitespace, then EOL.
|
||||
// `line_end_after_marker` returns `None` if a non-whitespace, non-EOL
|
||||
// byte follows the marker — that's not a valid frontmatter opener.
|
||||
let (_open_line_end, after_open_eol) = line_end_after_marker(scan, marker.len())?;
|
||||
|
||||
let inner_start_in_scan = after_open_eol;
|
||||
|
||||
// Walk lines looking for a closing marker line. A line counts as a
|
||||
// closer if `trim_ascii_end` of it equals the marker.
|
||||
let mut i = after_open_eol;
|
||||
while i < scan.len() {
|
||||
let line_start = i;
|
||||
let nl_pos = scan[line_start..]
|
||||
.iter()
|
||||
.position(|&b| b == b'\n')
|
||||
.map(|p| line_start + p);
|
||||
let line_content_end = match nl_pos {
|
||||
Some(p) => {
|
||||
// Trim trailing \r if present (CRLF).
|
||||
if p > line_start && scan[p - 1] == b'\r' {
|
||||
p - 1
|
||||
} else {
|
||||
p
|
||||
}
|
||||
}
|
||||
None => scan.len(),
|
||||
};
|
||||
|
||||
let line = &scan[line_start..line_content_end];
|
||||
if trim_ascii_end(line) == marker {
|
||||
// Inner ends at the byte before this closing line's start; the
|
||||
// EOL that terminates the previous content line is part of that
|
||||
// line, not of the YAML/TOML payload, so strip one EOL.
|
||||
//
|
||||
// Clamp to `inner_start_in_scan` — when the frontmatter is
|
||||
// empty (`---\n---\n`), the closing line sits directly after
|
||||
// the opening's EOL and there is no preceding content line to
|
||||
// strip from.
|
||||
let inner_end_in_scan =
|
||||
strip_one_trailing_eol(scan, line_start).max(inner_start_in_scan);
|
||||
|
||||
// span.end: just past the closing line's trailing newline (or
|
||||
// EOF if the file ends without one).
|
||||
let span_end_in_scan = match nl_pos {
|
||||
Some(p) => p + 1,
|
||||
None => scan.len(),
|
||||
};
|
||||
|
||||
return Some((
|
||||
kind,
|
||||
FrontmatterSpan {
|
||||
start: bom_offset,
|
||||
end: span_end_in_scan + bom_offset,
|
||||
},
|
||||
(inner_start_in_scan + bom_offset)..(inner_end_in_scan + bom_offset),
|
||||
));
|
||||
}
|
||||
|
||||
match nl_pos {
|
||||
Some(p) => i = p + 1,
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
// No closing delimiter — not a frontmatter block.
|
||||
None
|
||||
}
|
||||
|
||||
/// Locate the end of the opening delimiter line.
///
/// Scanning starts at byte `start` (callers pass `marker.len()`) and skips
/// spaces and tabs until an EOL is reached.
///
/// Returns `Some((line_content_end, after_eol))` where:
/// * `line_content_end` is the index of the `\n`, or of the `\r` when the
///   line ends in `\r\n`;
/// * `after_eol` is the index of the first byte of the following line.
///
/// Returns `None` when any other byte (including a lone `\r`) appears before
/// the EOL, or when the input ends without an EOL.
fn line_end_after_marker(scan: &[u8], start: usize) -> Option<(usize, usize)> {
    for (idx, &byte) in scan.iter().enumerate().skip(start) {
        match byte {
            // Trailing blanks after the marker are tolerated.
            b' ' | b'\t' => continue,
            b'\n' => return Some((idx, idx + 1)),
            b'\r' if scan.get(idx + 1) == Some(&b'\n') => return Some((idx, idx + 2)),
            // Anything else means this is not a bare delimiter line.
            _ => return None,
        }
    }
    None
}
|
||||
|
||||
/// Return `bs` with trailing spaces and tabs removed.
///
/// Named after `[u8]::trim_ascii_end` (which requires Rust 1.80) and kept
/// local for MSRV portability. Only `b' '` and `b'\t'` are stripped here;
/// other whitespace bytes such as `\n` or `\r` are left in place.
fn trim_ascii_end(bs: &[u8]) -> &[u8] {
    // Index one past the last byte that is neither a space nor a tab.
    let keep = bs
        .iter()
        .rposition(|&b| !matches!(b, b' ' | b'\t'))
        .map_or(0, |last| last + 1);
    &bs[..keep]
}
|
||||
|
||||
/// Step back over at most one EOL sequence that ends immediately before `pos`.
///
/// `pos` is expected to be the start of a line. If the bytes just before it
/// are `\r\n` the result is `pos - 2`; if only `\n`, `pos - 1`; otherwise
/// `pos` is returned unchanged. This removes exactly one terminator from the
/// previous line so the inner payload does not include the newline that
/// precedes the closing delimiter.
fn strip_one_trailing_eol(scan: &[u8], pos: usize) -> usize {
    match &scan[..pos] {
        [.., b'\r', b'\n'] => pos - 2,
        [.., b'\n'] => pos - 1,
        _ => pos,
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw frontmatter (parsed shape, before §0 Q9 derive)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Untyped frontmatter view. Known fields are pulled by name, unknowns flow
|
||||
/// into `extra`. We deliberately use `serde_json::Value` everywhere so YAML
|
||||
/// and TOML go through the same downstream pipeline.
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
struct RawFrontmatter {
|
||||
#[serde(default)]
|
||||
title: Option<String>,
|
||||
#[serde(default)]
|
||||
aliases: Option<Vec<String>>,
|
||||
#[serde(default)]
|
||||
tags: Option<Vec<String>>,
|
||||
#[serde(default)]
|
||||
lang: Option<String>,
|
||||
#[serde(default)]
|
||||
created_at: Option<String>,
|
||||
#[serde(default)]
|
||||
updated_at: Option<String>,
|
||||
#[serde(default)]
|
||||
source_type: Option<String>,
|
||||
#[serde(default)]
|
||||
trust_level: Option<String>,
|
||||
/// `id:` field is captured as an alias only — never feeds doc_id (§4.2).
|
||||
#[serde(default)]
|
||||
id: Option<String>,
|
||||
/// Catch-all for unknown keys → `metadata.user`.
|
||||
#[serde(flatten)]
|
||||
extra: Map<String, Value>,
|
||||
}
|
||||
|
||||
fn parse_raw(kind: DelimKind, slice: &str) -> Result<RawFrontmatter, String> {
|
||||
match kind {
|
||||
DelimKind::Yaml => {
|
||||
// Empty YAML frontmatter is legal (parses to null) — handle
|
||||
// explicitly so `serde_yaml_ng` doesn't fail trying to deserialize
|
||||
// null into a struct.
|
||||
if slice.trim().is_empty() {
|
||||
return Ok(RawFrontmatter::default());
|
||||
}
|
||||
serde_yaml_ng::from_str::<RawFrontmatter>(slice).map_err(|e| e.to_string())
|
||||
}
|
||||
DelimKind::Toml => {
|
||||
if slice.trim().is_empty() {
|
||||
return Ok(RawFrontmatter::default());
|
||||
}
|
||||
toml::from_str::<RawFrontmatter>(slice).map_err(|e| e.to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// §0 Q9 derive table
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn derive_metadata(
|
||||
raw: Option<RawFrontmatter>,
|
||||
hints: &BodyHints,
|
||||
body: &[u8],
|
||||
warnings: &mut Vec<Warning>,
|
||||
) -> Metadata {
|
||||
let raw = raw.unwrap_or_default();
|
||||
|
||||
// user map starts from the unknown-key overflow.
|
||||
let mut user = raw.extra;
|
||||
|
||||
// ---- title ----
|
||||
// Frontmatter → BodyHints.first_h1 → None.
|
||||
// Filename fallback is the caller's responsibility (P1-4 normalize), per
|
||||
// task brief — `BodyHints` does not carry a filename.
|
||||
let title = raw.title.or_else(|| hints.first_h1.clone());
|
||||
if let Some(t) = title {
|
||||
user.insert("title".to_string(), Value::String(t));
|
||||
}
|
||||
|
||||
// ---- aliases / tags ----
|
||||
let aliases = raw.aliases.unwrap_or_default();
|
||||
let tags = raw.tags.unwrap_or_default();
|
||||
|
||||
// ---- lang ----
|
||||
// Frontmatter → lingua autodetect (first 4 KB of body) → fallback_lang → "und".
|
||||
// The lang field is not on Metadata (§3.6) — store it under user.lang.
|
||||
let lang = raw
|
||||
.lang
|
||||
.or_else(|| detect_lang(body))
|
||||
.or_else(|| hints.fallback_lang.clone())
|
||||
.unwrap_or_else(|| "und".to_string());
|
||||
user.insert("lang".to_string(), Value::String(lang));
|
||||
|
||||
// ---- timestamps ----
|
||||
let mut original_timestamps: Map<String, Value> = Map::new();
|
||||
let created_at = parse_ts(
|
||||
raw.created_at.as_deref(),
|
||||
"created_at",
|
||||
&mut original_timestamps,
|
||||
warnings,
|
||||
)
|
||||
.unwrap_or(hints.fs_ctime);
|
||||
let updated_at = parse_ts(
|
||||
raw.updated_at.as_deref(),
|
||||
"updated_at",
|
||||
&mut original_timestamps,
|
||||
warnings,
|
||||
)
|
||||
.unwrap_or(hints.fs_mtime);
|
||||
if !original_timestamps.is_empty() {
|
||||
user.insert(
|
||||
"original_timestamps".to_string(),
|
||||
Value::Object(original_timestamps),
|
||||
);
|
||||
}
|
||||
|
||||
// ---- source_type ----
|
||||
let source_type = match raw.source_type.as_deref() {
|
||||
None => SourceType::Markdown,
|
||||
Some(s) => match parse_source_type(s) {
|
||||
Some(st) => st,
|
||||
None => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("unknown source_type={s}, defaulted to markdown"),
|
||||
});
|
||||
SourceType::Markdown
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// ---- trust_level ----
|
||||
let trust_level = match raw.trust_level.as_deref() {
|
||||
None => TrustLevel::Primary,
|
||||
Some(s) => match parse_trust_level(s) {
|
||||
Some(tl) => tl,
|
||||
None => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("unknown trust_level={s}, defaulted to primary"),
|
||||
});
|
||||
TrustLevel::Primary
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// ---- id alias ----
|
||||
// `id:` field becomes `metadata.user_id_alias` only (spec §"Behavior
|
||||
// contract" line 74). It is NOT mirrored into the user map.
|
||||
let user_id_alias = raw.id;
|
||||
|
||||
Metadata {
|
||||
aliases,
|
||||
tags,
|
||||
created_at,
|
||||
updated_at,
|
||||
source_type,
|
||||
trust_level,
|
||||
user_id_alias,
|
||||
user,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_source_type(s: &str) -> Option<SourceType> {
|
||||
// Mirror the lowercase serde rename used on SourceType.
|
||||
match s {
|
||||
"markdown" => Some(SourceType::Markdown),
|
||||
"note" => Some(SourceType::Note),
|
||||
"paper" => Some(SourceType::Paper),
|
||||
"reference" => Some(SourceType::Reference),
|
||||
"inbox" => Some(SourceType::Inbox),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_trust_level(s: &str) -> Option<TrustLevel> {
|
||||
match s {
|
||||
"primary" => Some(TrustLevel::Primary),
|
||||
"secondary" => Some(TrustLevel::Secondary),
|
||||
"generated" => Some(TrustLevel::Generated),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse an RFC 3339 timestamp string and normalize to UTC. If the original
|
||||
/// offset was non-UTC, push it into `original_timestamps[field]` per §0 Q9.
|
||||
/// Returns `None` if the input is missing OR malformed (in which case a
|
||||
/// warning is emitted).
|
||||
fn parse_ts(
|
||||
s: Option<&str>,
|
||||
field: &str,
|
||||
original_timestamps: &mut Map<String, Value>,
|
||||
warnings: &mut Vec<Warning>,
|
||||
) -> Option<OffsetDateTime> {
|
||||
let s = s?;
|
||||
match OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339) {
|
||||
Ok(dt) => {
|
||||
if dt.offset() != time::UtcOffset::UTC {
|
||||
original_timestamps.insert(field.to_string(), Value::String(s.to_string()));
|
||||
}
|
||||
Some(dt.to_offset(time::UtcOffset::UTC))
|
||||
}
|
||||
Err(e) => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: format!("malformed {field}={s:?}: {e}"),
|
||||
});
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Lingua detector (cached statically — first init is heavy)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn detector() -> &'static LanguageDetector {
|
||||
static DETECTOR: OnceLock<LanguageDetector> = OnceLock::new();
|
||||
DETECTOR.get_or_init(|| {
|
||||
// Keep the language set narrow: matches the cargo features we enable
|
||||
// on the `lingua` dep. Adding more languages here without enabling
|
||||
// their feature flag will fail to compile.
|
||||
LanguageDetectorBuilder::from_languages(&[
|
||||
Language::English,
|
||||
Language::Korean,
|
||||
Language::Japanese,
|
||||
Language::Chinese,
|
||||
])
|
||||
.build()
|
||||
})
|
||||
}
|
||||
|
||||
/// Run lingua autodetect on the first 4 KB of body. Returns an ISO 639-1
|
||||
/// two-letter code (lowercase) on success.
|
||||
///
|
||||
/// Note: lingua needs reasonably long input to be confident. Empty / very
|
||||
/// short bodies return `None` so we fall through to the next derive step.
|
||||
fn detect_lang(body: &[u8]) -> Option<String> {
|
||||
const WINDOW: usize = 4 * 1024;
|
||||
if body.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let n = body.len().min(WINDOW);
|
||||
// Find a UTF-8-safe slice end ≤ n. Walk back at most 4 bytes.
|
||||
let mut end = n;
|
||||
while end > 0 && std::str::from_utf8(&body[..end]).is_err() {
|
||||
end -= 1;
|
||||
}
|
||||
if end == 0 {
|
||||
return None;
|
||||
}
|
||||
let s = std::str::from_utf8(&body[..end]).ok()?;
|
||||
if s.trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
let lang = detector().detect_language_of(s)?;
|
||||
Some(iso_code(lang).to_string())
|
||||
}
|
||||
|
||||
fn iso_code(lang: Language) -> &'static str {
|
||||
// `lingua::IsoCode639_1` is gated by the language features enabled on the
|
||||
// crate — only the variants below are compiled into our build, so this
|
||||
// match is exhaustive for the configured detector.
|
||||
match lang.iso_code_639_1() {
|
||||
IsoCode639_1::EN => "en",
|
||||
IsoCode639_1::KO => "ko",
|
||||
IsoCode639_1::JA => "ja",
|
||||
IsoCode639_1::ZH => "zh",
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kb_core::{
|
||||
AssetId, WorkspacePath,
|
||||
ids::id_for_doc,
|
||||
versions::ParserVersion,
|
||||
};
|
||||
use time::macros::datetime;
|
||||
|
||||
fn hints() -> BodyHints {
|
||||
BodyHints {
|
||||
first_h1: None,
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn yaml_happy_path() {
|
||||
let md = b"---\n\
|
||||
title: My Doc\n\
|
||||
aliases: [a, b]\n\
|
||||
tags: [t1, t2]\n\
|
||||
lang: en\n\
|
||||
created_at: 2024-03-01T00:00:00Z\n\
|
||||
updated_at: 2024-03-02T00:00:00Z\n\
|
||||
source_type: note\n\
|
||||
trust_level: secondary\n\
|
||||
---\nbody\n";
|
||||
|
||||
let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
let span = span.expect("span present");
|
||||
assert_eq!(span.start, 0);
|
||||
assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]);
|
||||
assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]);
|
||||
assert_eq!(meta.source_type, SourceType::Note);
|
||||
assert_eq!(meta.trust_level, TrustLevel::Secondary);
|
||||
assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
|
||||
assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC));
|
||||
assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("My Doc"));
|
||||
assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
|
||||
assert_eq!(meta.user_id_alias, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn toml_happy_path() {
|
||||
let md = b"+++\n\
|
||||
title = \"My Doc\"\n\
|
||||
aliases = [\"a\", \"b\"]\n\
|
||||
tags = [\"t1\", \"t2\"]\n\
|
||||
lang = \"en\"\n\
|
||||
created_at = \"2024-03-01T00:00:00Z\"\n\
|
||||
updated_at = \"2024-03-02T00:00:00Z\"\n\
|
||||
source_type = \"note\"\n\
|
||||
trust_level = \"secondary\"\n\
|
||||
+++\nbody\n";
|
||||
|
||||
let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
assert!(span.is_some());
|
||||
assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]);
|
||||
assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]);
|
||||
assert_eq!(meta.source_type, SourceType::Note);
|
||||
assert_eq!(meta.trust_level, TrustLevel::Secondary);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_keys_preserved_in_user() {
|
||||
let md = b"---\n\
|
||||
title: Doc\n\
|
||||
custom_field: hello\n\
|
||||
nested: {a: 1}\n\
|
||||
---\n";
|
||||
let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
assert_eq!(
|
||||
meta.user.get("custom_field").and_then(|v| v.as_str()),
|
||||
Some("hello")
|
||||
);
|
||||
assert!(meta.user.get("nested").is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_enum_value_warns_and_defaults() {
|
||||
let md = b"---\n\
|
||||
trust_level: weird\n\
|
||||
source_type: alien\n\
|
||||
---\n";
|
||||
let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert_eq!(meta.trust_level, TrustLevel::Primary);
|
||||
assert_eq!(meta.source_type, SourceType::Markdown);
|
||||
assert_eq!(warns.len(), 2);
|
||||
assert!(warns.iter().all(|w| matches!(w.kind, WarningKind::MalformedFrontmatter)));
|
||||
assert!(warns.iter().any(|w| w.note.contains("trust_level=weird")));
|
||||
assert!(warns.iter().any(|w| w.note.contains("source_type=alien")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn malformed_yaml_emits_warning_and_defaults() {
|
||||
// Unclosed quote → YAML parse fails.
|
||||
let md = b"---\ntitle: \"unterminated\n---\n";
|
||||
let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(span.is_some(), "span still reflects delim region");
|
||||
assert_eq!(warns.len(), 1);
|
||||
assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter));
|
||||
// Body fallbacks applied.
|
||||
assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC));
|
||||
assert_eq!(meta.updated_at, datetime!(2024-01-02 00:00:00 UTC));
|
||||
assert_eq!(meta.source_type, SourceType::Markdown);
|
||||
assert_eq!(meta.trust_level, TrustLevel::Primary);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_frontmatter_uses_body_hints_silently() {
|
||||
let md = b"# Just a heading\n\nsome body";
|
||||
let mut h = hints();
|
||||
h.first_h1 = Some("Just a heading".to_string());
|
||||
h.fallback_lang = Some("en".to_string());
|
||||
let (meta, span, warns) = parse_frontmatter(md, &h).unwrap();
|
||||
assert!(span.is_none());
|
||||
assert!(warns.is_empty());
|
||||
assert_eq!(
|
||||
meta.user.get("title").and_then(|v| v.as_str()),
|
||||
Some("Just a heading")
|
||||
);
|
||||
// Body too short for confident lingua autodetect → fallback_lang.
|
||||
assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
|
||||
}
|
||||
|
||||
/// `id:` field MUST NOT influence `doc_id` (design §4.2).
///
/// Two things are pinned here:
/// 1. The frontmatter `id` surfaces only as `user_id_alias` and is not
///    copied into the `user` map.
/// 2. `id_for_doc` is stable for a fixed `(path, asset, parser_version)`
///    triple. Its signature takes no metadata, so the `id:` field has no
///    route into the doc id; the second half is a stability check rather
///    than a with/without comparison.
#[test]
fn id_field_does_not_feed_doc_id() {
    let with_id = b"---\nid: my-handle\ntitle: Doc\n---\n";
    let without = b"---\ntitle: Doc\n---\n";

    let (meta_with, _, _) = parse_frontmatter(with_id, &hints()).unwrap();
    let (meta_without, _, _) = parse_frontmatter(without, &hints()).unwrap();

    assert_eq!(meta_with.user_id_alias.as_deref(), Some("my-handle"));
    assert_eq!(meta_without.user_id_alias, None);
    assert!(
        meta_with.user.get("id").is_none(),
        "id must not be mirrored into the user map"
    );

    let asset = AssetId("0123456789abcdef0123456789abcdef".to_string());
    let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
    let pv = ParserVersion("pulldown-cmark-0.x".to_string());

    let id_a = id_for_doc(&path, &asset, &pv);
    let id_b = id_for_doc(&path, &asset, &pv);
    assert_eq!(
        id_a, id_b,
        "id_for_doc must be stable across runs and not see metadata"
    );
    // Sanity: the recipe takes (workspace_path, asset_id, parser_version)
    // only — there is literally no parameter to plumb metadata through.
}
|
||||
|
||||
#[test]
|
||||
fn non_utc_timestamp_preserved_in_user_original_timestamps() {
|
||||
let md = b"---\ncreated_at: 2024-01-15T10:00:00+09:00\n---\n";
|
||||
let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
// Normalized to UTC.
|
||||
assert_eq!(meta.created_at, datetime!(2024-01-15 01:00:00 UTC));
|
||||
let orig = meta
|
||||
.user
|
||||
.get("original_timestamps")
|
||||
.and_then(|v| v.as_object())
|
||||
.expect("original_timestamps map present");
|
||||
assert_eq!(
|
||||
orig.get("created_at").and_then(|v| v.as_str()),
|
||||
Some("2024-01-15T10:00:00+09:00")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn malformed_timestamp_warns_and_falls_back() {
|
||||
let md = b"---\ncreated_at: not-a-date\n---\n";
|
||||
let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert_eq!(warns.len(), 1);
|
||||
assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter));
|
||||
assert!(warns[0].note.contains("created_at"));
|
||||
// Fallback to fs_ctime.
|
||||
assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_no_match_without_leading_marker() {
|
||||
assert!(detect_delimiters(b"# heading\n---\n---\n").is_none());
|
||||
assert!(detect_delimiters(b" ---\n---\n").is_none(), "leading whitespace");
|
||||
assert!(detect_delimiters(b"").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_yaml_basic() {
|
||||
let bytes = b"---\nfoo: bar\n---\nbody\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Yaml);
|
||||
assert_eq!(span.start, 0);
|
||||
// body starts at "body\n" — the closing "---\n" is part of the span.
|
||||
assert_eq!(&bytes[span.end..], b"body\n");
|
||||
// inner range covers exactly "foo: bar" (no surrounding EOL).
|
||||
assert_eq!(&bytes[inner], b"foo: bar");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_toml_basic() {
|
||||
let bytes = b"+++\nfoo = \"bar\"\n+++\nbody\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Toml);
|
||||
assert_eq!(&bytes[span.end..], b"body\n");
|
||||
assert_eq!(&bytes[inner], b"foo = \"bar\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_unterminated_returns_none() {
|
||||
// `---\n` then no closing — treat as no frontmatter.
|
||||
let bytes = b"---\nfoo: bar\n";
|
||||
assert!(detect_delimiters(bytes).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_yaml_frontmatter_is_legal() {
|
||||
let md = b"---\n---\nbody\n";
|
||||
let (_meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
|
||||
assert!(span.is_some());
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lingua_detects_korean_and_english() {
|
||||
let ko = "안녕하세요. 이것은 한국어로 작성된 문서입니다. 형태소 분석은 어렵습니다. 그러나 lingua는 잘 동작합니다.".as_bytes();
|
||||
let en = "Hello there. This document is written in English. The lingua language detector is statistical and works on short text too, given enough words.".as_bytes();
|
||||
assert_eq!(detect_lang(ko).as_deref(), Some("ko"));
|
||||
assert_eq!(detect_lang(en).as_deref(), Some("en"));
|
||||
}
|
||||
|
||||
// ---- C1: CRLF line endings ------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_crlf_yaml() {
|
||||
let bytes = b"---\r\ntitle: Doc\r\n---\r\nbody\r\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Yaml);
|
||||
assert_eq!(span.start, 0);
|
||||
// span ends just past the CRLF after the closing "---".
|
||||
assert_eq!(&bytes[span.end..], b"body\r\n");
|
||||
// Inner is exactly the YAML payload, sans surrounding EOLs.
|
||||
assert_eq!(&bytes[inner], b"title: Doc");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_crlf_toml() {
|
||||
let bytes = b"+++\r\ntitle = \"Doc\"\r\n+++\r\nbody\r\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Toml);
|
||||
assert_eq!(&bytes[span.end..], b"body\r\n");
|
||||
assert_eq!(&bytes[inner], b"title = \"Doc\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_frontmatter_crlf_yaml_end_to_end() {
|
||||
let bytes = b"---\r\n\
|
||||
title: Doc\r\n\
|
||||
created_at: 2024-03-01T00:00:00Z\r\n\
|
||||
updated_at: 2024-03-02T00:00:00Z\r\n\
|
||||
---\r\nbody\r\n";
|
||||
let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
assert!(span.is_some());
|
||||
assert_eq!(
|
||||
meta.user.get("title").and_then(|v| v.as_str()),
|
||||
Some("Doc")
|
||||
);
|
||||
assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
|
||||
assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_frontmatter_crlf_toml_end_to_end() {
|
||||
let bytes = b"+++\r\n\
|
||||
title = \"Doc\"\r\n\
|
||||
created_at = \"2024-03-01T00:00:00Z\"\r\n\
|
||||
+++\r\nbody\r\n";
|
||||
let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
assert!(span.is_some());
|
||||
assert_eq!(
|
||||
meta.user.get("title").and_then(|v| v.as_str()),
|
||||
Some("Doc")
|
||||
);
|
||||
assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
|
||||
}
|
||||
|
||||
/// Mixed-EOL input: opening uses `\n`, closing uses `\r\n` (or vice
|
||||
/// versa). Policy: each line is considered independently, so any
|
||||
/// combination of LF / CRLF parses correctly. This keeps tools that
|
||||
/// edit only one end of a file (e.g. an editor that auto-wraps the
|
||||
/// last line) from breaking otherwise-valid frontmatter.
|
||||
#[test]
|
||||
fn parse_frontmatter_mixed_lf_crlf() {
|
||||
// Opening LF, closing CRLF.
|
||||
let a = b"---\ntitle: A\n---\r\nbody\n";
|
||||
let (meta, _span, warns) = parse_frontmatter(a, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "case A warnings: {warns:?}");
|
||||
assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("A"));
|
||||
|
||||
// Opening CRLF, closing LF.
|
||||
let b = b"---\r\ntitle: B\r\n---\nbody\n";
|
||||
let (meta, _span, warns) = parse_frontmatter(b, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "case B warnings: {warns:?}");
|
||||
assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("B"));
|
||||
}
|
||||
|
||||
// ---- I1: trailing whitespace on delimiter lines ---------------------------
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_yaml_with_trailing_whitespace_on_opener() {
|
||||
let bytes = b"--- \ntitle: x\n---\nbody\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Yaml);
|
||||
assert_eq!(span.start, 0);
|
||||
assert_eq!(&bytes[span.end..], b"body\n");
|
||||
assert_eq!(&bytes[inner], b"title: x");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_yaml_with_trailing_whitespace_on_closer() {
|
||||
let bytes = b"---\ntitle: x\n--- \nbody\n";
|
||||
let (kind, span, inner) = detect_delimiters(bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Yaml);
|
||||
assert_eq!(&bytes[span.end..], b"body\n");
|
||||
assert_eq!(&bytes[inner], b"title: x");
|
||||
}
|
||||
|
||||
/// A tab after the opening marker is tolerated like a trailing space: the
/// block is still detected and the inner range excludes the opener line.
#[test]
fn detect_delimiters_yaml_with_tabs_on_delimiter_line() {
    let bytes = b"---\t\ntitle: x\n---\nbody\n";
    let (kind, span, inner) = detect_delimiters(bytes).unwrap();
    assert_eq!(kind, DelimKind::Yaml);
    assert_eq!(&bytes[span.end..], b"body\n");
    // The tab belongs to the opener line, not to the payload.
    assert_eq!(&bytes[inner], b"title: x");
}
|
||||
|
||||
// ---- I2: UTF-8 BOM at file start ------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn detect_delimiters_yaml_with_leading_bom() {
|
||||
let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice());
|
||||
bytes.extend_from_slice(b"---\ntitle: Doc\n---\nbody\n");
|
||||
let (kind, span, inner) = detect_delimiters(&bytes).unwrap();
|
||||
assert_eq!(kind, DelimKind::Yaml);
|
||||
// Span starts after the BOM (byte 3), not at byte 0.
|
||||
assert_eq!(span.start, 3);
|
||||
// Body slicing using span.end gives the original bytes after the
|
||||
// closing delimiter — no BOM bookkeeping required by callers.
|
||||
assert_eq!(&bytes[span.end..], b"body\n");
|
||||
assert_eq!(&bytes[inner], b"title: Doc");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_frontmatter_with_leading_bom_full_pipeline() {
|
||||
let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice());
|
||||
bytes.extend_from_slice(
|
||||
b"---\n\
|
||||
title: Doc\n\
|
||||
lang: en\n\
|
||||
created_at: 2024-03-01T00:00:00Z\n\
|
||||
---\nbody\n",
|
||||
);
|
||||
let (meta, span, warns) = parse_frontmatter(&bytes, &hints()).unwrap();
|
||||
assert!(warns.is_empty(), "warnings: {warns:?}");
|
||||
let span = span.expect("span present");
|
||||
assert_eq!(span.start, 3);
|
||||
assert_eq!(
|
||||
meta.user.get("title").and_then(|v| v.as_str()),
|
||||
Some("Doc")
|
||||
);
|
||||
assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
|
||||
assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
|
||||
}
|
||||
|
||||
/// BOM-shaped bytes appearing later in the input are NOT stripped — only
|
||||
/// a BOM at byte 0 of the original input is honoured.
|
||||
#[test]
|
||||
fn detect_delimiters_does_not_strip_mid_input_bom() {
|
||||
// Leading byte is `#`, then a BOM, then a delimiter — there is no
|
||||
// frontmatter here regardless of whether we strip BOM, but pin the
|
||||
// behaviour: detection still fails (no leading marker).
|
||||
let mut bytes = Vec::from(b"# heading\n".as_slice());
|
||||
bytes.extend_from_slice(&[0xEF, 0xBB, 0xBF]);
|
||||
bytes.extend_from_slice(b"---\nfoo: bar\n---\n");
|
||||
assert!(detect_delimiters(&bytes).is_none());
|
||||
}
|
||||
}
|
||||
19
crates/kb-parse-md/src/lib.rs
Normal file
19
crates/kb-parse-md/src/lib.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
|
||||
//!
|
||||
//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
|
||||
//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
|
||||
//!
|
||||
//! Public surface for P1-2 is intentionally narrow:
|
||||
//!
|
||||
//! * [`parse_frontmatter`] — pure function from Markdown bytes to
|
||||
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
|
||||
//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
|
||||
//! table when frontmatter is missing or partial.
|
||||
//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
|
||||
//! input slice (returned by [`parse_frontmatter`]).
|
||||
//!
|
||||
//! Anything else in this crate is `pub(crate)` and may change without notice.
|
||||
|
||||
pub mod frontmatter;
|
||||
|
||||
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
|
||||
111
crates/kb-parse-md/tests/frontmatter_snapshots.rs
Normal file
111
crates/kb-parse-md/tests/frontmatter_snapshots.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
//! Snapshot tests pinning the §0 Q9 derive output for two fixtures.
|
||||
//!
|
||||
//! The baseline JSON next to each fixture is hand-authored / regenerated
|
||||
//! from a deterministic run. `BodyHints` timestamps are caller-provided
|
||||
//! and therefore stable; lingua autodetect over our fixtures is also
|
||||
//! stable for the language set we configured.
|
||||
|
||||
use kb_parse_md::{BodyHints, parse_frontmatter};
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use time::macros::datetime;
|
||||
|
||||
/// Stable view of the parser output suitable for JSON snapshotting.
|
||||
/// We deliberately exclude `FrontmatterSpan` byte offsets here too — they're
|
||||
/// fully determined by the input file and are exercised by unit tests; the
|
||||
/// snapshot focuses on the §0 Q9 derive contract.
|
||||
#[derive(Serialize)]
|
||||
struct Snapshot {
|
||||
metadata: kb_core::Metadata,
|
||||
span_present: bool,
|
||||
warnings: Vec<kb_parse_types::Warning>,
|
||||
}
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("markdown")
|
||||
}
|
||||
|
||||
fn pinned_hints() -> BodyHints {
|
||||
BodyHints {
|
||||
first_h1: None,
|
||||
fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
|
||||
fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
|
||||
fallback_lang: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_snapshot(fixture: &str, baseline: &str) {
|
||||
let dir = fixtures_dir();
|
||||
let bytes = fs::read(dir.join(fixture)).expect("fixture readable");
|
||||
|
||||
let (meta, span, warns) = parse_frontmatter(&bytes, &pinned_hints()).unwrap();
|
||||
let snap = Snapshot {
|
||||
metadata: meta,
|
||||
span_present: span.is_some(),
|
||||
warnings: warns,
|
||||
};
|
||||
let actual: Value = serde_json::to_value(&snap).unwrap();
|
||||
|
||||
let expected_text =
|
||||
fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable");
|
||||
let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"snapshot drift for {fixture}\n\
|
||||
--- expected ({baseline}) ---\n{expected_text}\n\
|
||||
--- actual ---\n{actual_pretty}\n\
|
||||
If the change is intentional, update {baseline}."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn frontmatter_only_snapshot() {
|
||||
assert_snapshot("frontmatter-only.md", "frontmatter-only.snapshot.json");
|
||||
}
|
||||
|
||||
/// Run with `cargo test -p kb-parse-md --test frontmatter_snapshots emit_snapshots -- --ignored --nocapture`
/// to regenerate the baseline JSON files from the current parser output.
///
/// Each baseline is overwritten with pretty-printed JSON plus a trailing
/// newline, and its path is echoed to stderr.
#[test]
#[ignore]
fn emit_snapshots() {
    const PAIRS: [(&str, &str); 2] = [
        ("frontmatter-only.md", "frontmatter-only.snapshot.json"),
        ("mixed-lang.md", "mixed-lang.snapshot.json"),
    ];

    let dir = fixtures_dir();
    for (fixture, baseline) in PAIRS {
        let fixture_bytes = fs::read(dir.join(fixture)).unwrap();
        let (metadata, span, warnings) =
            parse_frontmatter(&fixture_bytes, &pinned_hints()).unwrap();
        let snap = Snapshot {
            metadata,
            span_present: span.is_some(),
            warnings,
        };

        let json = serde_json::to_string_pretty(&snap).unwrap();
        let target = dir.join(baseline);
        fs::write(&target, format!("{json}\n")).unwrap();
        eprintln!("wrote {}", target.display());
    }
}
|
||||
|
||||
/// Snapshot check for the fixture that has no frontmatter and a body mixing
/// Korean and English text.
#[test]
fn mixed_lang_snapshot() {
    let (fixture, baseline) = ("mixed-lang.md", "mixed-lang.snapshot.json");
    assert_snapshot(fixture, baseline);
}
|
||||
|
||||
/// Determinism: parsing the same fixture twice in a row must give equal output.
///
/// The whole snapshot is compared (metadata, whether a frontmatter span was
/// found, and warnings), not just the metadata. Both fixtures are covered:
/// `mixed-lang.md` has no `lang:` field, so it is the one that goes through
/// language auto-detection.
///
/// Note this only checks repeatability within a single test process.
#[test]
fn snapshot_is_deterministic_across_runs() {
    let dir = fixtures_dir();

    for fixture in ["frontmatter-only.md", "mixed-lang.md"] {
        let bytes = fs::read(dir.join(fixture)).expect("fixture readable");

        // One full parse, rendered to JSON so the comparison does not require
        // `PartialEq` on the parser's own types.
        let render = || {
            let (metadata, span, warnings) =
                parse_frontmatter(&bytes, &pinned_hints()).unwrap();
            serde_json::to_value(&Snapshot {
                metadata,
                span_present: span.is_some(),
                warnings,
            })
            .unwrap()
        };

        let first = render();
        let second = render();
        assert_eq!(
            first, second,
            "parse output differs between consecutive runs for {fixture}"
        );
    }
}
|
||||
22
fixtures/markdown/frontmatter-only.md
Normal file
22
fixtures/markdown/frontmatter-only.md
Normal file
@@ -0,0 +1,22 @@
|
||||
---
|
||||
title: Frontmatter Only
|
||||
aliases:
|
||||
- fm-only
|
||||
- first-fixture
|
||||
tags:
|
||||
- parse
|
||||
- test
|
||||
lang: en
|
||||
created_at: 2024-01-15T10:00:00+09:00
|
||||
updated_at: 2024-02-20T08:30:00Z
|
||||
source_type: note
|
||||
trust_level: secondary
|
||||
id: my-stable-handle
|
||||
custom_field: hello
|
||||
nested_obj:
|
||||
key: value
|
||||
---
|
||||
|
||||
# Body Heading
|
||||
|
||||
Body paragraph.
|
||||
30
fixtures/markdown/frontmatter-only.snapshot.json
Normal file
30
fixtures/markdown/frontmatter-only.snapshot.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"metadata": {
|
||||
"aliases": [
|
||||
"fm-only",
|
||||
"first-fixture"
|
||||
],
|
||||
"tags": [
|
||||
"parse",
|
||||
"test"
|
||||
],
|
||||
"created_at": "2024-01-15T01:00:00Z",
|
||||
"updated_at": "2024-02-20T08:30:00Z",
|
||||
"source_type": "note",
|
||||
"trust_level": "secondary",
|
||||
"user_id_alias": "my-stable-handle",
|
||||
"user": {
|
||||
"custom_field": "hello",
|
||||
"lang": "en",
|
||||
"nested_obj": {
|
||||
"key": "value"
|
||||
},
|
||||
"original_timestamps": {
|
||||
"created_at": "2024-01-15T10:00:00+09:00"
|
||||
},
|
||||
"title": "Frontmatter Only"
|
||||
}
|
||||
},
|
||||
"span_present": true,
|
||||
"warnings": []
|
||||
}
|
||||
9
fixtures/markdown/mixed-lang.md
Normal file
9
fixtures/markdown/mixed-lang.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Mixed Language Note
|
||||
|
||||
이 문서는 한국어와 영어가 섞여 있습니다. The body has both Korean
|
||||
sentences and English sentences. lingua는 통계적 언어 감지기를 제공합니다.
|
||||
This is to test that auto-detect picks one of `ko` or `en` deterministically
|
||||
when no `lang:` field is present in the frontmatter.
|
||||
|
||||
본문은 첫 4 KB만 분석되지만, 짧은 문서에서도 잘 동작해야 합니다.
|
||||
The detector should pick the dominant language across the sample window.
|
||||
16
fixtures/markdown/mixed-lang.snapshot.json
Normal file
16
fixtures/markdown/mixed-lang.snapshot.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"metadata": {
|
||||
"aliases": [],
|
||||
"tags": [],
|
||||
"created_at": "2024-01-01T00:00:00Z",
|
||||
"updated_at": "2024-01-02T00:00:00Z",
|
||||
"source_type": "markdown",
|
||||
"trust_level": "primary",
|
||||
"user_id_alias": null,
|
||||
"user": {
|
||||
"lang": "en"
|
||||
}
|
||||
},
|
||||
"span_present": false,
|
||||
"warnings": []
|
||||
}
|
||||
Reference in New Issue
Block a user