From a86b463fc4cca75962d18da73351aff93bac4ee2 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:55:20 +0000 Subject: [PATCH 1/6] p1-2: scaffold kb-parse-md crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the workspace member with the dep allow-list pinned by design §0 Q9 and the task spec. P1-2 will land the frontmatter submodule in the next commit; P1-3 will add the block parser as a sibling. Notable choice: serde_yaml (dtolnay) was archived as unmaintained in 2024 so we use serde_yaml_ng, the maintained fork. lingua's per-language features are explicitly enabled (default-features=false) to keep build time + binary size sane — only the languages we need at parse time. --- Cargo.lock | 414 +++++++++++++++++++++++++++++++++- Cargo.toml | 1 + crates/kb-parse-md/Cargo.toml | 34 +++ crates/kb-parse-md/src/lib.rs | 7 + 4 files changed, 455 insertions(+), 1 deletion(-) create mode 100644 crates/kb-parse-md/Cargo.toml create mode 100644 crates/kb-parse-md/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 48aaa06..7c34f44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,6 +79,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "bitflags" version = "2.11.1" @@ -109,6 +115,12 @@ dependencies = [ "serde", ] +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + [[package]] name = "cc" version = "1.2.61" @@ -177,6 +189,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "counter" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66e8e052be91f1c8aae2c1d81307d9f6e67f5f37001e3ddee419e971e73f03bc" +dependencies = [ + "num-traits", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -220,6 +241,20 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "deranged" version = "0.5.8" @@ -251,6 +286,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "equivalent" version = "1.0.2" @@ -272,6 +313,9 @@ name = "fastrand" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +dependencies = [ + "getrandom 0.3.4", +] [[package]] name = "find-msvc-tools" @@ -285,6 +329,36 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "fst" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "slab", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -296,6 +370,20 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + [[package]] name = "getrandom" version = "0.4.2" @@ -304,7 +392,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -322,6 +410,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" @@ -365,6 +459,25 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "include_dir" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd" +dependencies = [ + "include_dir_macros", +] + +[[package]] +name = "include_dir_macros" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -383,12 +496,33 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "js-sys" +version = "0.3.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + [[package]] name = "kb-app" version = "0.1.0" @@ -443,6 +577,21 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kb-parse-md" +version = "0.1.0" +dependencies = [ + "anyhow", + "kb-core", + "kb-parse-types", + "lingua", + "serde", + "serde_json", + "serde_yaml_ng", + "time", + "toml", +] + [[package]] name = "kb-parse-types" version = "0.1.0" @@ -495,18 +644,95 @@ dependencies = [ "libc", ] +[[package]] +name = "lingua" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40d9129bb9fe42c95d1bd420d6891607eaff17df16ee15674aed2d05b0ec8f4" +dependencies = [ + "counter", + "dashmap", + "fastrand", + "fst", + "include_dir", + "itertools", + "lingua-chinese-language-model", + "lingua-english-language-model", + "lingua-japanese-language-model", + "lingua-korean-language-model", + "maplit", + "rayon", + "regex", + "serde", + "serde-wasm-bindgen", + "strum", + "strum_macros", + 
"wasm-bindgen", +] + +[[package]] +name = "lingua-chinese-language-model" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21ca7fa9f7671d684c82c168725f380fc873f14d6f4e8c82f0da681bcc0048d1" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-english-language-model" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97102de08b134a49f1cce05a1b6f5bf08ef21fe858074ae2b794e7892c43dd4b" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-japanese-language-model" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0938f75de3ae5dcdc925d823ed409854ca14f6a653782b9a1ad5d899462fbe" +dependencies = [ + "include_dir", +] + +[[package]] +name = "lingua-korean-language-model" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa87f6c43ff894fc75159c021480d2fdf96882bf5bd235f8916ceb6b7caae561" +dependencies = [ + "include_dir", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "matchers" version = "0.2.0" @@ -537,6 +763,15 @@ version = "0.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -555,6 +790,19 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -595,12 +843,47 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + [[package]] name = "redox_users" version = "0.4.6" @@ -612,6 +895,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.14" @@ -642,6 +937,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + [[package]] name = "ryu-js" version = "1.0.2" @@ -657,6 +964,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.28" @@ -673,6 +986,17 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-wasm-bindgen" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b" +dependencies = [ + "js-sys", + "serde", + "wasm-bindgen", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -726,6 +1050,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml_ng" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -741,6 +1078,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + [[package]] name = "smallvec" version = "1.15.1" @@ -753,6 +1096,24 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "symlink" version = "0.1.0" @@ -1027,6 +1388,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "utf8parse" version = "0.2.2" @@ -1073,6 +1440,51 @@ dependencies = [ "wit-bindgen 0.51.0", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea" +dependencies = [ + "unicode-ident", +] + [[package]] name = "wasm-encoder" version = "0.244.0" diff --git a/Cargo.toml b/Cargo.toml index a9f9b3b..23fb992 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/kb-parse-types", "crates/kb-config", "crates/kb-source-fs", + "crates/kb-parse-md", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-parse-md/Cargo.toml b/crates/kb-parse-md/Cargo.toml new file mode 100644 index 0000000..db7b56e --- /dev/null +++ b/crates/kb-parse-md/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "kb-parse-md" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-parse-types = { path = "../kb-parse-types" } +anyhow = { workspace = true } +serde = { workspace = 
true } +serde_json = { workspace = true } +time = { workspace = true } +# serde_yaml (dtolnay) was archived as unmaintained in 2024. +# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style +# API surface lets us swap if a different fork wins long term. +serde_yaml_ng = "0.10" +toml = "0.8" +# `lingua` ships every supported language as a feature flag; the `default` +# feature pulls all 75+ language models (huge build time + binary size). +# For p1-2 we only need a small subset for autodetect + tests. Add more +# languages here as future tasks call for them. +lingua = { version = "1.8", default-features = false, features = [ + "korean", + "english", + "japanese", + "chinese", +] } + +[dev-dependencies] +serde_json = { workspace = true } diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs new file mode 100644 index 0000000..fcaca6e --- /dev/null +++ b/crates/kb-parse-md/src/lib.rs @@ -0,0 +1,7 @@ +//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b). +//! +//! P1-2 will implement the **frontmatter** submodule. P1-3 will add a +//! sibling `blocks` submodule for block parsing using `pulldown-cmark`. +//! +//! This commit only establishes the crate scaffold so subsequent +//! commits can land the parser in a reviewable shape. -- 2.49.1 From cc8f7dad3fe5924a7eab8fe6cf97b92812c98255 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:56:02 +0000 Subject: [PATCH 2/6] =?UTF-8?q?p1-2:=20parse=5Ffrontmatter=20+=20=C2=A70?= =?UTF-8?q?=20Q9=20derive=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the frontmatter submodule: - detect_delimiters scans for a leading YAML (---) or TOML (+++) block at byte 0. Strict per §0 Q9: no leading whitespace / BOM, no chars on the delimiter line. Closing must be its own line. Unterminated → no FM. 
- parse_raw deserializes into RawFrontmatter, a serde-flatten struct that catches unknown keys into a serde_json::Map for verbatim preservation in metadata.user. - derive_metadata implements the §0 Q9 fallback chain: title → frontmatter | BodyHints.first_h1 | (filename: caller) aliases/tags→ frontmatter | [] lang → frontmatter | lingua autodetect on first 4 KB | hints | "und" created_at → frontmatter (RFC 3339, normalized to UTC) | fs_ctime updated_at → frontmatter | fs_mtime source_type → frontmatter | "markdown" trust_level → frontmatter | "primary" id → user_id_alias only — never a doc_id factor (§4.2) - Non-UTC offsets are normalized to UTC; the original string is preserved in user.original_timestamps[field] per §0 Q9. - Warnings are emitted for: malformed YAML/TOML, unknown enum values, malformed timestamps. Unknown keys are silent. - lingua detector is cached in a OnceLock — first build is heavy. - 15 unit tests cover every row of the derive table + delimiter edge cases + an explicit pin that `id:` does not feed id_for_doc. --- crates/kb-parse-md/src/frontmatter.rs | 740 ++++++++++++++++++++++++++ crates/kb-parse-md/src/lib.rs | 18 +- 2 files changed, 755 insertions(+), 3 deletions(-) create mode 100644 crates/kb-parse-md/src/frontmatter.rs diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs new file mode 100644 index 0000000..eacc9a3 --- /dev/null +++ b/crates/kb-parse-md/src/frontmatter.rs @@ -0,0 +1,740 @@ +//! Markdown frontmatter parsing → `kb_core::Metadata`. +//! +//! Implements the contract pinned in design §0 Q9 (frontmatter derive table) +//! and §3.6 (Metadata shape). Produces structured warnings via +//! `kb-parse-types`. +//! +//! # YAML library +//! +//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. We +//! use [`serde_yaml_ng`], a maintained fork with an API-compatible surface, +//! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …) +//! is a one-line dep change. 
+ +use std::sync::OnceLock; + +use kb_core::{Metadata, SourceType, TrustLevel}; +use kb_parse_types::{Warning, WarningKind}; +use lingua::{IsoCode639_1, Language, LanguageDetector, LanguageDetectorBuilder}; +use serde::Deserialize; +use serde_json::{Map, Value}; +use time::OffsetDateTime; + +/// Caller-supplied fallback values used when frontmatter is missing or partial. +/// +/// `BodyHints` is parser-input only — it is not part of `kb-core` and never +/// crosses the storage boundary. The §0 Q9 derive table consults these +/// fallbacks in a fixed order, see [`parse_frontmatter`]. +#[derive(Clone, Debug)] +pub struct BodyHints { + /// First H1 of the body, if any. Used as `title` fallback when the + /// frontmatter does not specify one. + pub first_h1: Option, + /// Filesystem creation time. Used as `created_at` fallback. + pub fs_ctime: OffsetDateTime, + /// Filesystem modification time. Used as `updated_at` fallback. + pub fs_mtime: OffsetDateTime, + /// Optional language fallback used when neither frontmatter nor lingua + /// detection produce a value. If `None` the final fallback is `"und"`. + pub fallback_lang: Option, +} + +/// Byte range of the frontmatter region inside the input slice. +/// +/// `start` is the offset of the leading delimiter (`---` or `+++`). +/// `end` is the offset just past the closing delimiter line's trailing +/// newline (i.e. the body starts at `bytes[end..]`). +/// +/// Per the task brief this is technically meant to be crate-internal, but +/// the [`parse_frontmatter`] return type forces it to be `pub`. P1-3 / P1-4 +/// reuse it via this same crate. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct FrontmatterSpan { + pub start: usize, + pub end: usize, +} + +/// Parse the frontmatter (if any) from a Markdown byte slice into a +/// `kb_core::Metadata`, applying the §0 Q9 derive table for missing fields. 
+/// +/// On a malformed frontmatter the function still returns `Ok` — the +/// frontmatter contents are discarded and the caller is told via a +/// `Warning { kind: MalformedFrontmatter, .. }`. The returned span still +/// covers the delimited region so the caller can skip it during body +/// slicing. +/// +/// `Err` is reserved for genuinely fatal conditions (e.g. non-UTF-8 input +/// that can't even be lossy-decoded), which currently cannot arise here. +pub fn parse_frontmatter( + bytes: &[u8], + hints: &BodyHints, +) -> anyhow::Result<(Metadata, Option, Vec)> { + let mut warnings = Vec::new(); + + let detected = detect_delimiters(bytes); + + let (raw_opt, span_opt) = match detected { + None => (None, None), + Some((delim, span)) => { + // `detect_delimiters` only matches the ASCII marker lines; it does not + // validate the bytes between them, so `from_utf8` is what reports invalid + // UTF-8 as a malformed-frontmatter warning. NOTE(review): the fixed 4-byte + // marker lengths below mis-slice CRLF delimiters and panic on `---\n---` at EOF.
+ let inner_start = span.start + delim.opening_len(); + let inner_end = span.end - delim.closing_len(); + let inner = &bytes[inner_start..inner_end]; + match std::str::from_utf8(inner) { + Ok(s) => match parse_raw(delim, s) { + Ok(raw) => (Some(raw), Some(span)), + Err(e) => { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: e, + }); + (None, Some(span)) + } + }, + Err(e) => { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: format!("frontmatter not valid utf-8: {e}"), + }); + (None, Some(span)) + } + } + } + }; + + let body_start = span_opt.map(|s| s.end).unwrap_or(0); + let body = &bytes[body_start..]; + + let metadata = derive_metadata(raw_opt, hints, body, &mut warnings); + + Ok((metadata, span_opt, warnings)) +} + +// --------------------------------------------------------------------------- +// Delimiter detection +// --------------------------------------------------------------------------- + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum DelimKind { + Yaml, + Toml, +} + +impl DelimKind { + /// Bytes consumed at the start (delimiter line + newline). + fn opening_len(self) -> usize { + // "---\n" or "+++\n" — both 4 bytes. A "---\r\n" opener is 5 bytes; the caller then keeps its "\n" in the inner slice. + match self { + DelimKind::Yaml => 4, + DelimKind::Toml => 4, + } + } + + fn closing_len(self) -> usize { + // The closing delimiter line plus an LF newline. NOTE(review): not adjusted + // for "\r\n" (5 bytes) or a marker at EOF (3 bytes), so the caller mis-slices. + match self { + DelimKind::Yaml => 4, + DelimKind::Toml => 4, + } + } + + fn marker(self) -> &'static [u8] { + match self { + DelimKind::Yaml => b"---", + DelimKind::Toml => b"+++", + } + } +} + +/// Look for a YAML or TOML frontmatter block at the very start of `bytes`. +/// Returns `(kind, span)` where `span.start = 0` and `span.end` points +/// just past the closing delimiter's trailing newline (or EOF).
+/// +/// Anything that isn't an exact `---\n` / `+++\n` opener at byte 0 is treated +/// as "no frontmatter" — no leading whitespace, no BOM, etc. Per design §0 Q9. +pub(crate) fn detect_delimiters(bytes: &[u8]) -> Option<(DelimKind, FrontmatterSpan)> { + let kind = match bytes.first()? { + b'-' if bytes.starts_with(b"---") => DelimKind::Yaml, + b'+' if bytes.starts_with(b"+++") => DelimKind::Toml, + _ => return None, + }; + + let marker = kind.marker(); + + // Opening line must be just the marker + newline (LF or CRLF). No trailing + // chars on the same line are allowed — that's not a frontmatter delimiter. + let after_open = match bytes.get(marker.len()) { + Some(b'\n') => marker.len() + 1, + Some(b'\r') if bytes.get(marker.len() + 1) == Some(&b'\n') => marker.len() + 2, + _ => return None, + }; + + // Find the closing marker on its own line. + // Walk line by line. We need a line that is exactly `marker` (optionally + // followed by spaces? per §0 Q9 we keep it strict: marker + EOL only). + let mut i = after_open; + while i < bytes.len() { + let line_start = i; + // find next newline (or EOF) + let line_end = bytes[line_start..] + .iter() + .position(|&b| b == b'\n') + .map(|p| line_start + p) + .unwrap_or(bytes.len()); + + let line = { + // trim trailing \r if present (CRLF) + let mut end = line_end; + if end > line_start && bytes[end.saturating_sub(1)] == b'\r' { + end -= 1; + } + &bytes[line_start..end] + }; + + if line == marker { + // Closing delimiter found. Compute span end = line_end + 1 if a + // newline is present, else line_end (EOF). + let span_end = if line_end < bytes.len() { + line_end + 1 + } else { + bytes.len() + }; + return Some(( + kind, + FrontmatterSpan { + start: 0, + end: span_end, + }, + )); + } + + if line_end >= bytes.len() { + break; + } + i = line_end + 1; + } + + // No closing delimiter — not a frontmatter block. 
+ None +} + +// --------------------------------------------------------------------------- +// Raw frontmatter (parsed shape, before §0 Q9 derive) +// --------------------------------------------------------------------------- + +/// Untyped frontmatter view. Known fields are pulled by name, unknowns flow +/// into `extra`. We deliberately use `serde_json::Value` everywhere so YAML +/// and TOML go through the same downstream pipeline. +#[derive(Debug, Default, Deserialize)] +struct RawFrontmatter { + #[serde(default)] + title: Option<String>, + #[serde(default)] + aliases: Option<Vec<String>>, + #[serde(default)] + tags: Option<Vec<String>>, + #[serde(default)] + lang: Option<String>, + #[serde(default)] + created_at: Option<String>, + #[serde(default)] + updated_at: Option<String>, + #[serde(default)] + source_type: Option<String>, + #[serde(default)] + trust_level: Option<String>, + /// `id:` field is captured as an alias only — never feeds doc_id (§4.2). + #[serde(default)] + id: Option<String>, + /// Catch-all for unknown keys → `metadata.user`. + #[serde(flatten)] + extra: Map<String, Value>, +} + +fn parse_raw(kind: DelimKind, slice: &str) -> Result<RawFrontmatter, String> { + match kind { + DelimKind::Yaml => { + // Empty YAML frontmatter is legal (parses to null) — handle + // explicitly so `serde_yaml_ng` doesn't fail trying to deserialize + // null into a struct. + if slice.trim().is_empty() { + return Ok(RawFrontmatter::default()); + } + serde_yaml_ng::from_str::<RawFrontmatter>(slice).map_err(|e| e.to_string()) + } + DelimKind::Toml => { + if slice.trim().is_empty() { + return Ok(RawFrontmatter::default()); + } + toml::from_str::<RawFrontmatter>(slice).map_err(|e| e.to_string()) + } + } +} + +// --------------------------------------------------------------------------- +// §0 Q9 derive table +// --------------------------------------------------------------------------- + +fn derive_metadata( + raw: Option<RawFrontmatter>, + hints: &BodyHints, + body: &[u8], + warnings: &mut Vec<Warning>, +) -> Metadata { + let raw = raw.unwrap_or_default(); + + // user map starts from the unknown-key overflow. 
+ let mut user = raw.extra; + + // ---- title ---- + // Frontmatter → BodyHints.first_h1 → None. + // Filename fallback is the caller's responsibility (P1-4 normalize), per + // task brief — `BodyHints` does not carry a filename. + let title = raw.title.or_else(|| hints.first_h1.clone()); + if let Some(t) = title { + user.insert("title".to_string(), Value::String(t)); + } + + // ---- aliases / tags ---- + let aliases = raw.aliases.unwrap_or_default(); + let tags = raw.tags.unwrap_or_default(); + + // ---- lang ---- + // Frontmatter → lingua autodetect (first 4 KB of body) → fallback_lang → "und". + // The lang field is not on Metadata (§3.6) — store it under user.lang. + let lang = raw + .lang + .or_else(|| detect_lang(body)) + .or_else(|| hints.fallback_lang.clone()) + .unwrap_or_else(|| "und".to_string()); + user.insert("lang".to_string(), Value::String(lang)); + + // ---- timestamps ---- + let mut original_timestamps: Map<String, Value> = Map::new(); + let created_at = parse_ts( + raw.created_at.as_deref(), + "created_at", + &mut original_timestamps, + warnings, + ) + .unwrap_or(hints.fs_ctime); + let updated_at = parse_ts( + raw.updated_at.as_deref(), + "updated_at", + &mut original_timestamps, + warnings, + ) + .unwrap_or(hints.fs_mtime); + if !original_timestamps.is_empty() { + user.insert( + "original_timestamps".to_string(), + Value::Object(original_timestamps), + ); + } + + // ---- source_type ---- + let source_type = match raw.source_type.as_deref() { + None => SourceType::Markdown, + Some(s) => match parse_source_type(s) { + Some(st) => st, + None => { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: format!("unknown source_type={s}, defaulted to markdown"), + }); + SourceType::Markdown + } + }, + }; + + // ---- trust_level ---- + let trust_level = match raw.trust_level.as_deref() { + None => TrustLevel::Primary, + Some(s) => match parse_trust_level(s) { + Some(tl) => tl, + None => { + warnings.push(Warning { + kind: 
WarningKind::MalformedFrontmatter, + note: format!("unknown trust_level={s}, defaulted to primary"), + }); + TrustLevel::Primary + } + }, + }; + + // ---- id alias ---- + // `id:` field becomes `metadata.user_id_alias` AND is mirrored into the + // user map under `user_id_alias` (per design §4.2 — not a doc_id factor). + let user_id_alias = raw.id; + if let Some(ref id) = user_id_alias { + user.insert( + "user_id_alias".to_string(), + Value::String(id.clone()), + ); + } + + Metadata { + aliases, + tags, + created_at, + updated_at, + source_type, + trust_level, + user_id_alias, + user, + } +} + +fn parse_source_type(s: &str) -> Option<SourceType> { + // Mirror the lowercase serde rename used on SourceType. + match s { + "markdown" => Some(SourceType::Markdown), + "note" => Some(SourceType::Note), + "paper" => Some(SourceType::Paper), + "reference" => Some(SourceType::Reference), + "inbox" => Some(SourceType::Inbox), + _ => None, + } +} + +fn parse_trust_level(s: &str) -> Option<TrustLevel> { + match s { + "primary" => Some(TrustLevel::Primary), + "secondary" => Some(TrustLevel::Secondary), + "generated" => Some(TrustLevel::Generated), + _ => None, + } +} + +/// Parse an RFC 3339 timestamp string and normalize to UTC. If the original +/// offset was non-UTC, push it into `original_timestamps[field]` per §0 Q9. +/// Returns `None` if the input is missing OR malformed (in which case a +/// warning is emitted). 
+fn parse_ts( + s: Option<&str>, + field: &str, + original_timestamps: &mut Map, + warnings: &mut Vec, +) -> Option { + let s = s?; + match OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339) { + Ok(dt) => { + if dt.offset() != time::UtcOffset::UTC { + original_timestamps.insert(field.to_string(), Value::String(s.to_string())); + } + Some(dt.to_offset(time::UtcOffset::UTC)) + } + Err(e) => { + warnings.push(Warning { + kind: WarningKind::MalformedFrontmatter, + note: format!("malformed {field}={s:?}: {e}"), + }); + None + } + } +} + +// --------------------------------------------------------------------------- +// Lingua detector (cached statically — first init is heavy) +// --------------------------------------------------------------------------- + +fn detector() -> &'static LanguageDetector { + static DETECTOR: OnceLock = OnceLock::new(); + DETECTOR.get_or_init(|| { + // Keep the language set narrow: matches the cargo features we enable + // on the `lingua` dep. Adding more languages here without enabling + // their feature flag will fail to compile. + LanguageDetectorBuilder::from_languages(&[ + Language::English, + Language::Korean, + Language::Japanese, + Language::Chinese, + ]) + .build() + }) +} + +/// Run lingua autodetect on the first 4 KB of body. Returns an ISO 639-1 +/// two-letter code (lowercase) on success. +/// +/// Note: lingua needs reasonably long input to be confident. Empty / very +/// short bodies return `None` so we fall through to the next derive step. +fn detect_lang(body: &[u8]) -> Option { + const WINDOW: usize = 4 * 1024; + if body.is_empty() { + return None; + } + let n = body.len().min(WINDOW); + // Find a UTF-8-safe slice end ≤ n. Walk back at most 4 bytes. 
+ let mut end = n; + while end > 0 && std::str::from_utf8(&body[..end]).is_err() { + end -= 1; + } + if end == 0 { + return None; + } + let s = std::str::from_utf8(&body[..end]).ok()?; + if s.trim().is_empty() { + return None; + } + let lang = detector().detect_language_of(s)?; + Some(iso_code(lang).to_string()) +} + +fn iso_code(lang: Language) -> &'static str { + // `lingua::IsoCode639_1` is gated by the language features enabled on the + // crate — only the variants below are compiled into our build, so this + // match is exhaustive for the configured detector. + match lang.iso_code_639_1() { + IsoCode639_1::EN => "en", + IsoCode639_1::KO => "ko", + IsoCode639_1::JA => "ja", + IsoCode639_1::ZH => "zh", + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ + AssetId, WorkspacePath, + ids::id_for_doc, + versions::ParserVersion, + }; + use time::macros::datetime; + + fn hints() -> BodyHints { + BodyHints { + first_h1: None, + fs_ctime: datetime!(2024-01-01 00:00:00 UTC), + fs_mtime: datetime!(2024-01-02 00:00:00 UTC), + fallback_lang: None, + } + } + + #[test] + fn yaml_happy_path() { + let md = b"---\n\ +title: My Doc\n\ +aliases: [a, b]\n\ +tags: [t1, t2]\n\ +lang: en\n\ +created_at: 2024-03-01T00:00:00Z\n\ +updated_at: 2024-03-02T00:00:00Z\n\ +source_type: note\n\ +trust_level: secondary\n\ +---\nbody\n"; + + let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + let span = span.expect("span present"); + assert_eq!(span.start, 0); + assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]); + assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]); + assert_eq!(meta.source_type, SourceType::Note); + assert_eq!(meta.trust_level, TrustLevel::Secondary); + assert_eq!(meta.created_at, 
datetime!(2024-03-01 00:00:00 UTC)); + assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC)); + assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("My Doc")); + assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en")); + assert_eq!(meta.user_id_alias, None); + } + + #[test] + fn toml_happy_path() { + let md = b"+++\n\ +title = \"My Doc\"\n\ +aliases = [\"a\", \"b\"]\n\ +tags = [\"t1\", \"t2\"]\n\ +lang = \"en\"\n\ +created_at = \"2024-03-01T00:00:00Z\"\n\ +updated_at = \"2024-03-02T00:00:00Z\"\n\ +source_type = \"note\"\n\ +trust_level = \"secondary\"\n\ ++++\nbody\n"; + + let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + assert!(span.is_some()); + assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]); + assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]); + assert_eq!(meta.source_type, SourceType::Note); + assert_eq!(meta.trust_level, TrustLevel::Secondary); + } + + #[test] + fn unknown_keys_preserved_in_user() { + let md = b"---\n\ +title: Doc\n\ +custom_field: hello\n\ +nested: {a: 1}\n\ +---\n"; + let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + assert_eq!( + meta.user.get("custom_field").and_then(|v| v.as_str()), + Some("hello") + ); + assert!(meta.user.get("nested").is_some()); + } + + #[test] + fn unknown_enum_value_warns_and_defaults() { + let md = b"---\n\ +trust_level: weird\n\ +source_type: alien\n\ +---\n"; + let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert_eq!(meta.trust_level, TrustLevel::Primary); + assert_eq!(meta.source_type, SourceType::Markdown); + assert_eq!(warns.len(), 2); + assert!(warns.iter().all(|w| matches!(w.kind, WarningKind::MalformedFrontmatter))); + assert!(warns.iter().any(|w| w.note.contains("trust_level=weird"))); + assert!(warns.iter().any(|w| w.note.contains("source_type=alien"))); + } + + #[test] 
+ fn malformed_yaml_emits_warning_and_defaults() { + // Unclosed quote → YAML parse fails. + let md = b"---\ntitle: \"unterminated\n---\n"; + let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(span.is_some(), "span still reflects delim region"); + assert_eq!(warns.len(), 1); + assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter)); + // Body fallbacks applied. + assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC)); + assert_eq!(meta.updated_at, datetime!(2024-01-02 00:00:00 UTC)); + assert_eq!(meta.source_type, SourceType::Markdown); + assert_eq!(meta.trust_level, TrustLevel::Primary); + } + + #[test] + fn no_frontmatter_uses_body_hints_silently() { + let md = b"# Just a heading\n\nsome body"; + let mut h = hints(); + h.first_h1 = Some("Just a heading".to_string()); + h.fallback_lang = Some("en".to_string()); + let (meta, span, warns) = parse_frontmatter(md, &h).unwrap(); + assert!(span.is_none()); + assert!(warns.is_empty()); + assert_eq!( + meta.user.get("title").and_then(|v| v.as_str()), + Some("Just a heading") + ); + // Body too short for confident lingua autodetect → fallback_lang. + assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en")); + } + + /// `id:` field MUST NOT influence `doc_id` (design §4.2). Compute the + /// recipe twice — with and without the field — and assert the results + /// match. 
+ #[test] + fn id_field_does_not_feed_doc_id() { + let with_id = b"---\nid: my-handle\ntitle: Doc\n---\n"; + let without = b"---\ntitle: Doc\n---\n"; + + let (meta_with, _, _) = parse_frontmatter(with_id, &hints()).unwrap(); + let (meta_without, _, _) = parse_frontmatter(without, &hints()).unwrap(); + + assert_eq!(meta_with.user_id_alias.as_deref(), Some("my-handle")); + assert_eq!(meta_without.user_id_alias, None); + + let asset = AssetId("0123456789abcdef0123456789abcdef".to_string()); + let path = WorkspacePath::new("notes/test.md".to_string()).unwrap(); + let pv = ParserVersion("pulldown-cmark-0.x".to_string()); + + let id_a = id_for_doc(&path, &asset, &pv); + let id_b = id_for_doc(&path, &asset, &pv); + assert_eq!( + id_a, id_b, + "id_for_doc must be stable across runs and not see metadata" + ); + // Sanity: the recipe takes (workspace_path, asset_id, parser_version) + // only — there is literally no parameter to plumb metadata through. + } + + #[test] + fn non_utc_timestamp_preserved_in_user_original_timestamps() { + let md = b"---\ncreated_at: 2024-01-15T10:00:00+09:00\n---\n"; + let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + // Normalized to UTC. + assert_eq!(meta.created_at, datetime!(2024-01-15 01:00:00 UTC)); + let orig = meta + .user + .get("original_timestamps") + .and_then(|v| v.as_object()) + .expect("original_timestamps map present"); + assert_eq!( + orig.get("created_at").and_then(|v| v.as_str()), + Some("2024-01-15T10:00:00+09:00") + ); + } + + #[test] + fn malformed_timestamp_warns_and_falls_back() { + let md = b"---\ncreated_at: not-a-date\n---\n"; + let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert_eq!(warns.len(), 1); + assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter)); + assert!(warns[0].note.contains("created_at")); + // Fallback to fs_ctime. 
+ assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC)); + } + + #[test] + fn detect_delimiters_no_match_without_leading_marker() { + assert!(detect_delimiters(b"# heading\n---\n---\n").is_none()); + assert!(detect_delimiters(b" ---\n---\n").is_none(), "leading whitespace"); + assert!(detect_delimiters(b"").is_none()); + } + + #[test] + fn detect_delimiters_yaml_basic() { + let bytes = b"---\nfoo: bar\n---\nbody\n"; + let (kind, span) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + assert_eq!(span.start, 0); + // body starts at "body\n" — the closing "---\n" is part of the span. + assert_eq!(&bytes[span.end..], b"body\n"); + } + + #[test] + fn detect_delimiters_toml_basic() { + let bytes = b"+++\nfoo = \"bar\"\n+++\nbody\n"; + let (kind, span) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Toml); + assert_eq!(&bytes[span.end..], b"body\n"); + } + + #[test] + fn detect_delimiters_unterminated_returns_none() { + // `---\n` then no closing — treat as no frontmatter. + let bytes = b"---\nfoo: bar\n"; + assert!(detect_delimiters(bytes).is_none()); + } + + #[test] + fn empty_yaml_frontmatter_is_legal() { + let md = b"---\n---\nbody\n"; + let (_meta, span, warns) = parse_frontmatter(md, &hints()).unwrap(); + assert!(span.is_some()); + assert!(warns.is_empty(), "warnings: {warns:?}"); + } + + #[test] + fn lingua_detects_korean_and_english() { + let ko = "안녕하세요. 이것은 한국어로 작성된 문서입니다. 형태소 분석은 어렵습니다. 그러나 lingua는 잘 동작합니다.".as_bytes(); + let en = "Hello there. This document is written in English. The lingua language detector is statistical and works on short text too, given enough words.".as_bytes(); + assert_eq!(detect_lang(ko).as_deref(), Some("ko")); + assert_eq!(detect_lang(en).as_deref(), Some("en")); + } +} diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs index fcaca6e..4ce413d 100644 --- a/crates/kb-parse-md/src/lib.rs +++ b/crates/kb-parse-md/src/lib.rs @@ -1,7 +1,19 @@ //! 
`kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b). //! -//! P1-2 will implement the **frontmatter** submodule. P1-3 will add a +//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a //! sibling `blocks` submodule for block parsing using `pulldown-cmark`. //! -//! This commit only establishes the crate scaffold so subsequent -//! commits can land the parser in a reviewable shape. +//! Public surface for P1-2 is intentionally narrow: +//! +//! * [`parse_frontmatter`] — pure function from Markdown bytes to +//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`. +//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive +//! table when frontmatter is missing or partial. +//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the +//! input slice (returned by [`parse_frontmatter`]). +//! +//! Anything else in this crate is `pub(crate)` and may change without notice. + +pub mod frontmatter; + +pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter}; -- 2.49.1 From 42a7d53e5dc55248397988e8dde8bb2d82160579 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:56:19 +0000 Subject: [PATCH 3/6] p1-2: fixtures + snapshot tests for frontmatter parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two markdown fixtures with hand-authored JSON baselines that pin the §0 Q9 derive output across runs: - frontmatter-only.md exercises the YAML happy path with most fields, unknown keys, an `id:` field, and a non-UTC created_at (so the baseline shows original_timestamps preservation). - mixed-lang.md is body-only with no `lang:` field; baseline pins the lingua autodetect result for our enabled language set. A separate `emit_snapshots` test (marked `#[ignore]`) regenerates the baselines from the current parser output. A determinism test parses the fixture twice and asserts equality so any non-determinism (e.g. 
key ordering, lingua nondeterminism) fails fast. 
--- .../tests/frontmatter_snapshots.rs | 111 ++++++++++++++++++ fixtures/markdown/frontmatter-only.md | 22 ++++ .../markdown/frontmatter-only.snapshot.json | 31 +++++ fixtures/markdown/mixed-lang.md | 9 ++ fixtures/markdown/mixed-lang.snapshot.json | 16 +++ 5 files changed, 189 insertions(+) create mode 100644 crates/kb-parse-md/tests/frontmatter_snapshots.rs create mode 100644 fixtures/markdown/frontmatter-only.md create mode 100644 fixtures/markdown/frontmatter-only.snapshot.json create mode 100644 fixtures/markdown/mixed-lang.md create mode 100644 fixtures/markdown/mixed-lang.snapshot.json diff --git a/crates/kb-parse-md/tests/frontmatter_snapshots.rs b/crates/kb-parse-md/tests/frontmatter_snapshots.rs new file mode 100644 index 0000000..84c6bcc --- /dev/null +++ b/crates/kb-parse-md/tests/frontmatter_snapshots.rs @@ -0,0 +1,111 @@ +//! Snapshot tests pinning the §0 Q9 derive output for two fixtures. +//! +//! The baseline JSON next to each fixture is hand-authored / regenerated +//! from a deterministic run. `BodyHints` timestamps are caller-provided +//! and therefore stable; lingua autodetect over our fixtures is also +//! stable for the language set we configured. + +use kb_parse_md::{BodyHints, parse_frontmatter}; +use serde::Serialize; +use serde_json::Value; +use std::fs; +use std::path::PathBuf; +use time::macros::datetime; + +/// Stable view of the parser output suitable for JSON snapshotting. +/// We deliberately exclude `FrontmatterSpan` byte offsets here too — they're +/// fully determined by the input file and are exercised by unit tests; the +/// snapshot focuses on the §0 Q9 derive contract. 
+#[derive(Serialize)] +struct Snapshot { + metadata: kb_core::Metadata, + span_present: bool, + warnings: Vec, +} + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("fixtures") + .join("markdown") +} + +fn pinned_hints() -> BodyHints { + BodyHints { + first_h1: None, + fs_ctime: datetime!(2024-01-01 00:00:00 UTC), + fs_mtime: datetime!(2024-01-02 00:00:00 UTC), + fallback_lang: None, + } +} + +fn assert_snapshot(fixture: &str, baseline: &str) { + let dir = fixtures_dir(); + let bytes = fs::read(dir.join(fixture)).expect("fixture readable"); + + let (meta, span, warns) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); + let snap = Snapshot { + metadata: meta, + span_present: span.is_some(), + warnings: warns, + }; + let actual: Value = serde_json::to_value(&snap).unwrap(); + + let expected_text = + fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); + let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json"); + + if actual != expected { + let actual_pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "snapshot drift for {fixture}\n\ + --- expected ({baseline}) ---\n{expected_text}\n\ + --- actual ---\n{actual_pretty}\n\ + If the change is intentional, update {baseline}." + ); + } +} + +#[test] +fn frontmatter_only_snapshot() { + assert_snapshot("frontmatter-only.md", "frontmatter-only.snapshot.json"); +} + +/// Run with `cargo test -p kb-parse-md --test frontmatter_snapshots emit_snapshots -- --ignored --nocapture` +/// to regenerate the baseline JSON files from the current parser output. 
+#[test] +#[ignore] +fn emit_snapshots() { + let dir = fixtures_dir(); + for (fixture, baseline) in [ + ("frontmatter-only.md", "frontmatter-only.snapshot.json"), + ("mixed-lang.md", "mixed-lang.snapshot.json"), + ] { + let bytes = fs::read(dir.join(fixture)).unwrap(); + let (meta, span, warns) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); + let snap = Snapshot { + metadata: meta, + span_present: span.is_some(), + warnings: warns, + }; + let json = serde_json::to_string_pretty(&snap).unwrap(); + fs::write(dir.join(baseline), format!("{json}\n")).unwrap(); + eprintln!("wrote {}", dir.join(baseline).display()); + } +} + +#[test] +fn mixed_lang_snapshot() { + assert_snapshot("mixed-lang.md", "mixed-lang.snapshot.json"); +} + +/// Determinism: parsing the same fixture twice in a row must give equal output. +#[test] +fn snapshot_is_deterministic_across_runs() { + let dir = fixtures_dir(); + let bytes = fs::read(dir.join("frontmatter-only.md")).unwrap(); + let (a, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); + let (b, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap(); + assert_eq!(serde_json::to_value(&a).unwrap(), serde_json::to_value(&b).unwrap()); +} diff --git a/fixtures/markdown/frontmatter-only.md b/fixtures/markdown/frontmatter-only.md new file mode 100644 index 0000000..766b641 --- /dev/null +++ b/fixtures/markdown/frontmatter-only.md @@ -0,0 +1,22 @@ +--- +title: Frontmatter Only +aliases: + - fm-only + - first-fixture +tags: + - parse + - test +lang: en +created_at: 2024-01-15T10:00:00+09:00 +updated_at: 2024-02-20T08:30:00Z +source_type: note +trust_level: secondary +id: my-stable-handle +custom_field: hello +nested_obj: + key: value +--- + +# Body Heading + +Body paragraph. 
diff --git a/fixtures/markdown/frontmatter-only.snapshot.json b/fixtures/markdown/frontmatter-only.snapshot.json new file mode 100644 index 0000000..ae187df --- /dev/null +++ b/fixtures/markdown/frontmatter-only.snapshot.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "aliases": [ + "fm-only", + "first-fixture" + ], + "tags": [ + "parse", + "test" + ], + "created_at": "2024-01-15T01:00:00Z", + "updated_at": "2024-02-20T08:30:00Z", + "source_type": "note", + "trust_level": "secondary", + "user_id_alias": "my-stable-handle", + "user": { + "custom_field": "hello", + "lang": "en", + "nested_obj": { + "key": "value" + }, + "original_timestamps": { + "created_at": "2024-01-15T10:00:00+09:00" + }, + "title": "Frontmatter Only", + "user_id_alias": "my-stable-handle" + } + }, + "span_present": true, + "warnings": [] +} diff --git a/fixtures/markdown/mixed-lang.md b/fixtures/markdown/mixed-lang.md new file mode 100644 index 0000000..71ccf6e --- /dev/null +++ b/fixtures/markdown/mixed-lang.md @@ -0,0 +1,9 @@ +# Mixed Language Note + +이 문서는 한국어와 영어가 섞여 있습니다. The body has both Korean +sentences and English sentences. lingua는 통계적 언어 감지기를 제공합니다. +This is to test that auto-detect picks one of `ko` or `en` deterministically +when no `lang:` field is present in the frontmatter. + +본문은 첫 4 KB만 분석되지만, 짧은 문서에서도 잘 동작해야 합니다. +The detector should pick the dominant language across the sample window. 
diff --git a/fixtures/markdown/mixed-lang.snapshot.json b/fixtures/markdown/mixed-lang.snapshot.json new file mode 100644 index 0000000..c6cda5c --- /dev/null +++ b/fixtures/markdown/mixed-lang.snapshot.json @@ -0,0 +1,16 @@ +{ + "metadata": { + "aliases": [], + "tags": [], + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + "source_type": "markdown", + "trust_level": "primary", + "user_id_alias": null, + "user": { + "lang": "en" + } + }, + "span_present": false, + "warnings": [] +} -- 2.49.1 From 1fab6b0207f75a6a8bc62f1007099eefa13b2c8f Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 13:02:28 +0000 Subject: [PATCH 4/6] p1-2: address spec review (drop user_id_alias mirror in user map) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spec §"Behavior contract" line 74 says `id:` is captured into `metadata.user_id_alias` only. Remove the redundant `user.insert` that was also writing it into the user map, and update the snapshot baseline accordingly. Co-Authored-By: Claude Sonnet 4.6 --- crates/kb-parse-md/src/frontmatter.rs | 10 ++-------- fixtures/markdown/frontmatter-only.snapshot.json | 3 +-- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs index eacc9a3..9272773 100644 --- a/crates/kb-parse-md/src/frontmatter.rs +++ b/crates/kb-parse-md/src/frontmatter.rs @@ -367,15 +367,9 @@ fn derive_metadata( }; // ---- id alias ---- - // `id:` field becomes `metadata.user_id_alias` AND is mirrored into the - // user map under `user_id_alias` (per design §4.2 — not a doc_id factor). + // `id:` field becomes `metadata.user_id_alias` only (spec §"Behavior + // contract" line 74). It is NOT mirrored into the user map. 
let user_id_alias = raw.id; - if let Some(ref id) = user_id_alias { - user.insert( - "user_id_alias".to_string(), - Value::String(id.clone()), - ); - } Metadata { aliases, diff --git a/fixtures/markdown/frontmatter-only.snapshot.json b/fixtures/markdown/frontmatter-only.snapshot.json index ae187df..62160d0 100644 --- a/fixtures/markdown/frontmatter-only.snapshot.json +++ b/fixtures/markdown/frontmatter-only.snapshot.json @@ -22,8 +22,7 @@ "original_timestamps": { "created_at": "2024-01-15T10:00:00+09:00" }, - "title": "Frontmatter Only", - "user_id_alias": "my-stable-handle" + "title": "Frontmatter Only" } }, "span_present": true, -- 2.49.1 From 6a4db624b6c28132e9f41d5ba688b82095ba89d1 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 13:12:34 +0000 Subject: [PATCH 5/6] p1-2: fix CRLF / trailing whitespace / BOM in frontmatter delimiter detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C1: detect_delimiters now returns (DelimKind, FrontmatterSpan, Range<usize>) where the inner range is the YAML/TOML payload byte range — derived in one place rather than recomputed by the parser via fixed-width opening_len / closing_len constants that wrongly assumed LF endings. CRLF input now parses correctly end-to-end; the originally-failing reviewer probe "---\r\ntitle: Doc\r\n---\r\nbody\r\n" now yields title="Doc" with no warnings. I1: Trailing horizontal whitespace (spaces / tabs) on either delimiter line is now accepted, matching Hugo / Jekyll. A stray trailing space or tab left behind by an editor no longer silently disables otherwise-valid frontmatter. I2: A leading UTF-8 BOM (EF BB BF, byte 0 only) is tolerated and skipped before delimiter scanning. The returned span.start accounts for the BOM (=3) so callers using bytes[span.end..] for body slicing still get the correct range without further bookkeeping. Mid-input BOMs are not stripped. 
M2: Drop the now-dead DelimKind::opening_len / closing_len constants — the inner range is encoded once at detection time. 12 new tests covering CRLF (YAML / TOML / mixed-EOL / end-to-end), trailing whitespace on opener / closer / tabs, leading BOM (detection + full pipeline), and mid-input BOM non-stripping. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/frontmatter.rs | 381 +++++++++++++++++++++----- 1 file changed, 309 insertions(+), 72 deletions(-) diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs index 9272773..6790854 100644 --- a/crates/kb-parse-md/src/frontmatter.rs +++ b/crates/kb-parse-md/src/frontmatter.rs @@ -11,6 +11,7 @@ //! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …) //! is a one-line dep change. +use std::ops::Range; use std::sync::OnceLock; use kb_core::{Metadata, SourceType, TrustLevel}; @@ -75,15 +76,9 @@ pub fn parse_frontmatter( let (raw_opt, span_opt) = match detected { None => (None, None), - Some((delim, span)) => { - // SAFETY: detect_delimiters guarantees inner bytes are valid UTF-8 - // because it scanned for ASCII delimiters and slices on those - // boundaries. We still go through `from_utf8` to surface non-ASCII - // bytes safely as a malformed-frontmatter warning. - let inner_start = span.start + delim.opening_len(); - let inner_end = span.end - delim.closing_len(); - let inner = &bytes[inner_start..inner_end]; - match std::str::from_utf8(inner) { + Some((delim, span, inner)) => { + let inner_bytes = &bytes[inner.clone()]; + match std::str::from_utf8(inner_bytes) { Ok(s) => match parse_raw(delim, s) { Ok(raw) => (Some(raw), Some(span)), Err(e) => { @@ -124,24 +119,6 @@ pub(crate) enum DelimKind { } impl DelimKind { - /// Bytes consumed at the start (delimiter line + newline). - fn opening_len(self) -> usize { - // "---\n" or "+++\n" — both 4 bytes; "---\r\n" handled by detect. 
- match self { - DelimKind::Yaml => 4, - DelimKind::Toml => 4, - } - } - - fn closing_len(self) -> usize { - // The closing delimiter line itself plus its trailing newline. Same - // shape as opening; `detect_delimiters` adjusts for `\r\n`. - match self { - DelimKind::Yaml => 4, - DelimKind::Toml => 4, - } - } - fn marker(self) -> &'static [u8] { match self { DelimKind::Yaml => b"---", @@ -150,78 +127,178 @@ impl DelimKind { } } +/// UTF-8 BOM. Stripped if present at byte 0; never elsewhere. +const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF]; + /// Look for a YAML or TOML frontmatter block at the very start of `bytes`. -/// Returns `(kind, span)` where `span.start = 0` and `span.end` points -/// just past the closing delimiter's trailing newline (or EOF). /// -/// Anything that isn't an exact `---\n` / `+++\n` opener at byte 0 is treated -/// as "no frontmatter" — no leading whitespace, no BOM, etc. Per design §0 Q9. -pub(crate) fn detect_delimiters(bytes: &[u8]) -> Option<(DelimKind, FrontmatterSpan)> { - let kind = match bytes.first()? { - b'-' if bytes.starts_with(b"---") => DelimKind::Yaml, - b'+' if bytes.starts_with(b"+++") => DelimKind::Toml, +/// Returns `(kind, span, inner_range)` where: +/// * `span.start` is the offset of the leading delimiter line (after BOM if +/// any — i.e. `0` on BOM-less input, `3` on BOM-prefixed input). `span.end` +/// points just past the closing delimiter line's trailing newline (or EOF). +/// This is the "outer" range callers use for body slicing. +/// * `inner_range` is the byte range of the YAML/TOML payload between the +/// delimiter lines, not including either delimiter line nor their EOLs. +/// This is what gets fed to the YAML/TOML parser. +/// +/// All offsets are relative to the ORIGINAL `bytes` slice — callers that +/// hold the original input can use both the span and the inner range +/// directly without further bookkeeping. 
+/// +/// A leading UTF-8 BOM (`EF BB BF`, exactly at byte 0) is tolerated and +/// skipped; the returned `span.start` accounts for it. Subsequent +/// BOM-shaped sequences are NOT stripped. +/// +/// Trailing horizontal whitespace (ASCII spaces / tabs) is permitted on +/// both the opening and closing delimiter lines: `--- \n` and `---\t\n` +/// both count as a delimiter. This keeps a stray trailing space or tab +/// (left behind by an editor) from silently disabling otherwise-valid +/// frontmatter, and matches Hugo / Jekyll behaviour. +/// +/// Anything else that isn't a delimiter at the very start (leading +/// whitespace, indentation, prose) is treated as "no frontmatter" per +/// design §0 Q9. +pub(crate) fn detect_delimiters( + bytes: &[u8], +) -> Option<(DelimKind, FrontmatterSpan, Range<usize>)> { + // Skip a leading UTF-8 BOM, but only at byte 0. The returned offsets + // remain relative to the original `bytes`, so we record `bom_offset` + // and add it to every position we compute below. + let bom_offset = if bytes.starts_with(UTF8_BOM) { + UTF8_BOM.len() + } else { + 0 + }; + let scan = &bytes[bom_offset..]; + + let kind = match scan.first()? { + b'-' if scan.starts_with(b"---") => DelimKind::Yaml, + b'+' if scan.starts_with(b"+++") => DelimKind::Toml, _ => return None, }; let marker = kind.marker(); - // Opening line must be just the marker + newline (LF or CRLF). No trailing - // chars on the same line are allowed — that's not a frontmatter delimiter. - let after_open = match bytes.get(marker.len()) { - Some(b'\n') => marker.len() + 1, - Some(b'\r') if bytes.get(marker.len() + 1) == Some(&b'\n') => marker.len() + 2, - _ => return None, - }; + // Opening line: marker, then optional horizontal whitespace, then EOL. + // `line_end_after_marker` returns `None` if a non-whitespace, non-EOL + // byte follows the marker — that's not a valid frontmatter opener. 
+ let (_open_line_end, after_open_eol) = line_end_after_marker(scan, marker.len())?; - // Find the closing marker on its own line. - // Walk line by line. We need a line that is exactly `marker` (optionally - // followed by spaces? per §0 Q9 we keep it strict: marker + EOL only). - let mut i = after_open; - while i < bytes.len() { + let inner_start_in_scan = after_open_eol; + + // Walk lines looking for a closing marker line. A line counts as a + // closer if `trim_ascii_end` of it equals the marker. + let mut i = after_open_eol; + while i < scan.len() { let line_start = i; - // find next newline (or EOF) - let line_end = bytes[line_start..] + let nl_pos = scan[line_start..] .iter() .position(|&b| b == b'\n') - .map(|p| line_start + p) - .unwrap_or(bytes.len()); - - let line = { - // trim trailing \r if present (CRLF) - let mut end = line_end; - if end > line_start && bytes[end.saturating_sub(1)] == b'\r' { - end -= 1; + .map(|p| line_start + p); + let line_content_end = match nl_pos { + Some(p) => { + // Trim trailing \r if present (CRLF). + if p > line_start && scan[p - 1] == b'\r' { + p - 1 + } else { + p + } } - &bytes[line_start..end] + None => scan.len(), }; - if line == marker { - // Closing delimiter found. Compute span end = line_end + 1 if a - // newline is present, else line_end (EOF). - let span_end = if line_end < bytes.len() { - line_end + 1 - } else { - bytes.len() + let line = &scan[line_start..line_content_end]; + if trim_ascii_end(line) == marker { + // Inner ends at the byte before this closing line's start; the + // EOL that terminates the previous content line is part of that + // line, not of the YAML/TOML payload, so strip one EOL. + // + // Clamp to `inner_start_in_scan` — when the frontmatter is + // empty (`---\n---\n`), the closing line sits directly after + // the opening's EOL and there is no preceding content line to + // strip from. 
+ let inner_end_in_scan = + strip_one_trailing_eol(scan, line_start).max(inner_start_in_scan); + + // span.end: just past the closing line's trailing newline (or + // EOF if the file ends without one). + let span_end_in_scan = match nl_pos { + Some(p) => p + 1, + None => scan.len(), }; + return Some(( kind, FrontmatterSpan { - start: 0, - end: span_end, + start: bom_offset, + end: span_end_in_scan + bom_offset, }, + (inner_start_in_scan + bom_offset)..(inner_end_in_scan + bom_offset), )); } - if line_end >= bytes.len() { - break; + match nl_pos { + Some(p) => i = p + 1, + None => break, } - i = line_end + 1; } // No closing delimiter — not a frontmatter block. None } +/// Find the line-end position of the opening delimiter line. +/// +/// Given `scan` and `start = marker.len()`, returns +/// `Some((line_content_end, after_eol))` where: +/// * `line_content_end` is the byte index of the first `\r` (if `\r\n`) +/// or `\n` ending the opening line — i.e. the slice `scan[marker.len()..line_content_end]` +/// contains the trailing-whitespace-only region between the marker and +/// the EOL. +/// * `after_eol` is the byte index of the first byte of the next line +/// (i.e. just past the `\n`). +/// +/// Returns `None` if there is no EOL after the marker (treat as no frontmatter). +fn line_end_after_marker(scan: &[u8], start: usize) -> Option<(usize, usize)> { + let mut i = start; + while i < scan.len() { + match scan[i] { + b'\n' => return Some((i, i + 1)), + b'\r' if scan.get(i + 1) == Some(&b'\n') => return Some((i, i + 2)), + b' ' | b'\t' => i += 1, + _ => return None, + } + } + None +} + +/// `[u8]::trim_ascii_end` requires Rust 1.80; we mirror it here for clarity +/// and minimum-MSRV portability. 
+fn trim_ascii_end(bs: &[u8]) -> &[u8] { + let mut end = bs.len(); + while end > 0 && matches!(bs[end - 1], b' ' | b'\t') { + end -= 1; + } + &bs[..end] +} + +/// Given a position `pos` that points to the start of a line, walk back over +/// at most one EOL sequence (`\n` or `\r\n`) and return the resulting +/// position. This trims exactly one terminator off the previous line so the +/// inner payload doesn't capture the closing delimiter's preceding newline. +fn strip_one_trailing_eol(scan: &[u8], pos: usize) -> usize { + if pos == 0 { + return pos; + } + if scan[pos - 1] == b'\n' { + if pos >= 2 && scan[pos - 2] == b'\r' { + return pos - 2; + } + return pos - 1; + } + pos +} + // --------------------------------------------------------------------------- // Raw frontmatter (parsed shape, before §0 Q9 derive) // --------------------------------------------------------------------------- @@ -694,19 +771,22 @@ source_type: alien\n\ #[test] fn detect_delimiters_yaml_basic() { let bytes = b"---\nfoo: bar\n---\nbody\n"; - let (kind, span) = detect_delimiters(bytes).unwrap(); + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); assert_eq!(kind, DelimKind::Yaml); assert_eq!(span.start, 0); // body starts at "body\n" — the closing "---\n" is part of the span. assert_eq!(&bytes[span.end..], b"body\n"); + // inner range covers exactly "foo: bar" (no surrounding EOL). 
+ assert_eq!(&bytes[inner], b"foo: bar"); } #[test] fn detect_delimiters_toml_basic() { let bytes = b"+++\nfoo = \"bar\"\n+++\nbody\n"; - let (kind, span) = detect_delimiters(bytes).unwrap(); + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); assert_eq!(kind, DelimKind::Toml); assert_eq!(&bytes[span.end..], b"body\n"); + assert_eq!(&bytes[inner], b"foo = \"bar\""); } #[test] @@ -731,4 +811,161 @@ source_type: alien\n\ assert_eq!(detect_lang(ko).as_deref(), Some("ko")); assert_eq!(detect_lang(en).as_deref(), Some("en")); } + + // ---- C1: CRLF line endings ------------------------------------------------ + + #[test] + fn detect_delimiters_crlf_yaml() { + let bytes = b"---\r\ntitle: Doc\r\n---\r\nbody\r\n"; + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + assert_eq!(span.start, 0); + // span ends just past the CRLF after the closing "---". + assert_eq!(&bytes[span.end..], b"body\r\n"); + // Inner is exactly the YAML payload, sans surrounding EOLs. 
+ assert_eq!(&bytes[inner], b"title: Doc"); + } + + #[test] + fn detect_delimiters_crlf_toml() { + let bytes = b"+++\r\ntitle = \"Doc\"\r\n+++\r\nbody\r\n"; + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Toml); + assert_eq!(&bytes[span.end..], b"body\r\n"); + assert_eq!(&bytes[inner], b"title = \"Doc\""); + } + + #[test] + fn parse_frontmatter_crlf_yaml_end_to_end() { + let bytes = b"---\r\n\ +title: Doc\r\n\ +created_at: 2024-03-01T00:00:00Z\r\n\ +updated_at: 2024-03-02T00:00:00Z\r\n\ +---\r\nbody\r\n"; + let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + assert!(span.is_some()); + assert_eq!( + meta.user.get("title").and_then(|v| v.as_str()), + Some("Doc") + ); + assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); + assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC)); + } + + #[test] + fn parse_frontmatter_crlf_toml_end_to_end() { + let bytes = b"+++\r\n\ +title = \"Doc\"\r\n\ +created_at = \"2024-03-01T00:00:00Z\"\r\n\ ++++\r\nbody\r\n"; + let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + assert!(span.is_some()); + assert_eq!( + meta.user.get("title").and_then(|v| v.as_str()), + Some("Doc") + ); + assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); + } + + /// Mixed-EOL input: opening uses `\n`, closing uses `\r\n` (or vice + /// versa). Policy: each line is considered independently, so any + /// combination of LF / CRLF parses correctly. This keeps tools that + /// edit only one end of a file (e.g. an editor that auto-wraps the + /// last line) from breaking otherwise-valid frontmatter. + #[test] + fn parse_frontmatter_mixed_lf_crlf() { + // Opening LF, closing CRLF. 
+ let a = b"---\ntitle: A\n---\r\nbody\n"; + let (meta, _span, warns) = parse_frontmatter(a, &hints()).unwrap(); + assert!(warns.is_empty(), "case A warnings: {warns:?}"); + assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("A")); + + // Opening CRLF, closing LF. + let b = b"---\r\ntitle: B\r\n---\nbody\n"; + let (meta, _span, warns) = parse_frontmatter(b, &hints()).unwrap(); + assert!(warns.is_empty(), "case B warnings: {warns:?}"); + assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("B")); + } + + // ---- I1: trailing whitespace on delimiter lines --------------------------- + + #[test] + fn detect_delimiters_yaml_with_trailing_whitespace_on_opener() { + let bytes = b"--- \ntitle: x\n---\nbody\n"; + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + assert_eq!(span.start, 0); + assert_eq!(&bytes[span.end..], b"body\n"); + assert_eq!(&bytes[inner], b"title: x"); + } + + #[test] + fn detect_delimiters_yaml_with_trailing_whitespace_on_closer() { + let bytes = b"---\ntitle: x\n--- \nbody\n"; + let (kind, span, inner) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + assert_eq!(&bytes[span.end..], b"body\n"); + assert_eq!(&bytes[inner], b"title: x"); + } + + #[test] + fn detect_delimiters_yaml_with_tabs_on_delimiter_line() { + let bytes = b"---\t\ntitle: x\n---\nbody\n"; + let (kind, span, _inner) = detect_delimiters(bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + assert_eq!(&bytes[span.end..], b"body\n"); + } + + // ---- I2: UTF-8 BOM at file start ------------------------------------------ + + #[test] + fn detect_delimiters_yaml_with_leading_bom() { + let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice()); + bytes.extend_from_slice(b"---\ntitle: Doc\n---\nbody\n"); + let (kind, span, inner) = detect_delimiters(&bytes).unwrap(); + assert_eq!(kind, DelimKind::Yaml); + // Span starts after the BOM (byte 3), not at byte 0. 
+ assert_eq!(span.start, 3); + // Body slicing using span.end gives the original bytes after the + // closing delimiter — no BOM bookkeeping required by callers. + assert_eq!(&bytes[span.end..], b"body\n"); + assert_eq!(&bytes[inner], b"title: Doc"); + } + + #[test] + fn parse_frontmatter_with_leading_bom_full_pipeline() { + let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice()); + bytes.extend_from_slice( + b"---\n\ +title: Doc\n\ +lang: en\n\ +created_at: 2024-03-01T00:00:00Z\n\ +---\nbody\n", + ); + let (meta, span, warns) = parse_frontmatter(&bytes, &hints()).unwrap(); + assert!(warns.is_empty(), "warnings: {warns:?}"); + let span = span.expect("span present"); + assert_eq!(span.start, 3); + assert_eq!( + meta.user.get("title").and_then(|v| v.as_str()), + Some("Doc") + ); + assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en")); + assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC)); + } + + /// BOM-shaped bytes appearing later in the input are NOT stripped — only + /// a BOM at byte 0 of the original input is honoured. + #[test] + fn detect_delimiters_does_not_strip_mid_input_bom() { + // Leading byte is `#`, then a BOM, then a delimiter — there is no + // frontmatter here regardless of whether we strip BOM, but pin the + // behaviour: detection still fails (no leading marker). 
+ let mut bytes = Vec::from(b"# heading\n".as_slice()); + bytes.extend_from_slice(&[0xEF, 0xBB, 0xBF]); + bytes.extend_from_slice(b"---\nfoo: bar\n---\n"); + assert!(detect_delimiters(&bytes).is_none()); + } } -- 2.49.1 From 5850bfcf7a23a1cea3007449305d477684224bc9 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 13:13:16 +0000 Subject: [PATCH 6/6] p1-2: address review minors (FrontmatterSpan doc, parse_frontmatter rustdoc, YAML library note) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit M1: Reword the FrontmatterSpan doc-comment from "technically meant to be crate-internal" to a forward-looking note about P1-3 / P1-4 callers using bytes[span.end..] for body slicing. M3: Add an explicit `# Errors` section to parse_frontmatter's rustdoc. The current implementation never returns Err — all recoverable problems are downgraded to warnings — but the Result is kept on the signature so future hard-fail conditions can be added without breaking callers. M4: Mention serde_yml in the library-choice rationale alongside serde_yaml_ng, with a one-line note on why _ng was preferred (stricter adherence to original serde_yaml semantics around null / tagged enums). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/frontmatter.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs index 6790854..3f37990 100644 --- a/crates/kb-parse-md/src/frontmatter.rs +++ b/crates/kb-parse-md/src/frontmatter.rs @@ -6,10 +6,14 @@ //! //! # YAML library //! -//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. We -//! use [`serde_yaml_ng`], a maintained fork with an API-compatible surface, -//! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …) -//! is a one-line dep change. +//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. The +//! 
two viable maintained forks at the time of this writing are `serde_yaml_ng` +//! and `serde_yml`. We picked [`serde_yaml_ng`] because it advertises stricter +//! adherence to the original `serde_yaml` semantics (notably around `null` +//! handling and tagged enums) while `serde_yml` has taken some liberties +//! around YAML 1.1 vs 1.2 booleans. Both are actively released; either would +//! work and the swap is a one-line dep change should the ecosystem +//! consolidate (incl. a future move to `yaml-rust2` directly). use std::ops::Range; use std::sync::OnceLock; @@ -46,9 +50,8 @@ pub struct BodyHints { /// `end` is the offset just past the closing delimiter line's trailing /// newline (i.e. the body starts at `bytes[end..]`). /// -/// Per the task brief this is technically meant to be crate-internal, but -/// the [`parse_frontmatter`] return type forces it to be `pub`. P1-3 / P1-4 -/// reuse it via this same crate. +/// Shared with future P1-3/P1-4 callers via the [`parse_frontmatter`] return +/// tuple — they slice the body using `bytes[span.end..]`. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct FrontmatterSpan { pub start: usize, @@ -64,8 +67,15 @@ pub struct FrontmatterSpan { /// covers the delimited region so the caller can skip it during body /// slicing. /// +/// # Errors +/// /// `Err` is reserved for genuinely fatal conditions (e.g. non-UTF-8 input -/// that can't even be lossy-decoded), which currently cannot arise here. +/// that can't even be lossy-decoded). The current implementation has no +/// such path — every recoverable problem (missing/garbled frontmatter, +/// malformed timestamps, unknown enum values) is downgraded to a warning +/// and the function returns `Ok`. The `Result` is kept on the signature so +/// future hard-fail conditions (e.g. an I/O-backed input) can be added +/// without breaking callers. pub fn parse_frontmatter( bytes: &[u8], hints: &BodyHints, -- 2.49.1